diff --git a/.config/nextest.toml b/.config/nextest.toml new file mode 100644 index 00000000000..6f67aa5ecdb --- /dev/null +++ b/.config/nextest.toml @@ -0,0 +1,8 @@ +[profile.ci] +retries = 2 # Run at most 3 times +fail-fast = false +slow-timeout = { period = "60s", terminate-after = 2 } # Timeout 2m +failure-output = "final" + +[profile.ci.junit] +path = "junit.xml" diff --git a/CHANGELOG.md b/CHANGELOG.md index eb19c34a583..26fd52f2bd5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # TiKV Change Log All notable changes to this project are documented in this file. -See also [TiDB Changelog](https://github.com/pingcap/tidb/blob/master/CHANGELOG.md) and [PD Changelog](https://github.com/pingcap/pd/blob/master/CHANGELOG.md). +See also [TiDB Release Notes](https://github.com/pingcap/docs/blob/master/releases/release-notes.md) and [PD Changelog](https://github.com/pingcap/pd/blob/master/CHANGELOG.md). ## [5.3.0] - 2021-11-29 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 85fcea3193e..41b2ef7a528 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,6 +19,7 @@ To build TiKV you'll need to at least have the following installed: * `make` - Build tool (run common workflows) * `cmake` - Build tool (required for gRPC) * `awk` - Pattern scanning/processing language +* [`protoc`](https://github.com/protocolbuffers/protobuf/releases) - Google protocol buffer compiler * C++ compiler - gcc 5+ (required for gRPC) If you are targeting platforms other than x86_64/aarch64 Linux or macOS, you'll also need: @@ -77,6 +78,12 @@ make test env EXTRA_CARGO_ARGS=$TESTNAME make test ``` +Alternatively, you can use [nextest](https://github.com/nextest-rs/nextest) to run tests: + +```bash +env EXTRA_CARGO_ARGS=$TESTNAME make test_with_nextest +``` + TiKV follows the Rust community coding style. We use Rustfmt and [Clippy](https://github.com/Manishearth/rust-clippy) to automatically format and lint our code. Using these tools is checked in our CI. 
These are as part of `make dev`, you can also run them alone: ```bash @@ -86,13 +93,13 @@ make format make clippy ``` -See the [style doc](https://github.com/rust-lang/rfcs/blob/master/style-guide/README.md) and the [API guidelines](https://rust-lang-nursery.github.io/api-guidelines/) for details on the conventions. +See the [style doc](https://github.com/rust-lang/fmt-rfcs/blob/master/guide/guide.md) and the [API guidelines](https://rust-lang-nursery.github.io/api-guidelines/) for details on the conventions. Please follow this style to make TiKV easy to review, maintain, and develop. ### Build issues -To reduce compilation time, TiKV builds do not include full debugging information by default — `release` and `bench` builds include no debuginfo; `dev` and `test` builds include full debug. To decrease compilation time with another ~5% (around 10 seconds for a 4 min build time), change the `debug = true` to `debug = 1` in the Cargo.toml file to only include line numbers for `dev` and `test`. Another way to change debuginfo is to precede build commands with `RUSTFLAGS=-Cdebuginfo=1` (for line numbers), or `RUSTFLAGS=-Cdebuginfo=2` (for full debuginfo). For example, +To reduce compilation time and disk usage, TiKV builds do not include full debugging information by default — only the tests package has line debug info enabled. To change debuginfo, just precede build commands with `RUSTFLAGS=-Cdebuginfo=1` (for line numbers), or `RUSTFLAGS=-Cdebuginfo=2` (for full debuginfo). For example, ```bash RUSTFLAGS=-Cdebuginfo=1 make dev @@ -109,13 +116,13 @@ To run TiKV as an actual key-value store, you will need to run it as a cluster ( Use [PD](https://github.com/tikv/pd) to manage the cluster (even if just one node on a single machine). -Instructions are in our [docs](https://tikv.org/docs/dev/tasks/deploy/binary/) (if you build TiKV from source, you could skip `1. Download package` and `tikv-server` is in directory `/target`). 
+Instructions are in our [docs](https://tikv.org/docs/latest/deploy/install/test/#install-binary-manually) (if you build TiKV from source, you could skip `1. Download package` and `tikv-server` is in directory `/target`). Tips: It's recommended to increase the open file limit above 82920. WSL2 users may refer to [the comment](https://github.com/Microsoft/WSL/issues/1688#issuecomment-532767317) if having difficulty in changing the `ulimit`. ### Configuration -Read our configuration guide to learn about various [configuration options](https://tikv.org/docs/dev/tasks/configure/introduction/). There is also a [configuration template](./etc/config-template.toml). +Read our configuration guide to learn about various [configuration options](https://tikv.org/docs/latest/deploy/configure/introduction/). There is also a [configuration template](./etc/config-template.toml). ## Contribution flow @@ -127,7 +134,7 @@ This is a rough outline of what a contributor's workflow looks like: - Write code, add test cases, and commit your work (see below for message format). - Run tests and make sure all tests pass. - Push your changes to a branch in your fork of the repository and submit a pull request. - * Make sure mention the issue, which is created at step 1, in the commit meesage. + * Make sure to mention the issue, which is created at step 1, in the commit message. - Your PR will be reviewed and may be requested some changes. * Once you've made changes, your PR must be re-reviewed and approved. * If the PR becomes out of date, you can use GitHub's 'update branch' button. 
diff --git a/Cargo.lock b/Cargo.lock index 96b637fdc43..7e5ea1bc862 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -49,7 +49,18 @@ checksum = "43bb833f0bf979d8475d38fbf09ed3b8a55e1885fe93ad3f93239fc6a4f17b98" dependencies = [ "getrandom 0.2.3", "once_cell", - "version_check 0.9.2", + "version_check 0.9.4", +] + +[[package]] +name = "ahash" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +dependencies = [ + "cfg-if 1.0.0", + "once_cell", + "version_check 0.9.4", ] [[package]] @@ -84,7 +95,8 @@ dependencies = [ "codec", "engine_traits", "kvproto", - "match_template", + "log_wrappers", + "match-template", "panic_hook", "thiserror", "tikv_alloc", @@ -130,7 +142,7 @@ dependencies = [ "lexical-core", "multiversion", "num 0.4.0", - "rand 0.8.3", + "rand 0.8.5", "regex", "serde", "serde_derive", @@ -148,6 +160,30 @@ dependencies = [ "futures-core", ] +[[package]] +name = "async-compression" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "345fd392ab01f746c717b1357165b76f0b67a60192007b234058c9045fdcf695" +dependencies = [ + "futures-core", + "futures-io", + "memchr", + "pin-project-lite", + "tokio", + "zstd", + "zstd-safe", +] + +[[package]] +name = "async-lock" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa24f727524730b077666307f2734b4a1a1c57acb79193127dcc8914d5242dd7" +dependencies = [ + "event-listener", +] + [[package]] name = "async-speed-limit" version = "0.4.0" @@ -202,23 +238,11 @@ dependencies = [ "syn", ] -[[package]] -name = "async-timer" -version = "1.0.0-beta.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d962799a5863fdf06fbf594e04102130582d010379137e9a98a7e2e693a5885" -dependencies = [ - "error-code", - "libc 0.2.125", - "wasm-bindgen", - "winapi 0.3.9", -] - [[package]] name = "async-trait" -version = "0.1.22" 
+version = "0.1.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8df72488e87761e772f14ae0c2480396810e51b2c2ade912f97f0f7e5b95e3c" +checksum = "1e805d94e6b5001b651426cf4cd446b1ab5f319d27bab5c644f61de0a804360c" dependencies = [ "proc-macro2", "quote", @@ -240,21 +264,22 @@ version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1803c647a3ec87095e7ae7acfca019e98de5ec9a7d01343f611cf3152ed71a90" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", "winapi 0.3.9", ] [[package]] name = "autocfg" -version = "1.0.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws" version = "0.0.1" dependencies = [ "async-trait", + "base64 0.13.0", "bytes", "cloud", "fail", @@ -266,6 +291,7 @@ dependencies = [ "hyper-tls", "kvproto", "lazy_static", + "md5", "prometheus", "rusoto_core", "rusoto_credential", @@ -279,6 +305,52 @@ dependencies = [ "tikv_util", "tokio", "url", + "uuid 0.8.2", +] + +[[package]] +name = "axum" +version = "0.5.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acee9fd5073ab6b045a275b3e709c163dd36c90685219cb21804a147b58dba43" +dependencies = [ + "async-trait", + "axum-core", + "bitflags", + "bytes", + "futures-util", + "http", + "http-body", + "hyper", + "itoa 1.0.1", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "serde", + "sync_wrapper", + "tokio", + "tower", + "tower-http", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37e5939e02c56fecd5c017c37df4238c0a839fa76b7f97acdd7efb804fd181cc" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "mime", + 
"tower-layer", + "tower-service", ] [[package]] @@ -289,89 +361,113 @@ dependencies = [ "azure_core", "azure_identity", "azure_storage", - "base64", - "chrono", + "azure_storage_blobs", + "base64 0.13.0", "cloud", "futures 0.3.15", "futures-util", "kvproto", + "lazy_static", "oauth2", + "openssl", + "regex", + "serde", + "serde_json", "slog", "slog-global", "tikv_util", + "time 0.3.20", "tokio", "url", + "uuid 1.2.1", ] [[package]] name = "azure_core" -version = "0.1.0" -source = "git+https://github.com/Azure/azure-sdk-for-rust#b3c53f4cec4a6b541e49388b51e696dc892f18a3" +version = "0.11.0" +source = "git+https://github.com/Azure/azure-sdk-for-rust#e21e2ec6bae784a717ac7b3cf1123d3a9596f074" dependencies = [ "async-trait", - "base64", + "base64 0.21.0", "bytes", - "chrono", "dyn-clone", "futures 0.3.15", "getrandom 0.2.3", - "http", + "http-types", "log", - "oauth2", - "rand 0.8.3", + "paste", + "pin-project", + "quick-xml 0.28.2", + "rand 0.8.5", "reqwest", "rustc_version 0.4.0", "serde", - "serde_derive", "serde_json", - "thiserror", + "time 0.3.20", "url", - "uuid", + "uuid 1.2.1", ] [[package]] name = "azure_identity" -version = "0.1.0" -source = "git+https://github.com/Azure/azure-sdk-for-rust#b3c53f4cec4a6b541e49388b51e696dc892f18a3" +version = "0.11.0" +source = "git+https://github.com/Azure/azure-sdk-for-rust#e21e2ec6bae784a717ac7b3cf1123d3a9596f074" dependencies = [ - "async-timer", + "async-lock", "async-trait", "azure_core", - "chrono", + "fix-hidden-lifetime-bug", "futures 0.3.15", "log", "oauth2", - "reqwest", + "pin-project", "serde", "serde_json", - "thiserror", + "time 0.3.20", "url", + "uuid 1.2.1", ] [[package]] name = "azure_storage" -version = "0.1.0" -source = "git+https://github.com/Azure/azure-sdk-for-rust#b3c53f4cec4a6b541e49388b51e696dc892f18a3" +version = "0.11.0" +source = "git+https://github.com/Azure/azure-sdk-for-rust#e21e2ec6bae784a717ac7b3cf1123d3a9596f074" dependencies = [ "RustyXML", "async-trait", "azure_core", - "base64", "bytes", - 
"chrono", "futures 0.3.15", - "http", + "hmac 0.12.1", "log", - "md5", "once_cell", - "ring", "serde", - "serde-xml-rs", "serde_derive", "serde_json", - "thiserror", + "sha2 0.10.6", + "time 0.3.20", + "url", + "uuid 1.2.1", +] + +[[package]] +name = "azure_storage_blobs" +version = "0.11.0" +source = "git+https://github.com/Azure/azure-sdk-for-rust#e21e2ec6bae784a717ac7b3cf1123d3a9596f074" +dependencies = [ + "RustyXML", + "azure_core", + "azure_storage", + "bytes", + "futures 0.3.15", + "log", + "md5", + "serde", + "serde_derive", + "serde_json", + "time 0.3.20", "url", - "uuid", + "uuid 1.2.1", ] [[package]] @@ -383,7 +479,7 @@ dependencies = [ "addr2line", "cc", "cfg-if 1.0.0", - "libc 0.2.125", + "libc 0.2.139", "miniz_oxide 0.4.4", "object", "rustc-demangle", @@ -395,6 +491,8 @@ version = "0.0.1" dependencies = [ "api_version", "async-channel", + "aws", + "causal_ts", "collections", "concurrency_manager", "crc64fast", @@ -418,7 +516,7 @@ dependencies = [ "prometheus", "raft", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "security", "serde", "serde_derive", @@ -440,8 +538,10 @@ dependencies = [ name = "backup-stream" version = "0.1.0" dependencies = [ + "async-compression", "async-trait", "bytes", + "cfg-if 1.0.0", "chrono", "concurrency_manager", "crossbeam", @@ -457,38 +557,46 @@ dependencies = [ "fail", "file_system", "futures 0.3.15", + "futures-io", "grpcio", "hex 0.4.2", + "indexmap", "kvproto", "lazy_static", "log_wrappers", "online_config", "openssl", "pd_client", + "pin-project", "prometheus", + "prometheus-static-metric", "protobuf", "raft", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "regex", "resolved_ts", + "security", "slog", "slog-global", "tempdir", + "tempfile", + "test_pd", "test_raftstore", "test_util", "thiserror", "tidb_query_datatype", "tikv", "tikv_alloc", + "tikv_kv", "tikv_util", "tokio", "tokio-stream", - "tokio-util 0.7.2", + "tokio-util", "tonic", "txn_types", "url", - "uuid", + "uuid 0.8.2", "walkdir", "yatp", ] @@ -499,6 +607,12 
@@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" +[[package]] +name = "base64" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" + [[package]] name = "batch-system" version = "0.1.0" @@ -509,9 +623,11 @@ dependencies = [ "derive_more", "fail", "file_system", + "kvproto", "lazy_static", "online_config", "prometheus", + "resource_control", "serde", "serde_derive", "slog", @@ -529,7 +645,7 @@ dependencies = [ "bcc-sys", "bitflags", "byteorder", - "libc 0.2.125", + "libc 0.2.139", "regex", "thiserror", ] @@ -569,7 +685,7 @@ dependencies = [ "cexpr 0.6.0", "clang-sys", "clap 2.33.0", - "env_logger", + "env_logger 0.9.0", "lazy_static", "lazycell", "log", @@ -609,6 +725,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "boolinator" version = "2.4.0" @@ -633,6 +758,12 @@ version = "3.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "12ae9db68ad7fac5fe51304d20f016c911539251075a214f8e663babefa35187" +[[package]] +name = "bytemuck" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdead85bdec19c194affaeeb670c0e41fe23de31459efd1c174d049269cf02cc" + [[package]] name = "byteorder" version = "1.3.4" @@ -655,7 +786,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" dependencies = [ "cc", - "libc 0.2.125", + "libc 0.2.139", "pkg-config", ] @@ -681,7 +812,7 @@ version = "1.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "f7f788eaf239475a3c1e1acf89951255a46c4b9b46cf3e866fc4d0707b4b9e36" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", "valgrind_request", ] @@ -712,24 +843,27 @@ name = "causal_ts" version = "0.0.1" dependencies = [ "api_version", + "async-trait", + "criterion", "engine_rocks", "engine_traits", + "enum_dispatch", "error_code", "fail", "futures 0.3.15", "kvproto", "lazy_static", "log_wrappers", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pd_client", "prometheus", + "prometheus-static-metric", "raft", - "raftstore", "serde", "serde_derive", "slog", "slog-global", - "test_raftstore", + "test_pd_client", "thiserror", "tikv_alloc", "tikv_util", @@ -739,9 +873,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.69" +version = "1.0.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e70cc2f62c6ce1868963827bd677764c62d07c3d9a3e1fb1177ee1a9ab199eb2" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" dependencies = [ "jobserver", ] @@ -752,6 +886,7 @@ version = "0.0.1" dependencies = [ "api_version", "bitflags", + "causal_ts", "collections", "concurrency_manager", "criterion", @@ -780,6 +915,7 @@ dependencies = [ "slog", "slog-global", "tempfile", + "test_pd_client", "test_raftstore", "test_util", "thiserror", @@ -829,7 +965,7 @@ dependencies = [ "num-integer", "num-traits", "serde", - "time", + "time 0.1.42", ] [[package]] @@ -849,7 +985,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f54d78e30b388d4815220c8dd03fea5656b6c6d32adb59e89061552a102f8da1" dependencies = [ "glob", - "libc 0.2.125", + "libc 0.2.139", "libloading", ] @@ -920,9 +1056,8 @@ dependencies = [ [[package]] name = "cmake" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb6210b637171dfba4cda12e579ac6dc73f5165ad56133e5d72ef3131f320855" +version = "0.1.48" +source = 
"git+https://github.com/rust-lang/cmake-rs#00e6b220342a8b0ec4548071928ade38fd5f691b" dependencies = [ "cc", ] @@ -934,10 +1069,10 @@ dependencies = [ "byteorder", "bytes", "error_code", - "libc 0.2.125", + "libc 0.2.139", "panic_hook", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "static_assertions", "thiserror", "tikv_alloc", @@ -960,8 +1095,8 @@ dependencies = [ "fail", "futures 0.3.15", "kvproto", - "parking_lot 0.12.0", - "rand 0.8.3", + "parking_lot 0.12.1", + "rand 0.8.5", "tikv_alloc", "tikv_util", "tokio", @@ -993,7 +1128,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0a89e2ae426ea83155dccf10c0fa6b1463ef6d5fcb44cee0b224a408fa640a62" dependencies = [ "core-foundation-sys", - "libc 0.2.125", + "libc 0.2.139", ] [[package]] @@ -1008,10 +1143,19 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", "winapi 0.3.9", ] +[[package]] +name = "cpufeatures" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280a9f2d8b3a38871a3c8a46fb80db65e5e5ed97da80c4d08bf27fb63e35e181" +dependencies = [ + "libc 0.2.139", +] + [[package]] name = "cpuid-bool" version = "0.1.2" @@ -1044,7 +1188,7 @@ dependencies = [ "clap 2.33.0", "criterion-plot", "csv", - "itertools 0.10.0", + "itertools", "lazy_static", "num-traits", "oorandom", @@ -1066,7 +1210,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63aaaf47e457badbcb376c65a49d0f182c317ebd97dc6d1ced94c8e1d09c0f3a" dependencies = [ "criterion", - "libc 0.2.125", + "libc 0.2.139", ] [[package]] @@ -1086,87 +1230,77 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" dependencies = [ "cast", - "itertools 0.10.0", + "itertools", ] [[package]] name = "crossbeam" -version = 
"0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd01a6eb3daaafa260f6fc94c3a6c36390abc2080e38e3e34ced87393fb77d80" +checksum = "4ae5588f6b3c3cb05239e90bd110f257254aecd01e4635400391aeae07497845" dependencies = [ "cfg-if 1.0.0", "crossbeam-channel", "crossbeam-deque", - "crossbeam-epoch 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-epoch", "crossbeam-queue", - "crossbeam-utils 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.8.8", ] [[package]] name = "crossbeam-channel" -version = "0.5.1" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.8.8", ] [[package]] name = "crossbeam-deque" -version = "0.8.1" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" dependencies = [ "cfg-if 1.0.0", - "crossbeam-epoch 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)", - "crossbeam-utils 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-epoch", + "crossbeam-utils 0.8.8", ] [[package]] name = "crossbeam-epoch" -version = "0.9.3" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2584f639eb95fea8c798496315b297cf81b9b58b6d30ab066a75455333cf4b12" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-utils 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)", - "lazy_static", - "memoffset", - "scopeguard", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.3" 
-source = "git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" +checksum = "1145cf131a2c6ba0615079ab6a638f7e1973ac9c2634fcbeaaad6114246efe8c" dependencies = [ + "autocfg", "cfg-if 1.0.0", - "crossbeam-utils 0.8.3 (git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0)", + "crossbeam-utils 0.8.8", "lazy_static", - "memoffset", + "memoffset 0.6.4", "scopeguard", ] [[package]] name = "crossbeam-queue" -version = "0.3.1" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f6cb3c7f5b8e51bc3ebb73a2327ad4abdbd119dc13223f14f961d2f38486756" +checksum = "1f25d8400f4a7a5778f0e4e52384a48cbd9b5c495d110786187fc750075277a2" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.8.8", ] [[package]] name = "crossbeam-skiplist" -version = "0.0.0" -source = "git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "883a5821d7d079fcf34ac55f27a833ee61678110f6b97637cc74513c0d0b42fc" dependencies = [ "cfg-if 1.0.0", - "crossbeam-epoch 0.9.3 (git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0)", - "crossbeam-utils 0.8.3 (git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0)", + "crossbeam-epoch", + "crossbeam-utils 0.8.8", "scopeguard", ] @@ -1183,23 +1317,22 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.3" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7e9d99fa91428effe99c5c6d4634cdeba32b8cf784fc428a2a687f61a952c49" +checksum = "0bf124c720b7686e3c2663cf54062ab0f68a88af2fb6a030e87e30bf721fcb38" dependencies = [ - "autocfg", "cfg-if 1.0.0", "lazy_static", ] [[package]] -name = "crossbeam-utils" -version = "0.8.3" -source = 
"git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" dependencies = [ - "autocfg", - "cfg-if 1.0.0", - "lazy_static", + "generic-array", + "typenum", ] [[package]] @@ -1277,16 +1410,16 @@ checksum = "c0834a35a3fce649144119e18da2a4d8ed12ef3862f47183fd46f625d072d96c" dependencies = [ "cfg-if 1.0.0", "num_cpus", - "parking_lot 0.12.0", + "parking_lot 0.12.1", ] [[package]] name = "debugid" -version = "0.7.2" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91cf5a8c2f2097e2a32627123508635d47ce10563d999ec1a95addf08b502ba" +checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d" dependencies = [ - "uuid", + "uuid 1.2.1", ] [[package]] @@ -1320,6 +1453,17 @@ dependencies = [ "generic-array", ] +[[package]] +name = "digest" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" +dependencies = [ + "block-buffer 0.10.4", + "crypto-common", + "subtle", +] + [[package]] name = "dirs-next" version = "2.0.0" @@ -1336,17 +1480,11 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", "redox_users", "winapi 0.3.9", ] -[[package]] -name = "doc-comment" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "923dea538cea0aa3025e8685b20d6ee21ef99c4f77e954a30febbaac5ec73a97" - [[package]] name = "dyn-clone" version = "1.0.4" @@ -1400,7 +1538,7 @@ dependencies = [ "openssl", "prometheus", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "serde", "serde_derive", "slog", @@ -1412,6 
+1550,7 @@ dependencies = [ "tikv_util", "tokio", "toml", + "walkdir", ] [[package]] @@ -1444,6 +1583,7 @@ dependencies = [ "raft", "tikv_alloc", "tikv_util", + "tracker", "txn_types", ] @@ -1469,7 +1609,7 @@ dependencies = [ "prometheus-static-metric", "protobuf", "raft", - "rand 0.8.3", + "rand 0.8.5", "regex", "rocksdb", "serde", @@ -1480,8 +1620,9 @@ dependencies = [ "tempfile", "tikv_alloc", "tikv_util", - "time", + "time 0.1.42", "toml", + "tracker", "txn_types", ] @@ -1491,6 +1632,7 @@ version = "0.1.0" dependencies = [ "engine_rocks", "engine_test", + "engine_traits", "fail", "futures 0.3.15", "keys", @@ -1510,6 +1652,7 @@ dependencies = [ name = "engine_test" version = "0.0.1" dependencies = [ + "collections", "encryption", "engine_panic", "engine_rocks", @@ -1526,10 +1669,13 @@ name = "engine_traits" version = "0.0.1" dependencies = [ "case_macros", + "collections", "error_code", "fail", "file_system", + "keys", "kvproto", + "lazy_static", "log_wrappers", "protobuf", "raft", @@ -1541,6 +1687,7 @@ dependencies = [ "tikv_alloc", "tikv_util", "toml", + "tracker", "txn_types", ] @@ -1548,13 +1695,29 @@ dependencies = [ name = "engine_traits_tests" version = "0.0.1" dependencies = [ + "encryption", + "encryption_export", "engine_test", "engine_traits", + "kvproto", "panic_hook", "tempfile", + "test_util", "tikv_alloc", ] +[[package]] +name = "enum_dispatch" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eb359f1476bf611266ac1f5355bc14aeca37b299d0ebccc038ee7058891c9cb" +dependencies = [ + "once_cell", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "env_logger" version = "0.9.0" @@ -1569,23 +1732,47 @@ dependencies = [ ] [[package]] -name = "error-chain" -version = "0.12.1" +name = "env_logger" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ab49e9dcb602294bc42f9a7dfc9bc6e936fca4418ea300dbfb84fe16de0b7d9" +checksum = 
"85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0" dependencies = [ - "backtrace", - "version_check 0.1.5", + "humantime", + "is-terminal", + "log", + "regex", + "termcolor", ] [[package]] -name = "error-code" -version = "2.3.0" +name = "errno" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +dependencies = [ + "errno-dragonfly", + "libc 0.2.139", + "winapi 0.3.9", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc 0.2.139", +] + +[[package]] +name = "error-chain" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5115567ac25674e0043e472be13d14e537f37ea8aa4bdc4aef0c89add1db1ff" +checksum = "3ab49e9dcb602294bc42f9a7dfc9bc6e936fca4418ea300dbfb84fe16de0b7d9" dependencies = [ - "libc 0.2.125", - "str-buf", + "backtrace", + "version_check 0.1.5", ] [[package]] @@ -1602,16 +1789,19 @@ dependencies = [ [[package]] name = "etcd-client" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76b9f5b0b4f53cf836bef05b22cd5239479700bc8d44a04c3c77f1ba6c2c73e9" +version = "0.10.2" +source = "git+https://github.com/pingcap/etcd-client?rev=41d393c32a7a7c728550cee1d9a138dafe6f3e27#41d393c32a7a7c728550cee1d9a138dafe6f3e27" dependencies = [ "http", + "hyper", + "hyper-openssl", + "openssl", "prost", "tokio", "tokio-stream", "tonic", "tonic-build", + "tower", "tower-service", "visible", ] @@ -1623,7 +1813,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7531096570974c3a9dcf9e4b8e1cede1ec26cf5046219fb3b9d897503b9be59" [[package]] -name = "example_plugin" +name = "example_coprocessor_plugin" version = "0.1.0" dependencies = [ 
"coprocessor_plugin_api", @@ -1633,6 +1823,7 @@ dependencies = [ name = "external_storage" version = "0.0.1" dependencies = [ + "async-compression", "async-trait", "bytes", "encryption", @@ -1652,7 +1843,7 @@ dependencies = [ "openssl", "prometheus", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "rusoto_core", "rust-ini", "slog", @@ -1662,7 +1853,7 @@ dependencies = [ "tikv_alloc", "tikv_util", "tokio", - "tokio-util 0.7.2", + "tokio-util", "url", ] @@ -1670,6 +1861,7 @@ dependencies = [ name = "external_storage_export" version = "0.0.1" dependencies = [ + "async-compression", "async-trait", "aws", "azure", @@ -1687,14 +1879,14 @@ dependencies = [ "grpcio", "kvproto", "lazy_static", - "libc 0.2.125", + "libc 0.2.139", "libloading", "matches", - "nix 0.23.0", + "nix 0.24.1", "once_cell", "protobuf", "rust-ini", - "signal", + "signal-hook", "slog", "slog-global", "slog-term", @@ -1702,19 +1894,19 @@ dependencies = [ "tempfile", "tikv_util", "tokio", - "tokio-util 0.7.2", + "tokio-util", "url", ] [[package]] name = "fail" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3245a0ca564e7f3c797d20d833a6870f57a728ac967d5225b3ffdef4465011" +checksum = "fe5e43d0f78a42ad591453aedb1d7ae631ce7ee445c7643691055a9ed8d3b01c" dependencies = [ - "lazy_static", "log", - "rand 0.8.3", + "once_cell", + "rand 0.8.5", ] [[package]] @@ -1723,6 +1915,15 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f35ce9c8fb9891c75ceadbc330752951a4e369b50af10775955aeb9af3eee34b" +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + [[package]] name = "ffi-support" version = "0.4.2" @@ -1740,22 +1941,21 @@ dependencies = [ "bcc", "collections", "crc32fast", - "crossbeam-utils 0.8.3 
(registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.8.8", "fs2", "lazy_static", - "libc 0.2.125", + "libc 0.2.139", "maligned", - "nix 0.23.0", "online_config", "openssl", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "prometheus", "prometheus-static-metric", - "rand 0.8.3", + "rand 0.8.5", "serde", "slog", "slog-global", - "strum", + "strum 0.20.0", "tempfile", "thread_local", "tikv_alloc", @@ -1769,7 +1969,7 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", "thiserror", "winapi 0.3.9", ] @@ -1781,7 +1981,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d34cfa13a63ae058bfa601fe9e313bbdb3746427c1459185464ce0fcf62e1e8" dependencies = [ "cfg-if 1.0.0", - "libc 0.2.125", + "libc 0.2.139", "redox_syscall 0.2.11", "winapi 0.3.9", ] @@ -1794,15 +1994,35 @@ checksum = "d691fdb3f817632d259d09220d4cf0991dbb2c9e59e044a02a59194bf6e14484" dependencies = [ "cc", "lazy_static", - "libc 0.2.125", + "libc 0.2.139", "winapi 0.3.9", ] +[[package]] +name = "fix-hidden-lifetime-bug" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4ae9c2016a663983d4e40a9ff967d6dcac59819672f0b47f2b17574e99c33c8" +dependencies = [ + "fix-hidden-lifetime-bug-proc_macros", +] + +[[package]] +name = "fix-hidden-lifetime-bug-proc_macros" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4c81935e123ab0741c4c4f0d9b8377e5fb21d3de7e062fa4b1263b1fbcba1ea" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "fixedbitset" -version = "0.2.0" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d" +checksum = 
"0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flatbuffers" @@ -1822,7 +2042,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2adaffba6388640136149e18ed080b77a78611c1e1d6de75aedcdf78df5d4682" dependencies = [ "crc32fast", - "libc 0.2.125", + "libc 0.2.139", "libz-sys", "miniz_oxide 0.3.7", ] @@ -1863,7 +2083,7 @@ name = "fs2" version = "0.4.3" source = "git+https://github.com/tabokie/fs2-rs?branch=tikv#cd503764a19a99d74c1ab424dd13d6bcd093fcae" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", "winapi 0.3.9", ] @@ -1889,7 +2109,7 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f41b048a94555da0f42f1d632e2e19510084fb8e303b0daa2816e733fb3644a0" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", ] [[package]] @@ -1969,6 +2189,21 @@ version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1" +[[package]] +name = "futures-lite" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + [[package]] name = "futures-macro" version = "0.3.15" @@ -2098,6 +2333,7 @@ dependencies = [ "hyper-tls", "kvproto", "matches", + "pin-project", "slog", "slog-global", "tame-gcs", @@ -2114,7 +2350,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "501466ecc8a30d1d3b7fc9229b122b2ce8ed6e9d9223f1138d4babb253e51817" dependencies = [ "typenum", - "version_check 0.9.2", + "version_check 0.9.4", ] [[package]] @@ -2124,7 +2360,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "473a1265acc8ff1e808cd0a1af8cee3c2ee5200916058a2ca113c29f2d903571" dependencies = [ "cfg-if 0.1.10", - 
"libc 0.2.125", + "libc 0.2.139", "wasi 0.7.0", ] @@ -2136,7 +2372,7 @@ checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" dependencies = [ "cfg-if 1.0.0", "js-sys", - "libc 0.2.125", + "libc 0.2.139", "wasi 0.10.2+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -2178,14 +2414,14 @@ dependencies = [ [[package]] name = "grpcio" -version = "0.10.2" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86ef249d9cb1b1843767501ae7463b500542e7f9e72d9c2d61ed320fbefa6c79" +checksum = "1f2506de56197d01821c2d1d21082d2dcfd6c82d7a1d6e04d33f37aab6130632" dependencies = [ "futures-executor", "futures-util", "grpcio-sys", - "libc 0.2.125", + "libc 0.2.139", "log", "parking_lot 0.11.1", "protobuf", @@ -2193,18 +2429,18 @@ dependencies = [ [[package]] name = "grpcio-compiler" -version = "0.9.0" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4caa0700833147dcfbe4f0758bd92545cc0f4506ee7fa154e499745a8b24e86c" +checksum = "ed97a17310fd00ff4109357584a00244e2a785d05b7ee0ef4d1e8fb1d84266df" dependencies = [ "protobuf", ] [[package]] name = "grpcio-health" -version = "0.10.0" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "641a95bace445aed36b31ae8731513c4c4d1d3dcdbc05aaeeefefe4fd673ada1" +checksum = "a37eae605cd21f144b7c7fd0e64e57af9f73d132756fef5b706db110c3ec7ea0" dependencies = [ "futures-executor", "futures-util", @@ -2215,14 +2451,14 @@ dependencies = [ [[package]] name = "grpcio-sys" -version = "0.10.1+1.44.0" +version = "0.10.3+1.44.0-patched" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "925586932dbbea927e913783da0be160ee74e0b0519d7b20cec35547a0a84631" +checksum = "f23adc509a3c4dea990e0ab8d2add4a65389ee69c288b7851d75dd1df7a6d6c6" dependencies = [ "bindgen 0.59.2", "cc", "cmake", - "libc 0.2.125", + "libc 0.2.139", "libz-sys", "openssl-sys", "pkg-config", @@ -2231,9 +2467,9 @@ 
dependencies = [ [[package]] name = "h2" -version = "0.3.3" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726" +checksum = "5f9f29bc9dda355256b2916cf526ab02ce0aeaaaf2bad60d65ef3f12f11dd0f4" dependencies = [ "bytes", "fnv", @@ -2244,7 +2480,7 @@ dependencies = [ "indexmap", "slab", "tokio", - "tokio-util 0.6.6", + "tokio-util", "tracing", ] @@ -2262,11 +2498,11 @@ checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" [[package]] name = "hashbrown" -version = "0.12.0" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c21d40587b92fa6a6c6e3c1bdbf87d75511db5672f9c93175574b3a00df1758" +checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" dependencies = [ - "ahash", + "ahash 0.8.3", ] [[package]] @@ -2290,7 +2526,16 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "307c3c9f937f38e3534b1d6447ecf090cafcc9744e4a6360e8b037b2cf5af120" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", +] + +[[package]] +name = "hermit-abi" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" +dependencies = [ + "libc 0.2.139", ] [[package]] @@ -2312,7 +2557,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1441c6b1e930e2817404b5046f1f989899143a12bf92de603b69f4e0aee1e15" dependencies = [ "crypto-mac", - "digest", + "digest 0.9.0", +] + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest 0.10.6", ] [[package]] @@ -2328,31 +2582,57 @@ dependencies = [ [[package]] name = "http" -version = "0.2.4" +version = "0.2.8" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11" +checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" dependencies = [ "bytes", "fnv", - "itoa 0.4.4", + "itoa 1.0.1", ] [[package]] name = "http-body" -version = "0.4.2" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9" +checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" dependencies = [ "bytes", "http", "pin-project-lite", ] +[[package]] +name = "http-range-header" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29" + +[[package]] +name = "http-types" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e9b187a72d63adbfba487f48095306ac823049cb504ee195541e91c7775f5ad" +dependencies = [ + "anyhow", + "async-channel", + "base64 0.13.0", + "futures-lite", + "infer", + "pin-project-lite", + "rand 0.7.3", + "serde", + "serde_json", + "serde_qs", + "serde_urlencoded", + "url", +] + [[package]] name = "httparse" -version = "1.4.1" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3a87b616e37e93c22fb19bcd386f02f3af5ea98a25670ad0fce773de23c5e68" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" [[package]] name = "httpdate" @@ -2368,9 +2648,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "0.14.11" +version = "0.14.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b61cf2d1aebcf6e6352c97b81dc2244ca29194be1b276f5d8ad5c6330fffb11" +checksum = "034711faac9d2166cb1baf1a2fb0b60b1f277f8492fd72176c17f3515e1abd3c" dependencies = [ "bytes", 
"futures-channel", @@ -2381,7 +2661,7 @@ dependencies = [ "http-body", "httparse", "httpdate", - "itoa 0.4.4", + "itoa 1.0.1", "pin-project-lite", "socket2", "tokio", @@ -2450,6 +2730,12 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "if_chain" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb56e1aa765b4b4f3aadfab769793b7087bb03a4ea4920644a6d238e2df5b9ed" + [[package]] name = "indexmap" version = "1.6.2" @@ -2460,20 +2746,26 @@ dependencies = [ "hashbrown 0.9.1", ] +[[package]] +name = "infer" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac" + [[package]] name = "inferno" version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "16d4bde3a7105e59c66a4104cfe9606453af1c7a0eac78cb7d5bc263eb762a70" dependencies = [ - "ahash", + "ahash 0.7.4", "atty", "indexmap", "itoa 1.0.1", "lazy_static", "log", "num-format", - "quick-xml", + "quick-xml 0.22.0", "rgb", "str_stack", ] @@ -2486,7 +2778,7 @@ checksum = "4816c66d2c8ae673df83366c18341538f234a26d65a9ecea5c348b453ac1d02f" dependencies = [ "bitflags", "inotify-sys", - "libc 0.2.125", + "libc 0.2.139", ] [[package]] @@ -2495,7 +2787,7 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", ] [[package]] @@ -2516,13 +2808,23 @@ dependencies = [ "raft", ] +[[package]] +name = "io-lifetimes" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e" +dependencies = [ + "libc 0.2.139", + "windows-sys 0.42.0", +] + [[package]] name = "iovec" version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", ] [[package]] @@ -2541,12 +2843,15 @@ dependencies = [ ] [[package]] -name = "itertools" -version = "0.9.0" +name = "is-terminal" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b" +checksum = "28dfb6c8100ccc63462345b67d1bbc3679177c75ee4bf59bf29c8b1d110b8189" dependencies = [ - "either", + "hermit-abi 0.2.6", + "io-lifetimes", + "rustix", + "windows-sys 0.42.0", ] [[package]] @@ -2577,7 +2882,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2b1d42ef453b30b7387e113da1c83ab1605d90c5b4e0eb8e96d016ed3b8c160" dependencies = [ "getrandom 0.1.12", - "libc 0.2.125", + "libc 0.2.139", "log", ] @@ -2600,6 +2905,15 @@ dependencies = [ "winapi-build", ] +[[package]] +name = "keyed_priority_queue" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d63b6407b66fc81fc539dccf3ddecb669f393c5101b6a2be3976c95099a06e8" +dependencies = [ + "indexmap", +] + [[package]] name = "keys" version = "0.1.0" @@ -2616,12 +2930,12 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#0e2f26c0a46ae7d666d6ca4410046a39e0c96f36" +source = "git+https://github.com/pingcap/kvproto.git#14ac513b9eff75028da1a56f54d36bfb082ac54f" dependencies = [ "futures 0.3.15", "grpcio", "protobuf", - "protobuf-build", + "protobuf-build 0.13.0", "raft-proto", ] @@ -2709,9 +3023,9 @@ checksum = "e32a70cf75e5846d53a673923498228bbec6a8624708a9ea5645f075d6276122" [[package]] name = "libc" -version = "0.2.125" +version = "0.2.139" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5916d2ae698f6de9bfb891ad7a8d65c09d232dc58cc4ac433c7da3b2fd84bc2b" +checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" 
[[package]] name = "libfuzzer-sys" @@ -2745,13 +3059,13 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#de8310c3983a30236ea03f802ed0c2401a4908ae" +source = "git+https://github.com/tikv/rust-rocksdb.git#062638a741adcd9074659eb28cbe7f6a676938d5" dependencies = [ "bindgen 0.57.0", "bzip2-sys", "cc", "cmake", - "libc 0.2.125", + "libc 0.2.139", "libtitan_sys", "libz-sys", "lz4-sys", @@ -2764,12 +3078,12 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#de8310c3983a30236ea03f802ed0c2401a4908ae" +source = "git+https://github.com/tikv/rust-rocksdb.git#062638a741adcd9074659eb28cbe7f6a676938d5" dependencies = [ "bzip2-sys", "cc", "cmake", - "libc 0.2.125", + "libc 0.2.139", "libz-sys", "lz4-sys", "snappy-sys", @@ -2783,7 +3097,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de5435b8549c16d423ed0c03dbaafe57cf6c3344744f1242520d59c9d8ecec66" dependencies = [ "cc", - "libc 0.2.125", + "libc 0.2.139", "pkg-config", "vcpkg", ] @@ -2803,6 +3117,12 @@ dependencies = [ "linked-hash-map", ] +[[package]] +name = "linux-raw-sys" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" + [[package]] name = "lock_api" version = "0.4.6" @@ -2839,7 +3159,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dca79aa95d8b3226213ad454d328369853be3a1382d89532a854f4d69640acae" dependencies = [ "cc", - "libc 0.2.125", + "libc 0.2.139", ] [[package]] @@ -2849,8 +3169,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e88c3cbe8288f77f293e48a28b3232e3defd203a6d839fa7f68ea4329e83464" [[package]] -name = "match_template" +name = "match-template" version = "0.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c334ac67725febd94c067736ac46ef1c7cacf1c743ca14b9f917c2df2c20acd8" dependencies = [ "proc-macro2", "quote", @@ -2863,14 +3185,20 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" +[[package]] +name = "matchit" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73cbba799671b762df5a175adf59ce145165747bb891505c43d09aefbbf38beb" + [[package]] name = "md-5" version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b5a279bb9607f9f53c22d496eade00d138d1bdcccd07d74650387cf94942a15" dependencies = [ - "block-buffer", - "digest", + "block-buffer 0.9.0", + "digest 0.9.0", "opaque-debug", ] @@ -2886,7 +3214,7 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", ] [[package]] @@ -2895,7 +3223,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", "winapi 0.3.9", ] @@ -2905,7 +3233,7 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "057a3db23999c867821a7a59feb06a578fcb03685e983dff90daf9e7d24ac08f" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", ] [[package]] @@ -2917,6 +3245,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg", +] + [[package]] name = "memory_trace_macros" version = "0.1.0" @@ -2936,9 +3273,9 @@ dependencies = [ [[package]] name = "mime" -version = "0.3.14" +version = "0.3.16" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd1d63acd1b78403cc0c325605908475dd9b9a3acbf65ed8bcab97e27014afcf" +checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" [[package]] name = "minimal-lexical" @@ -2976,9 +3313,9 @@ dependencies = [ "fuchsia-zircon-sys", "iovec", "kernel32-sys", - "libc 0.2.125", + "libc 0.2.139", "log", - "miow 0.2.2", + "miow", "net2", "slab", "winapi 0.2.8", @@ -2986,15 +3323,14 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.0" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba272f85fa0b41fc91872be579b3bbe0f56b792aa361a380eb669469f68dafb2" +checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", "log", - "miow 0.3.7", - "ntapi", - "winapi 0.3.9", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys 0.42.0", ] [[package]] @@ -3021,15 +3357,6 @@ dependencies = [ "ws2_32-sys", ] -[[package]] -name = "miow" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" -dependencies = [ - "winapi 0.3.9", -] - [[package]] name = "mmap" version = "0.1.1" @@ -3040,6 +3367,15 @@ dependencies = [ "tempdir", ] +[[package]] +name = "mnt" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1587ebb20a5b04738f16cffa7e2526f1b8496b84f92920facd518362ff1559eb" +dependencies = [ + "libc 0.2.139", +] + [[package]] name = "more-asserts" version = "0.2.1" @@ -3076,10 +3412,10 @@ dependencies = [ ] [[package]] -name = "murmur3" -version = "0.5.1" +name = "mur3" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ead5388e485d38e622630c6b05afd3761a6701ff15c55b279ea5b31dcb62cff" +checksum = "97af489e1e21b68de4c390ecca6703318bc1aa16e9733bcb62c089b73c6fbb1b" [[package]] name = "native-tls" 
@@ -3088,7 +3424,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8d96b2e1c8da3957d58100b09f102c6d9cfdfced01b7ec5a8974044bb09dbd4" dependencies = [ "lazy_static", - "libc 0.2.125", + "libc 0.2.139", "log", "openssl", "openssl-probe", @@ -3106,46 +3442,34 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "391630d12b68002ae1e25e8f974306474966550ad82dac6886fb8910c19568ae" dependencies = [ "cfg-if 0.1.10", - "libc 0.2.125", + "libc 0.2.139", "winapi 0.3.9", ] [[package]] name = "nix" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "becb657d662f1cd2ef38c7ad480ec6b8cf9e96b27adb543e594f9cf0f2e6065c" -dependencies = [ - "bitflags", - "cc", - "cfg-if 0.1.10", - "libc 0.2.125", - "void", -] - -[[package]] -name = "nix" -version = "0.23.0" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f305c2c2e4c39a82f7bf0bf65fb557f9070ce06781d4f2454295cc34b1c43188" +checksum = "8f17df307904acd05aa8e32e97bb20f2a0df1728bbc2d771ae8f9a90463441e9" dependencies = [ "bitflags", - "cc", "cfg-if 1.0.0", - "libc 0.2.125", - "memoffset", + "libc 0.2.139", + "memoffset 0.6.4", ] [[package]] name = "nix" -version = "0.24.1" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f17df307904acd05aa8e32e97bb20f2a0df1728bbc2d771ae8f9a90463441e9" +checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ "bitflags", "cfg-if 1.0.0", - "libc 0.2.125", - "memoffset", + "libc 0.2.139", + "memoffset 0.7.1", + "pin-utils", + "static_assertions", ] [[package]] @@ -3188,21 +3512,21 @@ checksum = "1b1d11e1ef389c76fe5b81bcaf2ea32cf88b62bc494e19f493d0b30e7a930109" dependencies = [ "memchr", "minimal-lexical", - "version_check 0.9.2", + "version_check 0.9.4", ] [[package]] name = "notify" -version = "4.0.16" +version = "4.0.17" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2599080e87c9bd051ddb11b10074f4da7b1223298df65d4c2ec5bcf309af1533" +checksum = "ae03c8c853dba7bfd23e571ff0cff7bc9dceb40a4cd684cd1681824183f45257" dependencies = [ "bitflags", "filetime", "fsevent", "fsevent-sys", "inotify", - "libc 0.2.125", + "libc 0.2.139", "mio 0.6.23", "mio-extras", "walkdir", @@ -3211,9 +3535,9 @@ dependencies = [ [[package]] name = "ntapi" -version = "0.3.3" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26e041cd983acbc087e30fcba770380cfa352d0e392e175b2344ebaf7ea0602" +checksum = "bc51db7b362b205941f71232e56c625156eb9a929f8cf74a428fd5bc094a4afc" dependencies = [ "winapi 0.3.9", ] @@ -3354,8 +3678,17 @@ version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" dependencies = [ - "hermit-abi", - "libc 0.2.125", + "hermit-abi 0.1.3", + "libc 0.2.139", +] + +[[package]] +name = "num_threads" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" +dependencies = [ + "libc 0.2.139", ] [[package]] @@ -3364,16 +3697,15 @@ version = "4.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80e47cfc4c0a1a519d9a025ebfbac3a2439d1b5cdf397d72dcb79b11d9920dab" dependencies = [ - "base64", + "base64 0.13.0", "chrono", "getrandom 0.2.3", "http", - "rand 0.8.3", - "reqwest", + "rand 0.8.5", "serde", "serde_json", "serde_path_to_error", - "sha2", + "sha2 0.9.1", "thiserror", "url", ] @@ -3389,9 +3721,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.10.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9" +checksum = 
"86f0b0d4bf799edbc74508c1e8bf170ff5f41238e5f8225603ca7caaae2b7860" [[package]] name = "online_config" @@ -3426,18 +3758,30 @@ checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" [[package]] name = "openssl" -version = "0.10.38" +version = "0.10.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c7ae222234c30df141154f159066c5093ff73b63204dcda7121eb082fc56a95" +checksum = "7e30d8bc91859781f0a943411186324d580f2bbeb71b452fe91ae344806af3f1" dependencies = [ "bitflags", "cfg-if 1.0.0", "foreign-types", - "libc 0.2.125", + "libc 0.2.139", "once_cell", + "openssl-macros", "openssl-sys", ] +[[package]] +name = "openssl-macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "openssl-probe" version = "0.1.2" @@ -3446,22 +3790,21 @@ checksum = "77af24da69f9d9341038eba93a073b1fdaaa1b788221b00a69bce9e762cb32de" [[package]] name = "openssl-src" -version = "111.17.0+1.1.1m" +version = "111.25.0+1.1.1t" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05d6a336abd10814198f66e2a91ccd7336611f30334119ca8ce300536666fcf4" +checksum = "3173cd3626c43e3854b1b727422a276e568d9ec5fe8cec197822cf52cfb743d6" dependencies = [ "cc", ] [[package]] name = "openssl-sys" -version = "0.9.72" +version = "0.9.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e46109c383602735fa0a2e48dd2b7c892b048e1bf69e5c3b1d804b7d9c203cb" +checksum = "0d3d193fb1488ad46ffe3aaabc912cc931d02ee8518fe2959aea8ef52718b0c0" dependencies = [ - "autocfg", "cc", - "libc 0.2.125", + "libc 0.2.139", "openssl-src", "pkg-config", "vcpkg", @@ -3469,18 +3812,9 @@ dependencies = [ [[package]] name = "ordered-float" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3305af35278dd29f46fcdd139e0b1fbfae2153f0e5928b39b035542dd31e37b7" -dependencies = [ - "num-traits", -] - -[[package]] -name = "ordered-float" -version = "2.7.0" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "039f02eb0f69271f26abe3202189275d7aa2258b903cb0281b5de710a2570ff3" +checksum = "7940cf2ca942593318d07fcf2596cdca60a85c9e7fab408a5e21a4f9dcd40d87" dependencies = [ "num-traits", ] @@ -3500,7 +3834,7 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eebde548fbbf1ea81a99b128872779c437752fb99f217c45245e1a61dcd9edcd" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", "winapi 0.3.9", ] @@ -3508,6 +3842,12 @@ dependencies = [ name = "panic_hook" version = "0.0.1" +[[package]] +name = "parking" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14f2252c834a40ed9bb5422029649578e63aa341ac401f74e719dd1afda8394e" + [[package]] name = "parking_lot" version = "0.11.1" @@ -3521,9 +3861,9 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f5ec2493a61ac0506c0f4199f99070cbe83857b0337006a30f3e6719b8ef58" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", "parking_lot_core 0.9.1", @@ -3537,7 +3877,7 @@ checksum = "fa7a782938e745763fe6907fc6ba86946d72f49fe7e21de074e08128a99fb018" dependencies = [ "cfg-if 1.0.0", "instant", - "libc 0.2.125", + "libc 0.2.139", "redox_syscall 0.2.11", "smallvec", "winapi 0.3.9", @@ -3550,10 +3890,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28141e0cc4143da2443301914478dc976a61ffdb3f043058310c70df2fed8954" dependencies = [ "cfg-if 1.0.0", - "libc 0.2.125", + "libc 0.2.139", "redox_syscall 0.2.11", "smallvec", - "windows-sys", + "windows-sys 0.32.0", ] [[package]] @@ -3585,6 +3925,7 @@ 
dependencies = [ "log", "log_wrappers", "prometheus", + "prometheus-static-metric", "security", "semver 0.10.0", "serde", @@ -3626,7 +3967,7 @@ checksum = "b8f94885300e262ef461aa9fd1afbf7df3caf9e84e271a74925d1c6c8b24830f" dependencies = [ "bitflags", "byteorder", - "libc 0.2.125", + "libc 0.2.139", "mmap", "nom 4.2.3", "phf", @@ -3644,9 +3985,9 @@ dependencies = [ [[package]] name = "petgraph" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "467d164a6de56270bd7c4d070df81d07beace25012d5103ced4e9ff08d6afdb7" +checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" dependencies = [ "fixedbitset", "indexmap", @@ -3678,7 +4019,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d43f3220d96e0080cc9ea234978ccd80d904eafb17be31bb0f76daaea6493082" dependencies = [ "phf_shared", - "rand 0.8.3", + "rand 0.8.5", ] [[package]] @@ -3692,18 +4033,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58ad3879ad3baf4e44784bc6a718a8698867bb991f8ce24d1bcbe2cfb4c3a75e" +checksum = "78203e83c48cffbe01e4a2d35d566ca4de445d79a85372fc64e378bfc812a260" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "744b6f092ba29c3650faf274db506afd39944f48420f6c86b17cfe0ee1cb36bb" +checksum = "710faf75e1b33345361201d36d04e98ac1ed8909151a017ed384700836104c74" dependencies = [ "proc-macro2", "quote", @@ -3712,9 +4053,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.6" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc0e1f259c92177c30a4c9d177246edd0a3568b25756a977d0632cf8fa37e905" +checksum = 
"e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" [[package]] name = "pin-utils" @@ -3769,7 +4110,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d27361d7578b410d0eb5fe815c2b2105b01ab770a7c738cb9a231457a809fcc7" dependencies = [ "ipnetwork", - "libc 0.2.125", + "libc 0.2.139", "pnet_base", "pnet_sys", "winapi 0.2.8", @@ -3781,25 +4122,26 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "82f881a6d75ac98c5541db6144682d1773bb14c6fc50c6ebac7086c8f7f23c29" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", "winapi 0.2.8", "ws2_32-sys", ] [[package]] name = "pprof" -version = "0.9.1" -source = "git+https://github.com/tikv/pprof-rs.git?rev=3fed55af8fc6cf69dbd954a0321c799c5a111e4e#3fed55af8fc6cf69dbd954a0321c799c5a111e4e" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "196ded5d4be535690899a4631cc9f18cdc41b7ebf24a79400f46f48e49a11059" dependencies = [ "backtrace", "cfg-if 1.0.0", "findshlibs", "inferno", - "libc 0.2.125", + "libc 0.2.139", "log", - "nix 0.24.1", + "nix 0.26.2", "once_cell", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "protobuf", "protobuf-codegen-pure", "smallvec", @@ -3814,6 +4156,16 @@ version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" +[[package]] +name = "prettyplease" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c142c0e46b57171fe0c528bee8c5b7569e80f0c17e377cd0e30ea57dbc11bb51" +dependencies = [ + "proc-macro2", + "syn", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -3824,7 +4176,7 @@ dependencies = [ "proc-macro2", "quote", "syn", - "version_check 0.9.2", + "version_check 0.9.4", ] [[package]] @@ -3835,7 +4187,7 @@ checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" dependencies = [ 
"proc-macro2", "quote", - "version_check 0.9.2", + "version_check 0.9.4", ] [[package]] @@ -3852,11 +4204,11 @@ checksum = "369a6ed065f249a159e06c45752c780bda2fb53c995718f9e484d08daa9eb42e" [[package]] name = "proc-macro2" -version = "1.0.36" +version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" +checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725" dependencies = [ - "unicode-xid", + "unicode-ident", ] [[package]] @@ -3869,7 +4221,7 @@ dependencies = [ "byteorder", "hex 0.4.2", "lazy_static", - "libc 0.2.125", + "libc 0.2.139", ] [[package]] @@ -3878,7 +4230,7 @@ version = "0.4.2" source = "git+https://github.com/tikv/procinfo-rs?rev=6599eb9dca74229b2c1fcc44118bef7eff127128#6599eb9dca74229b2c1fcc44118bef7eff127128" dependencies = [ "byteorder", - "libc 0.2.125", + "libc 0.2.139", "nom 2.2.1", "rustc_version 0.2.3", ] @@ -3903,7 +4255,7 @@ dependencies = [ "cfg-if 1.0.0", "fnv", "lazy_static", - "libc 0.2.125", + "libc 0.2.139", "memchr", "parking_lot 0.11.1", "protobuf", @@ -3925,9 +4277,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.8.0" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de5e2533f59d08fcf364fd374ebda0692a70bd6d7e66ef97f306f45c6c5d8020" +checksum = "a0841812012b2d4a6145fae9a6af1534873c32aa67fff26bd09f8fa42c83f95a" dependencies = [ "bytes", "prost-derive", @@ -3935,30 +4287,34 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.8.0" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "355f634b43cdd80724ee7848f95770e7e70eefa6dcf14fea676216573b8fd603" +checksum = "1d8b442418ea0822409d9e7d047cbf1e7e9e1760b172bf9982cf29d517c93511" dependencies = [ "bytes", - "heck 0.3.1", - "itertools 0.10.0", + "heck 0.4.0", + "itertools", + "lazy_static", "log", "multimap", "petgraph", + "prettyplease", "prost", 
"prost-types", + "regex", + "syn", "tempfile", "which", ] [[package]] name = "prost-derive" -version = "0.8.0" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "600d2f334aa05acb02a755e217ef1ab6dea4d51b58b7846588b747edec04efba" +checksum = "164ae68b6587001ca506d3bf7f1000bfa248d0e1217b618108fba4ec1d0cc306" dependencies = [ "anyhow", - "itertools 0.10.0", + "itertools", "proc-macro2", "quote", "syn", @@ -3966,9 +4322,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.8.0" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "603bbd6394701d13f3f25aada59c7de9d35a6a5887cfc156181234a44002771b" +checksum = "747761bc3dc48f9a34553bf65605cf6cb6288ba219f3450b4275dbd81539551a" dependencies = [ "bytes", "prost", @@ -3997,6 +4353,18 @@ dependencies = [ "regex", ] +[[package]] +name = "protobuf-build" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fb3c02f54ecaf12572c1a60dbdb36b1f8f713a16105881143f2be84cca5bbe3" +dependencies = [ + "bitflags", + "protobuf", + "protobuf-codegen", + "regex", +] + [[package]] name = "protobuf-codegen" version = "2.8.0" @@ -4025,11 +4393,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "quick-xml" +version = "0.28.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce5e73202a820a31f8a0ee32ada5e21029c81fd9e3ebf668a40832e4219d9d1" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quote" -version = "1.0.9" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" +checksum = "a1feb54ed693b93a84e14094943b84b7c4eae204c512b7ccb95ab0c66d278ad1" dependencies = [ "proc-macro2", ] @@ -4037,39 +4415,40 @@ dependencies = [ [[package]] name = "raft" version = "0.7.0" -source = 
"git+https://github.com/tikv/raft-rs?branch=master#2357cb22760719bcd107a90d1e64ef505bdb1e15" +source = "git+https://github.com/tikv/raft-rs?branch=master#f73766712a538c2f6eb135b455297ad6c03fc58d" dependencies = [ "bytes", "fxhash", "getset", "protobuf", "raft-proto", - "rand 0.8.3", + "rand 0.8.5", "slog", "thiserror", ] [[package]] name = "raft-engine" -version = "0.1.0" -source = "git+https://github.com/tikv/raft-engine.git#0e066f8626b43b2a8a0a6bc9c7f0502b6fdc3d05" +version = "0.3.0" +source = "git+https://github.com/tikv/raft-engine.git#39f4db451295dbd8b30db4f94f220182c2c65be9" dependencies = [ "byteorder", "crc32fast", "crossbeam", "fail", "fs2", - "hashbrown 0.12.0", + "hashbrown 0.13.2", "hex 0.4.2", + "if_chain", "lazy_static", - "libc 0.2.125", + "libc 0.2.139", "log", "lz4-sys", "memmap2", - "nix 0.24.1", + "nix 0.26.2", "num-derive", "num-traits", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "prometheus", "prometheus-static-metric", "protobuf", @@ -4077,33 +4456,36 @@ dependencies = [ "rhai", "scopeguard", "serde", + "serde_repr", + "strum 0.24.1", "thiserror", ] [[package]] name = "raft-engine-ctl" -version = "0.1.0" -source = "git+https://github.com/tikv/raft-engine.git#0e066f8626b43b2a8a0a6bc9c7f0502b6fdc3d05" +version = "0.3.0" +source = "git+https://github.com/tikv/raft-engine.git#39f4db451295dbd8b30db4f94f220182c2c65be9" dependencies = [ "clap 3.1.6", - "env_logger", + "env_logger 0.10.0", "raft-engine", ] [[package]] name = "raft-proto" version = "0.7.0" -source = "git+https://github.com/tikv/raft-rs?branch=master#2357cb22760719bcd107a90d1e64ef505bdb1e15" +source = "git+https://github.com/tikv/raft-rs?branch=master#f73766712a538c2f6eb135b455297ad6c03fc58d" dependencies = [ "bytes", "protobuf", - "protobuf-build", + "protobuf-build 0.14.0", ] [[package]] name = "raft_log_engine" version = "0.0.1" dependencies = [ + "codec", "encryption", "engine_traits", "file_system", @@ -4118,8 +4500,10 @@ dependencies = [ "serde_derive", "slog", "slog-global", 
+ "tempfile", "tikv_util", - "time", + "time 0.1.42", + "tracker", ] [[package]] @@ -4130,6 +4514,7 @@ dependencies = [ "bitflags", "byteorder", "bytes", + "causal_ts", "collections", "concurrency_manager", "crc32fast", @@ -4150,7 +4535,7 @@ dependencies = [ "getset", "grpcio-health", "into_other", - "itertools 0.10.0", + "itertools", "keys", "kvproto", "lazy_static", @@ -4159,16 +4544,17 @@ dependencies = [ "memory_trace_macros", "online_config", "openssl", - "ordered-float 2.7.0", + "ordered-float", "panic_hook", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pd_client", "prometheus", "prometheus-static-metric", "protobuf", "raft", "raft-proto", - "rand 0.8.3", + "rand 0.8.5", + "resource_control", "resource_metering", "serde", "serde_derive", @@ -4183,21 +4569,69 @@ dependencies = [ "tidb_query_datatype", "tikv_alloc", "tikv_util", - "time", + "time 0.1.42", "tokio", + "tracker", "txn_types", - "uuid", + "uuid 0.8.2", "yatp", ] [[package]] -name = "rand" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" +name = "raftstore-v2" +version = "0.1.0" +dependencies = [ + "batch-system", + "bytes", + "causal_ts", + "collections", + "concurrency_manager", + "crossbeam", + "encryption_export", + "engine_rocks", + "engine_test", + "engine_traits", + "error_code", + "fail", + "file_system", + "fs2", + "futures 0.3.15", + "keys", + "kvproto", + "log_wrappers", + "parking_lot 0.12.1", + "pd_client", + "prometheus", + "protobuf", + "raft", + "raft-proto", + "raftstore", + "rand 0.8.5", + "resource_control", + "resource_metering", + "slog", + "slog-global", + "smallvec", + "sst_importer", + "tempfile", + "test_pd", + "test_util", + "thiserror", + "tikv_util", + "time 0.1.42", + "tracker", + "txn_types", + "yatp", +] + +[[package]] +name = "rand" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" dependencies = [ "fuchsia-cprng", - 
"libc 0.2.125", + "libc 0.2.139", "rand_core 0.3.1", "rdrand", "winapi 0.3.9", @@ -4210,22 +4644,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" dependencies = [ "getrandom 0.1.12", - "libc 0.2.125", + "libc 0.2.139", "rand_chacha 0.2.1", "rand_core 0.5.1", - "rand_hc 0.2.0", + "rand_hc", ] [[package]] name = "rand" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", "rand_chacha 0.3.0", "rand_core 0.6.2", - "rand_hc 0.3.0", ] [[package]] @@ -4290,15 +4723,6 @@ dependencies = [ "rand_core 0.5.1", ] -[[package]] -name = "rand_hc" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73" -dependencies = [ - "rand_core 0.6.2", -] - [[package]] name = "rand_isaac" version = "0.3.0" @@ -4328,9 +4752,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.5.0" +version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b0d8e0819fadc20c74ea8373106ead0600e3a67ef1fe8da56e39b9ae7275674" +checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" dependencies = [ "autocfg", "crossbeam-deque", @@ -4340,14 +4764,13 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.9.0" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ab346ac5921dc62ffa9f89b7a773907511cdfa5490c572ae9be1be33e8afa4a" +checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" dependencies = [ "crossbeam-channel", "crossbeam-deque", - "crossbeam-utils 0.8.3 
(registry+https://github.com/rust-lang/crates.io-index)", - "lazy_static", + "crossbeam-utils 0.8.8", "num_cpus", ] @@ -4387,9 +4810,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.5.4" +version = "1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +checksum = "d83f127d94bdbcda4c8cc2e50f6f84f4b611f69c902699ca385a39c3a75f9ff1" dependencies = [ "aho-corasick", "memchr", @@ -4407,9 +4830,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.25" +version = "0.6.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" +checksum = "49b3de9ec5dc0a3417da371aab17d729997c15010e7fd24ff707773a33bddb64" [[package]] name = "remove_dir_all" @@ -4426,7 +4849,7 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0460542b551950620a3648c6aa23318ac6b3cd779114bd873209e6e8b5eb1c34" dependencies = [ - "base64", + "base64 0.13.0", "bytes", "encoding_rs 0.8.29 (registry+https://github.com/rust-lang/crates.io-index)", "futures-core", @@ -4444,7 +4867,6 @@ dependencies = [ "percent-encoding", "pin-project-lite", "serde", - "serde_json", "serde_urlencoded", "tokio", "tokio-native-tls", @@ -4483,6 +4905,7 @@ dependencies = [ "slog-global", "tempfile", "test_raftstore", + "test_sst_importer", "test_util", "thiserror", "tikv", @@ -4492,6 +4915,35 @@ dependencies = [ "txn_types", ] +[[package]] +name = "resource_control" +version = "0.0.1" +dependencies = [ + "byteorder", + "collections", + "crossbeam", + "crossbeam-skiplist", + "dashmap", + "fail", + "futures 0.3.15", + "kvproto", + "lazy_static", + "online_config", + "parking_lot 0.12.1", + "pd_client", + "pin-project", + "prometheus", + "protobuf", + "rand 0.8.5", + "serde", + "slog", + "slog-global", + "test_pd", + "test_pd_client", + "tikv_util", + "yatp", +] + [[package]] 
name = "resource_metering" version = "0.0.1" @@ -4502,19 +4954,18 @@ dependencies = [ "grpcio", "kvproto", "lazy_static", - "libc 0.2.125", + "libc 0.2.139", "log", "online_config", "pdqselect", "pin-project", "procinfo", "prometheus", - "rand 0.8.3", + "rand 0.8.5", "serde", "serde_derive", "slog", "slog-global", - "thread-id", "tikv_util", ] @@ -4526,17 +4977,21 @@ checksum = "18eb52b6664d331053136fcac7e4883bdc6f5fc04a6aab3b0f75eafb80ab88b3" [[package]] name = "rgb" -version = "0.8.14" +version = "0.8.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089e4031214d129e201f8c3c8c2fe97cd7322478a0d1cdf78e7029b0042efdb" +checksum = "e74fdc210d8f24a7dbfedc13b04ba5764f5232754ccebfdf5fff1bad791ccbc6" +dependencies = [ + "bytemuck", +] [[package]] name = "rhai" -version = "1.4.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "898b114d6cfa18af4593393fdc6c7437118e7e624d97f635fba8c75fd5c06f56" +checksum = "9f06953bb8b9e4307cb7ccc0d9d018e2ddd25a30d32831f631ce4fe8f17671f7" dependencies = [ - "ahash", + "ahash 0.7.4", + "bitflags", "instant", "num-traits", "rhai_codegen", @@ -4546,9 +5001,9 @@ dependencies = [ [[package]] name = "rhai_codegen" -version = "1.3.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e02d33d76a7aa8ec72ac8298d5b52134fd2dff77445ada0c65f6f8c40d8f2931" +checksum = "75a39bc2aa9258b282ee5518dac493491a9c4c11a6d7361b9d2644c922fc6488" dependencies = [ "proc-macro2", "quote", @@ -4562,7 +5017,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b72b84d47e8ec5a4f2872e8262b8f8256c5be1c938a7d6d3a867a3ba8f722f74" dependencies = [ "cc", - "libc 0.2.125", + "libc 0.2.139", "once_cell", "spin", "untrusted", @@ -4573,19 +5028,19 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#de8310c3983a30236ea03f802ed0c2401a4908ae" +source = 
"git+https://github.com/tikv/rust-rocksdb.git#062638a741adcd9074659eb28cbe7f6a676938d5" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", "librocksdb_sys", ] [[package]] name = "rusoto_core" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", - "base64", + "base64 0.13.0", "bytes", "crc32fast", "futures 0.3.15", @@ -4606,7 +5061,7 @@ dependencies = [ [[package]] name = "rusoto_credential" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", "chrono", @@ -4623,7 +5078,7 @@ dependencies = [ [[package]] name = "rusoto_kms" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", "bytes", @@ -4636,7 +5091,7 @@ dependencies = [ [[package]] name = "rusoto_mock" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", "chrono", @@ -4650,7 +5105,7 @@ dependencies = [ [[package]] name = "rusoto_s3" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ 
"async-trait", "bytes", @@ -4664,15 +5119,15 @@ dependencies = [ [[package]] name = "rusoto_signature" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ - "base64", + "base64 0.13.0", "bytes", "chrono", - "digest", + "digest 0.9.0", "futures 0.3.15", "hex 0.4.2", - "hmac", + "hmac 0.10.1", "http", "hyper", "log", @@ -4682,14 +5137,14 @@ dependencies = [ "rusoto_credential", "rustc_version 0.3.3", "serde", - "sha2", + "sha2 0.9.1", "tokio", ] [[package]] name = "rusoto_sts" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", "bytes", @@ -4752,16 +5207,17 @@ dependencies = [ ] [[package]] -name = "rustls" -version = "0.19.1" +name = "rustix" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7" +checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03" dependencies = [ - "base64", - "log", - "ring", - "sct", - "webpki", + "bitflags", + "errno", + "io-lifetimes", + "libc 0.2.139", + "linux-raw-sys", + "windows-sys 0.42.0", ] [[package]] @@ -4807,16 +5263,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" -[[package]] -name = "sct" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3042af939fca8c3453b7af0f1c66e533a15a86169e39de2657310ade8f98d3c" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "seahash" version = 
"4.1.0" @@ -4830,12 +5276,12 @@ dependencies = [ "collections", "encryption", "grpcio", + "kvproto", "serde", "serde_derive", "serde_json", "tempfile", "tikv_util", - "tonic", ] [[package]] @@ -4847,7 +5293,7 @@ dependencies = [ "bitflags", "core-foundation", "core-foundation-sys", - "libc 0.2.125", + "libc 0.2.139", "security-framework-sys", ] @@ -4858,7 +5304,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3676258fd3cfe2c9a0ec99ce3038798d847ce3e4bb17746373eb9f0f1ac16339" dependencies = [ "core-foundation-sys", - "libc 0.2.125", + "libc 0.2.139", ] [[package]] @@ -4912,25 +5358,13 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.106" +version = "1.0.147" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36df6ac6412072f67cf767ebbde4133a5b2e88e76dc6187fa7104cd16f783399" +checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965" dependencies = [ "serde_derive", ] -[[package]] -name = "serde-xml-rs" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0bf1ba0696ccf0872866277143ff1fd14d22eec235d2b23702f95e6660f7dfa" -dependencies = [ - "log", - "serde", - "thiserror", - "xml-rs", -] - [[package]] name = "serde_cbor" version = "0.11.1" @@ -4943,9 +5377,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.106" +version = "1.0.147" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e549e3abf4fb8621bd1609f11dfc9f5e50320802273b12f3811a67e6716ea6c" +checksum = "4f1d362ca8fc9c3e3a7484440752472d68a6caa98f1ab81d99b5dfe517cec852" dependencies = [ "proc-macro2", "quote", @@ -4982,6 +5416,28 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_qs" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" +dependencies = [ + "percent-encoding", + "serde", + "thiserror", +] + 
+[[package]] +name = "serde_repr" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fe39d9fbb0ebf5eb2c7cb7e2a47e4f462fad1379f1166b8ae49ad9eae89a7ca" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "serde_urlencoded" version = "0.7.0" @@ -5043,24 +5499,26 @@ dependencies = [ "hex 0.4.2", "keys", "kvproto", - "libc 0.2.125", + "libc 0.2.139", "log", "log_wrappers", - "nix 0.23.0", "pd_client", "prometheus", "protobuf", "raft", "raft_log_engine", "raftstore", - "rand 0.8.3", + "raftstore-v2", + "rand 0.8.5", "resolved_ts", + "resource_control", "resource_metering", "security", "serde_json", - "signal", + "signal-hook", "slog", "slog-global", + "snap_recovery", "tempfile", "tikv", "tikv_alloc", @@ -5077,13 +5535,24 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2933378ddfeda7ea26f48c555bdad8bb446bf8a3d17832dc83e380d444cfb8c1" dependencies = [ - "block-buffer", + "block-buffer 0.9.0", "cfg-if 0.1.10", "cpuid-bool", - "digest", + "digest 0.9.0", "opaque-debug", ] +[[package]] +name = "sha2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" +dependencies = [ + "cfg-if 1.0.0", + "cpufeatures", + "digest 0.10.6", +] + [[package]] name = "shlex" version = "0.1.1" @@ -5097,22 +5566,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" [[package]] -name = "signal" -version = "0.6.0" +name = "signal-hook" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "106428d9d96840ecdec5208c13ab8a4e28c38da1e0ccf2909fb44e41b992f897" +checksum = "a253b5e89e2698464fc26b545c9edceb338e18a89effeeecfea192c3025be29d" dependencies = [ - "libc 0.2.125", - "nix 0.11.1", + "libc 0.2.139", + "signal-hook-registry", ] 
[[package]] name = "signal-hook-registry" -version = "1.2.2" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce32ea0c6c56d5eacaeb814fbed9960547021d3edd010ded1425f180536b20ab" +checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", ] [[package]] @@ -5200,11 +5669,43 @@ checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" [[package]] name = "smartstring" -version = "0.2.10" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e714dff2b33f2321fdcd475b71cec79781a692d846f37f415fb395a1d2bcd48e" +checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" dependencies = [ + "autocfg", "static_assertions", + "version_check 0.9.4", +] + +[[package]] +name = "snap_recovery" +version = "0.1.0" +dependencies = [ + "chrono", + "encryption", + "encryption_export", + "engine_rocks", + "engine_traits", + "futures 0.3.15", + "grpcio", + "keys", + "kvproto", + "log", + "pd_client", + "protobuf", + "raft_log_engine", + "raftstore", + "slog", + "slog-global", + "structopt", + "tempfile", + "thiserror", + "tikv", + "tikv_alloc", + "tikv_util", + "toml", + "txn_types", ] [[package]] @@ -5213,7 +5714,7 @@ version = "0.1.0" source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#8c12738bad811397600455d6982aff754ea2ac44" dependencies = [ "cmake", - "libc 0.2.125", + "libc 0.2.139", "pkg-config", ] @@ -5237,11 +5738,11 @@ dependencies = [ [[package]] name = "socket2" -version = "0.4.4" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66d72b759436ae32898a2af0a14218dbf55efde3feeb170eb623637db85ee1e0" +checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", "winapi 0.3.9", ] @@ -5271,8 +5772,10 @@ dependencies = [ "kvproto", "lazy_static", 
"log_wrappers", + "online_config", "openssl", "prometheus", + "rand 0.8.5", "serde", "serde_derive", "slog", @@ -5285,7 +5788,7 @@ dependencies = [ "tikv_util", "tokio", "txn_types", - "uuid", + "uuid 0.8.2", ] [[package]] @@ -5300,12 +5803,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" -[[package]] -name = "str-buf" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d44a3643b4ff9caf57abcee9c2c621d6c03d9135e0d8b589bd9afb5992cb176a" - [[package]] name = "str_stack" version = "0.1.0" @@ -5360,7 +5857,16 @@ version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7318c509b5ba57f18533982607f24070a55d353e90d4cae30c467cdb2ad5ac5c" dependencies = [ - "strum_macros", + "strum_macros 0.20.1", +] + +[[package]] +name = "strum" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" +dependencies = [ + "strum_macros 0.24.2", ] [[package]] @@ -5375,29 +5881,42 @@ dependencies = [ "syn", ] +[[package]] +name = "strum_macros" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4faebde00e8ff94316c01800f9054fd2ba77d30d9e922541913051d1d978918b" +dependencies = [ + "heck 0.4.0", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "subtle" -version = "2.3.0" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343f3f510c2915908f155e94f17220b19ccfacf2a64a2a5d8004f2c3e311e7fd" +checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" [[package]] name = "symbolic-common" -version = "8.0.0" +version = "10.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0caab39ce6f074031b8fd3dd297bfda70a2d1f33c6e7cc1b737ac401f856448d" +checksum = "ac457d054f793cedfde6f32d21d692b8351cfec9084fefd0470c0373f6d799bc" dependencies = [ "debugid", - "memmap", + "memmap2", "stable_deref_trait", - "uuid", + "uuid 1.2.1", ] [[package]] name = "symbolic-demangle" -version = "8.0.0" +version = "10.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b77ecb5460a87faa37ed53521eed8f073c8339b7a5788c1f93efc09ce74e1b68" +checksum = "48808b846eef84e0ac06365dc620f028ae632355e5dcffc007bf1b2bf5eab17b" dependencies = [ "rustc-demangle", "symbolic-common", @@ -5405,25 +5924,30 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.86" +version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a65b3f4ffa0092e9887669db0eae07941f023991ab58ea44da8fe8e2d511c6b" +checksum = "a864042229133ada95abf3b54fdc62ef5ccabe9515b64717bcb9a1919e59445d" dependencies = [ "proc-macro2", "quote", - "unicode-xid", + "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8" + [[package]] name = "sysinfo" -version = "0.16.4" +version = "0.26.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c280c91abd1aed2e36be1bc8f56fbc7a2acbb2b58fbcac9641510179cc72dd9" +checksum = "ade661fa5e048ada64ad7901713301c21d2dbc5b65ee7967de8826c111452960" dependencies = [ "cfg-if 1.0.0", "core-foundation-sys", - "doc-comment", - "libc 0.2.125", + "libc 0.2.139", "ntapi", "once_cell", "rayon", @@ -5442,7 +5966,7 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d20ec2d6525a66afebdff9e1d8ef143c9deae9a3b040c61d3cfa9ae6fda80060" dependencies = [ - "base64", + "base64 0.13.0", "bytes", "chrono", "futures-util", @@ -5462,7 +5986,7 @@ version = "0.4.7" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "9435c9348e480fad0f2215d5602e2dfad03df8a6398c4e7ceaeaa42758f26a8a" dependencies = [ - "base64", + "base64 0.13.0", "chrono", "http", "lock_api", @@ -5506,8 +6030,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" dependencies = [ "cfg-if 1.0.0", - "libc 0.2.125", - "rand 0.8.3", + "libc 0.2.139", + "rand 0.8.5", "redox_syscall 0.2.11", "remove_dir_all", "winapi 0.3.9", @@ -5551,7 +6075,7 @@ dependencies = [ "grpcio", "kvproto", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "tempfile", "test_raftstore", "tidb_query_common", @@ -5590,11 +6114,35 @@ dependencies = [ "futures 0.3.15", "grpcio", "kvproto", + "log_wrappers", "pd_client", "security", "slog", "slog-global", "tikv_util", + "tokio", + "tokio-stream", +] + +[[package]] +name = "test_pd_client" +version = "0.0.1" +dependencies = [ + "collections", + "fail", + "futures 0.3.15", + "grpcio", + "keys", + "kvproto", + "log_wrappers", + "pd_client", + "raft", + "slog", + "slog-global", + "tikv_util", + "tokio", + "tokio-timer", + "txn_types", ] [[package]] @@ -5625,14 +6173,16 @@ dependencies = [ "protobuf", "raft", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "resolved_ts", + "resource_control", "resource_metering", "security", "server", "slog", "slog-global", "tempfile", + "test_pd_client", "test_util", "tikv", "tikv_util", @@ -5641,6 +6191,63 @@ dependencies = [ "txn_types", ] +[[package]] +name = "test_raftstore-v2" +version = "0.0.1" +dependencies = [ + "api_version", + "backtrace", + "causal_ts", + "collections", + "concurrency_manager", + "crossbeam", + "encryption_export", + "engine_rocks", + "engine_rocks_helper", + "engine_test", + "engine_traits", + "fail", + "file_system", + "futures 0.3.15", + "grpcio", + "grpcio-health", + "keys", + "kvproto", + "lazy_static", + "log_wrappers", + "pd_client", + "protobuf", + "raft", + "raftstore", + 
"raftstore-v2", + "rand 0.8.5", + "resolved_ts", + "resource_control", + "resource_metering", + "security", + "server", + "slog", + "slog-global", + "tempfile", + "test_pd_client", + "test_raftstore", + "test_util", + "tikv", + "tikv_util", + "tokio", + "tokio-timer", + "txn_types", +] + +[[package]] +name = "test_raftstore_macro" +version = "0.0.1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "test_sst_importer" version = "0.1.0" @@ -5650,7 +6257,7 @@ dependencies = [ "engine_traits", "keys", "kvproto", - "uuid", + "uuid 0.8.2", ] [[package]] @@ -5666,6 +6273,7 @@ dependencies = [ "test_raftstore", "tikv", "tikv_util", + "tracker", "txn_types", ] @@ -5679,14 +6287,14 @@ dependencies = [ "fail", "grpcio", "kvproto", - "rand 0.8.3", + "rand 0.8.5", "rand_isaac", "security", "slog", "slog-global", "tempfile", "tikv_util", - "time", + "time 0.1.42", ] [[package]] @@ -5695,6 +6303,7 @@ version = "0.0.1" dependencies = [ "api_version", "arrow", + "async-trait", "batch-system", "byteorder", "causal_ts", @@ -5709,6 +6318,7 @@ dependencies = [ "encryption", "engine_rocks", "engine_rocks_helper", + "engine_test", "engine_traits", "error_code", "external_storage_export", @@ -5720,7 +6330,7 @@ dependencies = [ "hyper", "keys", "kvproto", - "libc 0.2.125", + "libc 0.2.139", "log_wrappers", "more-asserts", "online_config", @@ -5734,8 +6344,9 @@ dependencies = [ "raft", "raft_log_engine", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "rand_xorshift", + "resource_control", "resource_metering", "security", "serde_json", @@ -5746,7 +6357,10 @@ dependencies = [ "test_backup", "test_coprocessor", "test_pd", + "test_pd_client", "test_raftstore", + "test_raftstore-v2", + "test_raftstore_macro", "test_sst_importer", "test_storage", "test_util", @@ -5756,14 +6370,15 @@ dependencies = [ "tidb_query_executors", "tidb_query_expr", "tikv", + "tikv_kv", "tikv_util", - "time", + "time 0.1.42", "tipb", "tipb_helper", "tokio", "toml", "txn_types", - "uuid", + 
"uuid 0.8.2", ] [[package]] @@ -5801,17 +6416,6 @@ dependencies = [ "syn", ] -[[package]] -name = "thread-id" -version = "4.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fdfe0627923f7411a43ec9ec9c39c3a9b4151be313e0922042581fb6c9b717f" -dependencies = [ - "libc 0.2.125", - "redox_syscall 0.2.11", - "winapi 0.3.9", -] - [[package]] name = "thread_local" version = "1.1.4" @@ -5825,7 +6429,7 @@ dependencies = [ name = "tidb_query_aggr" version = "0.0.1" dependencies = [ - "match_template", + "match-template", "panic_hook", "tidb_query_codegen", "tidb_query_common", @@ -5852,9 +6456,12 @@ name = "tidb_query_common" version = "0.0.1" dependencies = [ "anyhow", + "api_version", + "async-trait", "byteorder", "derive_more", "error_code", + "futures 0.3.15", "kvproto", "lazy_static", "log_wrappers", @@ -5863,13 +6470,16 @@ dependencies = [ "serde_json", "thiserror", "tikv_util", - "time", + "time 0.1.42", + "yatp", ] [[package]] name = "tidb_query_datatype" version = "0.0.1" dependencies = [ + "api_version", + "base64 0.13.0", "bitfield", "bitflags", "boolinator", @@ -5878,18 +6488,19 @@ dependencies = [ "chrono-tz", "codec", "collections", + "crc32fast", "encoding_rs 0.8.29 (git+https://github.com/xiongjiwei/encoding_rs.git?rev=68e0bc5a72a37a78228d80cd98047326559cf43c)", "error_code", "hex 0.4.2", "kvproto", "lazy_static", "log_wrappers", - "match_template", - "nom 5.1.0", + "match-template", + "nom 7.1.0", "num 0.3.0", "num-derive", "num-traits", - "ordered-float 1.1.1", + "ordered-float", "protobuf", "regex", "serde", @@ -5909,14 +6520,16 @@ name = "tidb_query_executors" version = "0.0.1" dependencies = [ "anyhow", + "api_version", + "async-trait", "codec", "collections", "fail", "futures 0.3.15", - "itertools 0.10.0", + "itertools", "kvproto", "log_wrappers", - "match_template", + "match-template", "protobuf", "slog", "slog-global", @@ -5936,7 +6549,7 @@ dependencies = [ name = "tidb_query_expr" version = "0.0.1" dependencies = [ - 
"base64", + "base64 0.13.0", "bstr", "byteorder", "chrono", @@ -5945,14 +6558,14 @@ dependencies = [ "flate2", "hex 0.4.2", "log_wrappers", - "match_template", + "match-template", "num 0.3.0", "num-traits", "openssl", "panic_hook", "profiler", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "regex", "safemem", "serde", @@ -5962,16 +6575,16 @@ dependencies = [ "tidb_query_common", "tidb_query_datatype", "tikv_util", - "time", + "time 0.1.42", "tipb", "tipb_helper", "twoway", - "uuid", + "uuid 0.8.2", ] [[package]] name = "tikv" -version = "6.1.0-alpha" +version = "7.2.0-alpha" dependencies = [ "anyhow", "api_version", @@ -5990,6 +6603,7 @@ dependencies = [ "crc32fast", "crc64fast", "crossbeam", + "dashmap", "encryption_export", "engine_panic", "engine_rocks", @@ -5997,7 +6611,7 @@ dependencies = [ "engine_traits", "engine_traits_tests", "error_code", - "example_plugin", + "example_coprocessor_plugin", "fail", "file_system", "flate2", @@ -6005,6 +6619,8 @@ dependencies = [ "futures-executor", "futures-timer", "futures-util", + "fxhash", + "getset", "grpcio", "grpcio-health", "hex 0.4.2", @@ -6013,19 +6629,20 @@ dependencies = [ "hyper-openssl", "hyper-tls", "into_other", - "itertools 0.10.0", + "itertools", + "keyed_priority_queue", "keys", "kvproto", "lazy_static", - "libc 0.2.125", + "libc 0.2.139", "libloading", "log", "log_wrappers", - "match_template", + "match-template", "memory_trace_macros", "mime", "more-asserts", - "murmur3", + "mur3", "nom 5.1.0", "notify", "num-traits", @@ -6033,7 +6650,7 @@ dependencies = [ "online_config", "openssl", "panic_hook", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "paste", "pd_client", "pin-project", @@ -6046,9 +6663,11 @@ dependencies = [ "raft", "raft_log_engine", "raftstore", + "raftstore-v2", "rand 0.7.3", "regex", "reqwest", + "resource_control", "resource_metering", "rev_lines", "seahash", @@ -6060,8 +6679,10 @@ dependencies = [ "serde_json", "slog", "slog-global", + "smallvec", "sst_importer", - "strum", + "strum 
0.20.0", + "sync_wrapper", "sysinfo", "tempfile", "test_sst_importer", @@ -6075,15 +6696,16 @@ dependencies = [ "tikv_alloc", "tikv_kv", "tikv_util", - "time", + "time 0.1.42", "tipb", "tokio", "tokio-openssl", "tokio-timer", "toml", + "tracker", "txn_types", "url", - "uuid", + "uuid 0.8.2", "walkdir", "yatp", "zipf", @@ -6112,10 +6734,9 @@ dependencies = [ "hex 0.4.2", "keys", "kvproto", - "libc 0.2.125", + "libc 0.2.139", "log", "log_wrappers", - "nix 0.23.0", "pd_client", "prometheus", "protobuf", @@ -6123,12 +6744,12 @@ dependencies = [ "raft-engine-ctl", "raft_log_engine", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "regex", "security", "serde_json", "server", - "signal", + "signal-hook", "slog", "slog-global", "structopt", @@ -6136,7 +6757,7 @@ dependencies = [ "tikv", "tikv_alloc", "tikv_util", - "time", + "time 0.1.42", "tokio", "toml", "txn_types", @@ -6144,33 +6765,33 @@ dependencies = [ [[package]] name = "tikv-jemalloc-ctl" -version = "0.4.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb833c46ecbf8b6daeccb347cefcabf9c1beb5c9b0f853e1cec45632d9963e69" +checksum = "e37706572f4b151dff7a0146e040804e9c26fe3a3118591112f05cf12a4216c1" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", "paste", "tikv-jemalloc-sys", ] [[package]] name = "tikv-jemalloc-sys" -version = "0.4.3+5.2.1-patched.2" +version = "0.5.0+5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1792ccb507d955b46af42c123ea8863668fae24d03721e40cad6a41773dbb49" +checksum = "aeab4310214fe0226df8bfeb893a291a58b19682e8a07e1e1d4483ad4200d315" dependencies = [ "cc", "fs_extra", - "libc 0.2.125", + "libc 0.2.139", ] [[package]] name = "tikv-jemallocator" -version = "0.4.3" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5b7bcecfafe4998587d636f9ae9d55eb9d0499877b88757767c346875067098" +checksum = "20612db8a13a6c06d57ec83953694185a367e16945f66565e8028d2c0bd76979" 
dependencies = [ - "libc 0.2.125", + "libc 0.2.139", "tikv-jemalloc-sys", ] @@ -6180,9 +6801,10 @@ version = "0.0.1" dependencies = [ "cc", "clap 2.33.0", + "serde_json", "server", "tikv", - "time", + "time 0.1.42", "toml", ] @@ -6192,7 +6814,7 @@ version = "0.1.0" dependencies = [ "fxhash", "lazy_static", - "libc 0.2.125", + "libc 0.2.139", "mimalloc", "snmalloc-rs", "tcmalloc", @@ -6207,8 +6829,10 @@ name = "tikv_kv" version = "0.1.0" dependencies = [ "backtrace", + "collections", "engine_panic", "engine_rocks", + "engine_test", "engine_traits", "error_code", "fail", @@ -6222,6 +6846,7 @@ dependencies = [ "pd_client", "prometheus", "prometheus-static-metric", + "raft", "raftstore", "slog", "slog-global", @@ -6229,6 +6854,7 @@ dependencies = [ "tempfile", "thiserror", "tikv_util", + "tracker", "txn_types", ] @@ -6246,6 +6872,7 @@ dependencies = [ "cpu-time", "crc32fast", "crossbeam", + "crossbeam-skiplist", "derive_more", "error_code", "fail", @@ -6256,22 +6883,25 @@ dependencies = [ "http", "kvproto", "lazy_static", - "libc 0.2.125", + "libc 0.2.139", "log", "log_wrappers", - "nix 0.23.0", + "mnt", + "nix 0.24.1", "num-traits", "num_cpus", "online_config", "openssl", "page_size", "panic_hook", + "parking_lot_core 0.9.1", + "pin-project", "procfs", "procinfo", "prometheus", "prometheus-static-metric", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "regex", "rusoto_core", "serde", @@ -6285,11 +6915,12 @@ dependencies = [ "tempfile", "thiserror", "tikv_alloc", - "time", + "time 0.1.42", "tokio", "tokio-executor", "tokio-timer", "toml", + "tracker", "url", "utime", "yatp", @@ -6301,11 +6932,40 @@ version = "0.1.42" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f" dependencies = [ - "libc 0.2.125", + "libc 0.2.139", "redox_syscall 0.1.56", "winapi 0.3.9", ] +[[package]] +name = "time" +version = "0.3.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890" +dependencies = [ + "itoa 1.0.1", + "libc 0.2.139", + "num_threads", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" + +[[package]] +name = "time-macros" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd80a657e71da814b8e5d60d3374fc6d35045062245d80224748ae522dd76f36" +dependencies = [ + "time-core", +] + [[package]] name = "tinytemplate" version = "1.2.0" @@ -6319,12 +6979,12 @@ dependencies = [ [[package]] name = "tipb" version = "0.0.1" -source = "git+https://github.com/pingcap/tipb.git#f3286471a05a4454a1071dd5f66ac7dbf6c79ba3" +source = "git+https://github.com/pingcap/tipb.git#955fbdc879517f16b7a2f5967f143b92a6ab03dd" dependencies = [ "futures 0.3.15", "grpcio", "protobuf", - "protobuf-build", + "protobuf-build 0.13.0", ] [[package]] @@ -6338,17 +6998,17 @@ dependencies = [ [[package]] name = "tokio" -version = "1.17.0" +version = "1.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2af73ac49756f3f7c01172e34a23e5d0216f6c32333757c2c61feb2bbff5a5ee" +checksum = "a9e03c497dc955702ba729190dc4aac6f2a0ce97f913e5b1b5912fc5039d9099" dependencies = [ + "autocfg", "bytes", - "libc 0.2.125", + "libc 0.2.139", "memchr", - "mio 0.8.0", + "mio 0.8.5", "num_cpus", - "once_cell", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", "socket2", @@ -6409,22 +7069,11 @@ dependencies = [ "tokio", ] -[[package]] -name = "tokio-rustls" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6" -dependencies = [ - "rustls", - "tokio", - "webpki", -] - [[package]] name = "tokio-stream" -version = 
"0.1.8" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50145484efff8818b5ccd256697f36863f587da82cf8b409c53adf1e840798e3" +checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" dependencies = [ "futures-core", "pin-project-lite", @@ -6442,20 +7091,6 @@ dependencies = [ "tokio-executor", ] -[[package]] -name = "tokio-util" -version = "0.6.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "940a12c99365c31ea8dd9ba04ec1be183ffe4920102bb7122c2f515437601e8e" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "log", - "pin-project-lite", - "tokio", -] - [[package]] name = "tokio-util" version = "0.7.2" @@ -6468,6 +7103,7 @@ dependencies = [ "futures-sink", "pin-project-lite", "tokio", + "tracing", ] [[package]] @@ -6481,13 +7117,14 @@ dependencies = [ [[package]] name = "tonic" -version = "0.5.2" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "796c5e1cd49905e65dd8e700d4cb1dffcbfdb4fc9d017de08c1a537afd83627c" +checksum = "55b9af819e54b8f33d453655bef9b9acc171568fb49523078d0cc4e7484200ec" dependencies = [ "async-stream 0.3.3", "async-trait", - "base64", + "axum", + "base64 0.13.0", "bytes", "futures-core", "futures-util", @@ -6501,9 +7138,8 @@ dependencies = [ "prost", "prost-derive", "tokio", - "tokio-rustls", "tokio-stream", - "tokio-util 0.6.6", + "tokio-util", "tower", "tower-layer", "tower-service", @@ -6513,10 +7149,11 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.5.2" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12b52d07035516c2b74337d2ac7746075e7dcae7643816c1b12c5ff8a7484c08" +checksum = "48c6fd7c2581e36d63388a9e04c350c21beb7a8b059580b2e93993c526899ddc" dependencies = [ + "prettyplease", "proc-macro2", "prost-build", "quote", @@ -6525,24 +7162,43 @@ dependencies = [ [[package]] name = "tower" -version = "0.4.8" +version = 
"0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f60422bc7fefa2f3ec70359b8ff1caff59d785877eb70595904605bcc412470f" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" dependencies = [ "futures-core", "futures-util", "indexmap", "pin-project", - "rand 0.8.3", + "pin-project-lite", + "rand 0.8.5", "slab", "tokio", - "tokio-stream", - "tokio-util 0.6.6", + "tokio-util", "tower-layer", "tower-service", "tracing", ] +[[package]] +name = "tower-http" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c530c8675c1dbf98facee631536fa116b5fb6382d7dd6dc1b118d970eafe3ba" +dependencies = [ + "bitflags", + "bytes", + "futures-core", + "futures-util", + "http", + "http-body", + "http-range-header", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", +] + [[package]] name = "tower-layer" version = "0.3.1" @@ -6551,9 +7207,9 @@ checksum = "343bc9466d3fe6b0f960ef45960509f84480bf4fd96f92901afe7ff3df9d3a62" [[package]] name = "tower-service" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" @@ -6598,6 +7254,20 @@ dependencies = [ "tracing", ] +[[package]] +name = "tracker" +version = "0.0.1" +dependencies = [ + "collections", + "crossbeam-utils 0.8.8", + "kvproto", + "lazy_static", + "parking_lot 0.12.1", + "pin-project", + "prometheus", + "slab", +] + [[package]] name = "try-lock" version = "0.2.2" @@ -6633,7 +7303,7 @@ dependencies = [ "kvproto", "log_wrappers", "panic_hook", - "rand 0.8.3", + "rand 0.8.5", "slog", "thiserror", "tikv_alloc", @@ -6642,9 +7312,9 @@ dependencies = [ [[package]] name = "typenum" -version = "1.12.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33" +checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "ucd-trie" @@ -6667,6 +7337,12 @@ dependencies = [ "matches", ] +[[package]] +name = "unicode-ident" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" + [[package]] name = "unicode-normalization" version = "0.1.12" @@ -6688,12 +7364,6 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7007dbd421b92cc6e28410fe7362e2e0a2503394908f417b68ec8d1c364c4e20" -[[package]] -name = "unicode-xid" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" - [[package]] name = "untrusted" version = "0.7.1" @@ -6720,7 +7390,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "055058552ca15c566082fc61da433ae678f78986a6f16957e33162d1b218792a" dependencies = [ "kernel32-sys", - "libc 0.2.125", + "libc 0.2.139", "winapi 0.2.8", ] @@ -6734,6 +7404,15 @@ dependencies = [ "serde", ] +[[package]] +name = "uuid" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "feb41e78f93363bb2df8b0e86a2ca30eed7806ea16ea0c790d757cf93f79be83" +dependencies = [ + "getrandom 0.2.3", +] + [[package]] name = "valgrind_request" version = "1.1.0" @@ -6760,9 +7439,9 @@ checksum = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" [[package]] name = "version_check" -version = "0.9.2" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] name = "visible" @@ -6775,10 +7454,10 @@ dependencies = [ ] 
[[package]] -name = "void" -version = "1.0.2" +name = "waker-fn" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" +checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca" [[package]] name = "walkdir" @@ -6813,6 +7492,12 @@ version = "0.10.2+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + [[package]] name = "wasm-bindgen" version = "0.2.79" @@ -6891,16 +7576,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki" -version = "0.21.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab146130f5f790d45f82aeeb09e55a256573373ec64409fc19a6fb82fb1032ae" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "which" version = "4.2.4" @@ -6909,7 +7584,7 @@ checksum = "2a5a7e487e921cf220206864a94a89b6c6905bfc19f1057fa26a4cb360e5c1d2" dependencies = [ "either", "lazy_static", - "libc 0.2.125", + "libc 0.2.139", ] [[package]] @@ -6961,43 +7636,100 @@ version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3df6e476185f92a12c072be4a189a0210dcdcf512a1891d6dff9edb874deadc6" dependencies = [ - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_msvc", + "windows_aarch64_msvc 0.32.0", + "windows_i686_gnu 0.32.0", + "windows_i686_msvc 0.32.0", + "windows_x86_64_gnu 0.32.0", + "windows_x86_64_msvc 0.32.0", ] +[[package]] +name = "windows-sys" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc 0.42.0", + "windows_i686_gnu 0.42.0", + "windows_i686_msvc 0.42.0", + "windows_x86_64_gnu 0.42.0", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc 0.42.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" + [[package]] name = "windows_aarch64_msvc" version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8e92753b1c443191654ec532f14c199742964a061be25d77d7a96f09db20bf5" +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" + [[package]] name = "windows_i686_gnu" version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a711c68811799e017b6038e0922cb27a5e2f43a2ddb609fe0b6f3eeda9de615" +[[package]] +name = "windows_i686_gnu" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" + [[package]] name = "windows_i686_msvc" version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "146c11bb1a02615db74680b32a68e2d61f553cc24c4eb5b4ca10311740e44172" +[[package]] +name = "windows_i686_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" + [[package]] name = "windows_x86_64_gnu" version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c912b12f7454c6620635bbff3450962753834be2a594819bd5e945af18ec64bc" +[[package]] +name = "windows_x86_64_gnu" +version 
= "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" + [[package]] name = "windows_x86_64_msvc" version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "504a2476202769977a040c6364301a3f65d0cc9e3fb08600b2bda150a0488316" +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" + [[package]] name = "winreg" version = "0.7.0" @@ -7047,16 +7779,18 @@ checksum = "541b12c998c5b56aa2b4e6f18f03664eef9a4fd0a246a55594efae6cc2d964b5" [[package]] name = "yatp" version = "0.0.1" -source = "git+https://github.com/tikv/yatp.git?branch=master#5f3d58002b383bfd0014e271ae58261ecc072de3" +source = "git+https://github.com/tikv/yatp.git?branch=master#5523a9a6a4d0d6242bdb02b0a344f7ee1477b39b" dependencies = [ "crossbeam-deque", + "crossbeam-skiplist", + "crossbeam-utils 0.8.8", "dashmap", "fail", "lazy_static", "num_cpus", "parking_lot_core 0.9.1", "prometheus", - "rand 0.8.3", + "rand 0.8.5", ] [[package]] @@ -7074,14 +7808,31 @@ dependencies = [ "rand 0.7.3", ] +[[package]] +name = "zstd" +version = "0.11.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "5.0.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" +dependencies = [ + "libc 0.2.139", + "zstd-sys", +] + [[package]] name = "zstd-sys" 
-version = "1.4.19+zstd.1.4.8" +version = "2.0.1+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec24a9273d24437afb8e71b16f3d9a5d569193cccdb7896213b59f552f387674" +checksum = "9fd07cbbc53846d9145dbffdf6dd09a7a0aa52be46741825f5c97bdd4f73f12b" dependencies = [ "cc", - "glob", - "itertools 0.9.0", - "libc 0.2.125", + "libc 0.2.139", ] diff --git a/Cargo.toml b/Cargo.toml index 61759a4b68a..5363de8bd59 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "6.1.0-alpha" +version = "7.2.0-alpha" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" @@ -13,6 +13,7 @@ publish = false [features] default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] +trace-tablet-lifetime = ["engine_rocks/trace-lifetime"] tcmalloc = ["tikv_alloc/tcmalloc"] jemalloc = ["tikv_alloc/jemalloc", "engine_rocks/jemalloc"] mimalloc = ["tikv_alloc/mimalloc"] @@ -38,7 +39,7 @@ cloud-azure = [ "encryption_export/cloud-azure", "sst_importer/cloud-azure", ] -testexport = ["raftstore/testexport", "api_version/testexport"] +testexport = ["raftstore/testexport", "api_version/testexport", "causal_ts/testexport", "engine_traits/testexport", "engine_rocks/testexport", "engine_panic/testexport"] test-engine-kv-rocksdb = [ "engine_test/test-engine-kv-rocksdb" ] @@ -64,120 +65,130 @@ name = "tikv" [dependencies] anyhow = "1.0" -api_version = { path = "components/api_version", default-features = false } +api_version = { workspace = true } async-stream = "0.2" async-trait = "0.1" backtrace = "0.3" -batch-system = { path = "components/batch-system", default-features = false } +batch-system = { workspace = true } byteorder = "1.2" -case_macros = { path = "components/case_macros" } -causal_ts = { path = "components/causal_ts" } +case_macros = { workspace = true } +causal_ts = { workspace = true } 
chrono = "0.4" -codec = { path = "components/codec", default-features = false } -collections = { path = "components/collections" } -concurrency_manager = { path = "components/concurrency_manager", default-features = false } -coprocessor_plugin_api = { path = "components/coprocessor_plugin_api" } +codec = { workspace = true } +collections = { workspace = true } +concurrency_manager = { workspace = true } +coprocessor_plugin_api = { workspace = true } crc32fast = "1.2" crc64fast = "0.1" crossbeam = "0.8" -encryption_export = { path = "components/encryption/export", default-features = false } -engine_panic = { path = "components/engine_panic", default-features = false } -engine_rocks = { path = "components/engine_rocks", default-features = false } -engine_test = { path = "components/engine_test", default-features = false } -engine_traits = { path = "components/engine_traits", default-features = false } -engine_traits_tests = { path = "components/engine_traits_tests", default-features = false } -error_code = { path = "components/error_code", default-features = false } +dashmap = "5" +encryption_export = { workspace = true } +engine_panic = { workspace = true } +engine_rocks = { workspace = true } +engine_test = { workspace = true } +engine_traits = { workspace = true } +engine_traits_tests = { workspace = true } +error_code = { workspace = true } fail = "0.5" -file_system = { path = "components/file_system", default-features = false } +file_system = { workspace = true } flate2 = { version = "1.0", default-features = false, features = ["zlib"] } futures = { version = "0.3", features = ["thread-pool", "compat"] } futures-executor = "0.3.1" futures-timer = "3.0" futures-util = { version = "0.3.1", default-features = false, features = ["io", "async-await"] } -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } -grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } +fxhash = 
"0.2.1" +getset = "0.1" +grpcio = { workspace = true } +grpcio-health = { workspace = true } hex = "0.4" http = "0" hyper = { version = "0.14", features = ["full"] } hyper-tls = "0.5" -into_other = { path = "components/into_other", default-features = false } +into_other = { workspace = true } itertools = "0.10" -keys = { path = "components/keys", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +keyed_priority_queue = "0.4" +keys = { workspace = true } +kvproto = { workspace = true } lazy_static = "1.3" libc = "0.2" libloading = "0.7" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } -log_wrappers = { path = "components/log_wrappers" } -match_template = { path = "components/match_template" } -memory_trace_macros = { path = "components/memory_trace_macros" } +log_wrappers = { workspace = true } +match-template = "0.0.1" +memory_trace_macros = { workspace = true } mime = "0.3.13" more-asserts = "0.2" -murmur3 = "0.5.1" +mur3 = "0.1" nom = { version = "5.1.0", default-features = false, features = ["std"] } notify = "4" num-traits = "0.2.14" num_cpus = "1" -online_config = { path = "components/online_config" } +online_config = { workspace = true } openssl = "0.10" parking_lot = "0.12" paste = "1.0" -pd_client = { path = "components/pd_client", default-features = false } +pd_client = { workspace = true } pin-project = "1.0" pnet_datalink = "0.23" -pprof = { git = "https://github.com/tikv/pprof-rs.git", rev = "3fed55af8fc6cf69dbd954a0321c799c5a111e4e", default-features = false, features = ["flamegraph", "protobuf-codec"] } +pprof = { version = "0.11", default-features = false, features = ["flamegraph", "protobuf-codec"] } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raft_log_engine = { path = 
"components/raft_log_engine", default-features = false } -raftstore = { path = "components/raftstore", default-features = false } +raft_log_engine = { workspace = true } +raftstore = { workspace = true, features = ["engine_rocks"] } +raftstore-v2 = { workspace = true } rand = "0.7.3" regex = "1.3" -resource_metering = { path = "components/resource_metering" } +resource_control = { workspace = true } +resource_metering = { workspace = true } rev_lines = "0.2.1" seahash = "4.1.0" -security = { path = "components/security", default-features = false } +security = { workspace = true } semver = "0.11" serde = { version = "1.0", features = ["derive"] } serde_derive = "1.0" serde_ignored = "0.1" -serde_json = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -sst_importer = { path = "components/sst_importer", default-features = false } +serde_json = { version = "1.0", features = ["preserve_order"] } +slog = { workspace = true } +slog-global = { workspace = true } +smallvec = "1.4" +sst_importer = { workspace = true } strum = { version = "0.20", features = ["derive"] } -sysinfo = "0.16" +sync_wrapper = "0.1.1" +sysinfo = "0.26" tempfile = "3.0" thiserror = "1.0" -tidb_query_aggr = { path = "components/tidb_query_aggr", default-features = false } -tidb_query_common = { path = "components/tidb_query_common", default-features = false } -tidb_query_datatype = { path = "components/tidb_query_datatype", default-features = false } -tidb_query_executors = { path = "components/tidb_query_executors", default-features = false } -tidb_query_expr = { path = "components/tidb_query_expr", default-features = false } -tikv_alloc = { path = "components/tikv_alloc" } -tikv_kv = { path = "components/tikv_kv", default-features = false } -tikv_util = { path = "components/tikv_util", default-features = false } 
+tidb_query_aggr = { workspace = true } +tidb_query_common = { workspace = true } +tidb_query_datatype = { workspace = true } +tidb_query_executors = { workspace = true } +tidb_query_expr = { workspace = true } +tikv_alloc = { workspace = true } +tikv_kv = { workspace = true } +tikv_util = { workspace = true } time = "0.1" -tipb = { git = "https://github.com/pingcap/tipb.git" } +tipb = { workspace = true } tokio = { version = "1.17", features = ["full"] } tokio-openssl = "0.6" -tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +tokio-timer = { workspace = true } toml = "0.5" -txn_types = { path = "components/txn_types", default-features = false } +tracker = { workspace = true } +txn_types = { workspace = true } url = "2" uuid = { version = "0.8.1", features = ["serde", "v4"] } walkdir = "2" -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [dev-dependencies] -api_version = { path = "components/api_version", features = ["testexport"] } -example_plugin = { path = "components/test_coprocessor_plugin/example_plugin" } # should be a binary dependency +api_version = { workspace = true, features = ["testexport"] } +example_coprocessor_plugin = { workspace = true } # should be a binary dependency hyper-openssl = "0.9" -panic_hook = { path = "components/panic_hook" } +panic_hook = { workspace = true } +raftstore = { workspace = true, features = ["testexport"] } reqwest = { version = "0.11", features = ["blocking"] } -test_sst_importer = { path = "components/test_sst_importer", default-features = false } -test_util = { path = "components/test_util", default-features = false } +test_sst_importer = { workspace = true } +test_util = { workspace = true } tokio = { version = "1.17", features = ["macros", "rt-multi-thread", "time"] } zipf = "6.1.0" @@ -196,16 +207,21 @@ rusoto_mock = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr rusoto_s3 = { git = 
"https://github.com/tikv/rusoto", branch = "gh1482-s3-addr-styles" } rusoto_sts = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr-styles" } +snappy-sys = { git = "https://github.com/busyjay/rust-snappy.git", branch = "static-link" } + # remove this when https://github.com/danburkert/fs2-rs/pull/42 is merged. fs2 = { git = "https://github.com/tabokie/fs2-rs", branch = "tikv" } +# Remove this when a new version is release. We need to solve rust-lang/cmake-rs#143. +cmake = { git = "https://github.com/rust-lang/cmake-rs" } + [target.'cfg(target_os = "linux")'.dependencies] procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229b2c1fcc44118bef7eff127128" } # When you modify TiKV cooperatively with kvproto, this will be useful to submit the PR to TiKV and the PR to # kvproto at the same time. # After the PR to kvproto is merged, remember to comment this out and run `cargo update -p kvproto`. # [patch.'https://github.com/pingcap/kvproto'] -# kvproto = {git = "https://github.com/your_github_id/kvproto", branch="your_branch"} +# kvproto = { git = "https://github.com/your_github_id/kvproto", branch = "your_branch" } [workspace] # See https://github.com/rust-lang/rfcs/blob/master/text/2957-cargo-features2.md @@ -229,11 +245,13 @@ members = [ "components/codec", "components/collections", "components/concurrency_manager", - "components/concurrency_manager", "components/coprocessor_plugin_api", "components/encryption", "components/encryption/export", "components/engine_rocks_helper", +# Only enable tirocks in local development, otherwise it can slow down compilation. +# TODO: always enable tirocks and remove engine_rocks. 
+# "components/engine_tirocks", "components/error_code", "components/external_storage", "components/external_storage/export", @@ -241,21 +259,27 @@ members = [ "components/into_other", "components/keys", "components/log_wrappers", - "components/match_template", "components/online_config", "components/panic_hook", "components/pd_client", + "components/profiler", "components/raftstore", + "components/raftstore-v2", "components/resolved_ts", + "components/resource_control", "components/resource_metering", + "components/security", "components/server", - "components/server", + "components/snap_recovery", "components/sst_importer", "components/test_backup", "components/test_coprocessor", "components/test_coprocessor_plugin/example_plugin", "components/test_pd", + "components/test_pd_client", "components/test_raftstore", + "components/test_raftstore-v2", + "components/test_raftstore_macro", "components/test_sst_importer", "components/test_storage", "components/test_util", @@ -266,8 +290,10 @@ members = [ "components/tidb_query_executors", "components/tidb_query_expr", "components/tikv_alloc", + "components/tikv_kv", "components/tikv_util", "components/tipb_helper", + "components/tracker", "components/txn_types", "fuzz", "fuzz/fuzzer-afl", @@ -277,9 +303,109 @@ members = [ ] default-members = ["cmd/tikv-server", "cmd/tikv-ctl"] +[workspace.dependencies] +api_version = { path = "components/api_version" } +aws = { path = "components/cloud/aws" } +azure = { path = "components/cloud/azure" } +backup = { path = "components/backup", default-features = false } +backup-stream = { path = "components/backup-stream", default-features = false } +batch-system = { path = "components/batch-system" } +case_macros = { path = "components/case_macros" } +causal_ts = { path = "components/causal_ts" } +cdc = { path = "components/cdc", default-features = false } +cloud = { path = "components/cloud" } +codec = { path = "components/codec" } +collections = { path = "components/collections" } 
+concurrency_manager = { path = "components/concurrency_manager" } +coprocessor_plugin_api = { path = "components/coprocessor_plugin_api" } +encryption = { path = "components/encryption" } +encryption_export = { path = "components/encryption/export" } +engine_panic = { path = "components/engine_panic" } +engine_rocks = { path = "components/engine_rocks" } +engine_rocks_helper = { path = "components/engine_rocks_helper" } +engine_test = { path = "components/engine_test", default-features = false } +engine_traits = { path = "components/engine_traits" } +engine_traits_tests = { path = "components/engine_traits_tests", default-features = false } +error_code = { path = "components/error_code" } +external_storage = { path = "components/external_storage" } +external_storage_export = { path = "components/external_storage/export" } +file_system = { path = "components/file_system" } +gcp = { path = "components/cloud/gcp" } +into_other = { path = "components/into_other" } +keys = { path = "components/keys" } +log_wrappers = { path = "components/log_wrappers" } +memory_trace_macros = { path = "components/memory_trace_macros" } +online_config = { path = "components/online_config" } +panic_hook = { path = "components/panic_hook" } +pd_client = { path = "components/pd_client" } +profiler = { path = "components/profiler" } +raft_log_engine = { path = "components/raft_log_engine" } +raftstore = { path = "components/raftstore", default-features = false } +raftstore-v2 = { path = "components/raftstore-v2", default-features = false } +resolved_ts = { path = "components/resolved_ts" } +resource_control = { path = "components/resource_control" } +resource_metering = { path = "components/resource_metering" } +security = { path = "components/security" } +server = { path = "components/server" } +snap_recovery = { path = "components/snap_recovery" } +sst_importer = { path = "components/sst_importer" } +test_backup = { path = "components/test_backup" } +test_coprocessor = { path = 
"components/test_coprocessor", default-features = false } +example_coprocessor_plugin = { path = "components/test_coprocessor_plugin/example_plugin" } +test_pd = { path = "components/test_pd" } +test_pd_client = { path = "components/test_pd_client" } +test_raftstore = { path = "components/test_raftstore", default-features = false } +test_raftstore-v2 = { path = "components/test_raftstore-v2", default-features = false } +test_raftstore_macro = { path = "components/test_raftstore_macro" } +test_sst_importer = { path = "components/test_sst_importer" } +test_storage = { path = "components/test_storage", default-features = false } +test_util = { path = "components/test_util" } +tidb_query_aggr = { path = "components/tidb_query_aggr" } +tidb_query_codegen = { path = "components/tidb_query_codegen" } +tidb_query_common = { path = "components/tidb_query_common" } +tidb_query_datatype = { path = "components/tidb_query_datatype" } +tidb_query_executors = { path = "components/tidb_query_executors" } +tidb_query_expr = { path = "components/tidb_query_expr" } +tikv = { path = ".", default-features = false } +tikv_alloc = { path = "components/tikv_alloc" } +tikv_kv = { path = "components/tikv_kv", default-features = false } +tikv_util = { path = "components/tikv_util" } +tipb_helper = { path = "components/tipb_helper" } +tracker = { path = "components/tracker" } +txn_types = { path = "components/txn_types" } +# External libs +grpcio = { version = "0.10.4", default-features = false, features = ["openssl-vendored", "protobuf-codec", "nightly"] } +grpcio-health = { version = "0.10.4", default-features = false, features = ["protobuf-codec"] } +tipb = { git = "https://github.com/pingcap/tipb.git" } +kvproto = { git = "https://github.com/pingcap/kvproto.git" } +yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +slog = { version = "2.3", features = ["max_level_trace", 
"release_max_level_debug"] } +slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } + +[profile.dev.package.grpcio-sys] +debug = false +opt-level = 1 + +[profile.dev.package.librocksdb_sys] +debug = false +opt-level = 1 + +[profile.dev.package.libtitan_sys] +debug = false +opt-level = 1 + +[profile.dev.package.tirocks-sys] +debug = false +opt-level = 1 + +[profile.dev.package.tests] +debug = 1 +opt-level = 1 + [profile.dev] opt-level = 0 -debug = true +debug = 0 codegen-units = 4 lto = false incremental = true @@ -305,7 +431,7 @@ codegen-units = 4 [profile.test] opt-level = 0 -debug = true +debug = 0 codegen-units = 16 lto = false incremental = true diff --git a/Dockerfile b/Dockerfile index eca69ce3b8d..aefa51b2222 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,6 +50,11 @@ RUN ln -s /usr/bin/cmake3 /usr/bin/cmake ENV LIBRARY_PATH /usr/local/lib:$LIBRARY_PATH ENV LD_LIBRARY_PATH /usr/local/lib:$LD_LIBRARY_PATH +# Install protoc +RUN curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.15.8/protoc-3.15.8-linux-x86_64.zip" +RUN unzip protoc-3.15.8-linux-x86_64.zip -d /usr/local/ +ENV PATH /usr/local/bin/:$PATH + # Install Rustup RUN curl https://sh.rustup.rs -sSf | sh -s -- --no-modify-path --default-toolchain none -y ENV PATH /root/.cargo/bin/:$PATH @@ -72,8 +77,7 @@ RUN mkdir -p ./cmd/tikv-ctl/src ./cmd/tikv-server/src && \ echo 'fn main() {}' > ./cmd/tikv-ctl/src/main.rs && \ echo 'fn main() {}' > ./cmd/tikv-server/src/main.rs && \ for cargotoml in $(find . -type f -name "Cargo.toml"); do \ - sed -i '/fuzz/d' ${cargotoml} && \ - sed -i '/profiler/d' ${cargotoml} ; \ + sed -i '/fuzz/d' ${cargotoml} ; \ done COPY Makefile ./ @@ -105,6 +109,10 @@ FROM pingcap/alpine-glibc COPY --from=builder /tikv/target/release/tikv-server /tikv-server COPY --from=builder /tikv/target/release/tikv-ctl /tikv-ctl +# FIXME: Figure out why libstdc++ is not statically linked. 
+RUN apk add --no-cache \ + curl libstdc++ + EXPOSE 20160 20180 ENTRYPOINT ["/tikv-server"] diff --git a/Makefile b/Makefile index a41055f7430..6e8cada8b6f 100644 --- a/Makefile +++ b/Makefile @@ -311,6 +311,14 @@ run: # Run tests under a variety of conditions. This should pass before # submitting pull requests. test: + ./scripts/test-all -- --nocapture + +# Run tests with nextest. +ifndef CUSTOM_TEST_COMMAND +test_with_nextest: export CUSTOM_TEST_COMMAND=nextest run +endif +test_with_nextest: export RUSTDOCFLAGS="-Z unstable-options --persist-doctests" +test_with_nextest: ./scripts/test-all ## Static analysis @@ -322,11 +330,11 @@ unset-override: pre-format: unset-override @rustup component add rustfmt - @cargo install -q cargo-sort + @which cargo-sort &> /dev/null || cargo install -q cargo-sort format: pre-format @cargo fmt - @cargo sort -w ./Cargo.toml ./*/Cargo.toml components/*/Cargo.toml cmd/*/Cargo.toml >/dev/null + @cargo sort -w -c &>/dev/null || cargo sort -w >/dev/null doc: @cargo doc --workspace --document-private-items \ @@ -339,6 +347,7 @@ pre-clippy: unset-override clippy: pre-clippy @./scripts/check-redact-log @./scripts/check-docker-build + @./scripts/check-license @./scripts/clippy-all pre-audit: diff --git a/README.md b/README.md index b9a2d9d9519..4b3e7e6c397 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![Coverage Status](https://codecov.io/gh/tikv/tikv/branch/master/graph/badge.svg)](https://codecov.io/gh/tikv/tikv) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/2574/badge)](https://bestpractices.coreinfrastructure.org/projects/2574) -TiKV is an open-source, distributed, and transactional key-value database. Unlike other traditional NoSQL systems, TiKV not only provides classical key-value APIs, but also transactional APIs with ACID compliance. 
Built in Rust and powered by Raft, TiKV was originally created to complement [TiDB](https://github.com/pingcap/tidb), a distributed HTAP database compatible with the MySQL protocol. +TiKV is an open-source, distributed, and transactional key-value database. Unlike other traditional NoSQL systems, TiKV not only provides classical key-value APIs, but also transactional APIs with ACID compliance. Built in Rust and powered by Raft, TiKV was originally created by [PingCAP](https://en.pingcap.com) to complement [TiDB](https://github.com/pingcap/tidb), a distributed HTAP database compatible with the MySQL protocol. The design of TiKV ('Ti' stands for titanium) is inspired by some great distributed systems from Google, such as BigTable, Spanner, and Percolator, and some of the latest achievements in academia in recent years, such as the Raft consensus algorithm. @@ -134,10 +134,6 @@ See [CONTRIBUTING.md](./CONTRIBUTING.md). ## Client drivers -Currently, the interfaces to TiKV are the [TiDB Go client](https://github.com/pingcap/tidb/tree/master/store/tikv) and the [TiSpark Java client](https://github.com/pingcap/tispark/tree/master/tikv-client/src/main/java/com/pingcap/tikv). - -These are the clients for TiKV: - - [Go](https://github.com/tikv/client-go) (The most stable and widely used) - [Java](https://github.com/tikv/client-java) - [Rust](https://github.com/tikv/client-rust) @@ -155,7 +151,7 @@ A third-party security auditing was performed by Cure53. See the full report [he To report a security vulnerability, please send an email to [TiKV-security](mailto:tikv-security@lists.cncf.io) group. -See [Security](./security/SECURITY.md) for the process and policy followed by the TiKV project. +See [Security](SECURITY.md) for the process and policy followed by the TiKV project. 
## Communication diff --git a/security/SECURITY.md b/SECURITY.md similarity index 98% rename from security/SECURITY.md rename to SECURITY.md index 353a70f039f..30be9e0daf0 100644 --- a/security/SECURITY.md +++ b/SECURITY.md @@ -18,6 +18,8 @@ The following are the versions that we support for security updates | Version | Supported | | ------- | ------------------ | +| 6.x | :white_check_mark: | +| 5.x | :white_check_mark: | | 4.x | :white_check_mark: | | 3.x | :white_check_mark: | | 2.x | :white_check_mark: | @@ -94,4 +96,4 @@ IvCICV7zG1cyuM/Z2Y7/TJ+upvahP46nM3s3G15b8FYuTSmRN1Kp9+mBt2BHqOy1 ulx+VF4Lf9n3ydf593Nha9bMJ/rnSp01 =XbYK -----END PGP PUBLIC KEY BLOCK----- -``` \ No newline at end of file +``` diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 00000000000..1530b3cb60b --- /dev/null +++ b/clippy.toml @@ -0,0 +1,11 @@ +disallowed-methods = [ + { path = "std::thread::Builder::spawn", reason = "Wrapper function `::spawn_wrapper` should be used instead, refer to https://github.com/tikv/tikv/pull/12442 for more details." }, + + { path = "tokio::runtime::builder::Builder::on_thread_start", reason = "Wrapper function `::after_start_wrapper` should be used instead, refer to https://github.com/tikv/tikv/pull/12442 for more details." }, + { path = "tokio::runtime::builder::Builder::on_thread_stop", reason = "Wrapper function `::before_stop_wrapper` should be used instead, refer to https://github.com/tikv/tikv/pull/12442 for more details." }, + + { path = "futures_executor::thread_pool::ThreadPoolBuilder::after_start", reason = "Wrapper function `::after_start_wrapper` should be used instead, refer to https://github.com/tikv/tikv/pull/12442 for more details." }, + { path = "futures_executor::thread_pool::ThreadPoolBuilder::before_stop", reason = "Wrapper function `::before_stop_wrapper` should be used instead, refer to https://github.com/tikv/tikv/pull/12442 for more details." 
}, +] +avoid-breaking-exported-api = false +upper-case-acronyms-aggressive = true diff --git a/cmd/build.rs b/cmd/build.rs index ef751a71feb..c19797d9227 100644 --- a/cmd/build.rs +++ b/cmd/build.rs @@ -32,7 +32,9 @@ fn link_sys_lib(lib: &str, tool: &cc::Tool) { } // remove lib prefix and .a postfix. let libname = &lib[3..lib.len() - 2]; - println!("cargo:rustc-link-lib=static={}", &libname); + // Get around the issue "the linking modifiers `+bundle` and `+whole-archive` + // are not compatible with each other when generating rlibs" + println!("cargo:rustc-link-lib=static:-bundle,+whole-archive={}", &libname); println!( "cargo:rustc-link-search=native={}", path.parent().unwrap().display() diff --git a/cmd/tikv-ctl/Cargo.toml b/cmd/tikv-ctl/Cargo.toml index 9292df06fca..718d760e3d4 100644 --- a/cmd/tikv-ctl/Cargo.toml +++ b/cmd/tikv-ctl/Cargo.toml @@ -45,54 +45,53 @@ test-engines-panic = [ nortcheck = ["engine_rocks/nortcheck"] [dependencies] -backup = { path = "../../components/backup", default-features = false } -cdc = { path = "../../components/cdc", default-features = false } +backup = { workspace = true } +cdc = { workspace = true } chrono = "0.4" clap = "2.32" -collections = { path = "../../components/collections" } -concurrency_manager = { path = "../../components/concurrency_manager", default-features = false } +collections = { workspace = true } +concurrency_manager = { workspace = true } crossbeam = "0.8" -encryption_export = { path = "../../components/encryption/export", default-features = false } -engine_rocks = { path = "../../components/engine_rocks", default-features = false } -engine_traits = { path = "../../components/engine_traits", default-features = false } -error_code = { path = "../../components/error_code", default-features = false } -file_system = { path = "../../components/file_system", default-features = false } +encryption_export = { workspace = true } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } +error_code = 
{ workspace = true } +file_system = { workspace = true } futures = "0.3" gag = "1.0" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } hex = "0.4" -keys = { path = "../../components/keys", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +keys = { workspace = true } +kvproto = { workspace = true } libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } -log_wrappers = { path = "../../components/log_wrappers" } -nix = "0.23" -pd_client = { path = "../../components/pd_client", default-features = false } +log_wrappers = { workspace = true } +pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft-engine-ctl = { git = "https://github.com/tikv/raft-engine.git" } -raft_log_engine = { path = "../../components/raft_log_engine", default-features = false } -raftstore = { path = "../../components/raftstore", default-features = false } +raft_log_engine = { workspace = true } +raftstore = { workspace = true } rand = "0.8" regex = "1" -security = { path = "../../components/security", default-features = false } +security = { workspace = true } serde_json = "1.0" -server = { path = "../../components/server" } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +server = { workspace = true } +slog = { workspace = true } +slog-global = { workspace = true } structopt = "0.3" tempfile = "3.0" -tikv = { path = "../../", default-features = false } -tikv_alloc = { path = "../../components/tikv_alloc" } -tikv_util = { path = "../../components/tikv_util", default-features = 
false } +tikv = { workspace = true } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread", "time"] } toml = "0.5" -txn_types = { path = "../../components/txn_types", default-features = false } +txn_types = { workspace = true } [build-dependencies] cc = "1.0" time = "0.1" [target.'cfg(unix)'.dependencies] -signal = "0.6" +signal-hook = "0.3" diff --git a/cmd/tikv-ctl/src/cmd.rs b/cmd/tikv-ctl/src/cmd.rs index a1934c1acb8..42678386f5a 100644 --- a/cmd/tikv-ctl/src/cmd.rs +++ b/cmd/tikv-ctl/src/cmd.rs @@ -1,13 +1,13 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use std::{borrow::ToOwned, lazy::SyncLazy, str, string::ToString, u64}; +use std::{borrow::ToOwned, str, string::ToString, sync::LazyLock, u64}; use clap::{crate_authors, AppSettings}; use engine_traits::CF_DEFAULT; use structopt::StructOpt; const RAW_KEY_HINT: &str = "Raw key (generally starts with \"z\") in escaped form"; -static VERSION_INFO: SyncLazy = SyncLazy::new(|| { +static VERSION_INFO: LazyLock = LazyLock::new(|| { let build_timestamp = option_env!("TIKV_BUILD_TIME"); tikv::tikv_version_info(build_timestamp) }); @@ -373,7 +373,8 @@ pub enum Cmd { /// Skip write RocksDB read_only: bool, }, - /// Unsafely recover when the store can not start normally, this recover may lose data + /// Unsafely recover when the store can not start normally, this recover may + /// lose data UnsafeRecover { #[structopt(subcommand)] cmd: UnsafeRecoverCmd, @@ -404,7 +405,9 @@ pub enum Cmd { default_value = crate::executor::METRICS_PROMETHEUS, possible_values = &["prometheus", "jemalloc", "rocksdb_raft", "rocksdb_kv"], )] - /// Set the metrics tag, one of prometheus/jemalloc/rocksdb_raft/rocksdb_kv, if not specified, print prometheus + /// Set the metrics tag + /// Options: prometheus/jemalloc/rocksdb_raft/rocksdb_kv + /// If not specified, print prometheus tag: Vec, }, /// Force a consistency-check for a specified region @@ 
-415,10 +418,13 @@ pub enum Cmd { }, /// Get all regions with corrupt raft BadRegions {}, - /// Modify tikv config, eg. tikv-ctl --host ip:port modify-tikv-config -n rocksdb.defaultcf.disable-auto-compactions -v true + /// Modify tikv config. + /// Eg. tikv-ctl --host ip:port modify-tikv-config -n + /// rocksdb.defaultcf.disable-auto-compactions -v true ModifyTikvConfig { #[structopt(short = "n")] - /// The config name are same as the name used on config file, eg. raftstore.messages-per-tick, raftdb.max-background-jobs + /// The config name are same as the name used on config file. + /// eg. raftstore.messages-per-tick, raftdb.max-background-jobs config_name: String, #[structopt(short = "v")] @@ -431,7 +437,8 @@ pub enum Cmd { /// Output meta file path file: String, }, - /// Compact the whole cluster in a specified range in one or more column families + /// Compact the whole cluster in a specified range in one or more column + /// families CompactCluster { #[structopt( short = "d", @@ -449,7 +456,8 @@ pub enum Cmd { default_value = CF_DEFAULT, possible_values = &["default", "lock", "write"], )] - /// Column family names, for kv db, combine from default/lock/write; for raft db, can only be default + /// Column family names, for kv db, combine from default/lock/write; for + /// raft db, can only be default cf: Vec, #[structopt( @@ -529,16 +537,32 @@ pub enum Cmd { #[structopt(subcommand)] cmd: EncryptionMetaCmd, }, + /// Delete encryption keys that are no longer associated with physical + /// files. 
+ CleanupEncryptionMeta {}, /// Print bad ssts related infos BadSsts { #[structopt(long)] - /// specify manifest, if not set, it will look up manifest file in db path + /// specify manifest, if not set, it will look up manifest file in db + /// path manifest: Option, #[structopt(long, value_delimiter = ",")] /// PD endpoints pd: String, }, + /// Reset data in a TiKV to a certain version + ResetToVersion { + #[structopt(short = "v")] + /// The version to reset TiKV to + version: u64, + }, + /// Control for Raft Engine + /// Usage: tikv-ctl raft-engine-ctl -- --help + RaftEngineCtl { + #[structopt(last = true)] + args: Vec, + }, #[structopt(external_subcommand)] External(Vec), } @@ -562,13 +586,14 @@ pub enum RaftCmd { help = RAW_KEY_HINT, )] key: Option, + #[structopt(short = "b")] + binary: bool, }, /// print region info Region { #[structopt( short = "r", aliases = &["region"], - required_unless = "all-regions", conflicts_with = "all-regions", use_delimiter = true, require_delimiter = true, @@ -580,10 +605,22 @@ pub enum RaftCmd { // `regions` must be None when `all_regions` is present, // so we left `all_regions` unused. #[allow(dead_code)] - #[structopt(long, required_unless = "regions", conflicts_with = "regions")] + #[structopt(long, conflicts_with = "regions")] /// Print info for all regions all_regions: bool, + #[structopt(long, default_value = "")] + /// hex start key + start: String, + + #[structopt(long, default_value = "")] + /// hex end key + end: String, + + #[structopt(long, default_value = "16")] + /// Limit the number of keys to scan + limit: usize, + #[structopt(long)] /// Skip tombstone regions skip_tombstone: bool, @@ -594,7 +631,8 @@ pub enum RaftCmd { pub enum FailCmd { /// Inject failures Inject { - /// Inject fail point and actions pairs. E.g. tikv-ctl fail inject a=off b=panic + /// Inject fail point and actions pairs. + /// E.g. 
tikv-ctl fail inject a=off b=panic args: Vec, #[structopt(short = "f")] diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index 96b322936bc..df095e44425 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -1,22 +1,18 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. use std::{ - borrow::ToOwned, cmp::Ordering, path::PathBuf, pin::Pin, str, string::ToString, sync::Arc, - time::Duration, u64, + borrow::ToOwned, cmp::Ordering, pin::Pin, str, string::ToString, sync::Arc, time::Duration, u64, }; use encryption_export::data_key_manager_from_config; -use engine_rocks::{ - raw_util::{db_exist, new_engine_opt}, - RocksEngine, -}; +use engine_rocks::util::{db_exist, new_engine_opt}; use engine_traits::{ Engines, Error as EngineError, RaftEngine, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS, }; use futures::{executor::block_on, future, stream, Stream, StreamExt, TryStreamExt}; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ - debugpb::{Db as DBType, *}, + debugpb::{Db as DbType, *}, kvrpcpb::MvccInfo, metapb::{Peer, Region}, raft_cmdpb::RaftCmdRequest, @@ -26,12 +22,15 @@ use pd_client::{Config as PdConfig, PdClient, RpcClient}; use protobuf::Message; use raft::eraftpb::{ConfChange, ConfChangeV2, Entry, EntryType}; use raft_log_engine::RaftLogEngine; -use raftstore::store::INIT_EPOCH_CONF_VER; +use raftstore::store::{util::build_key_range, INIT_EPOCH_CONF_VER}; use security::SecurityManager; use serde_json::json; use tikv::{ - config::{ConfigController, TiKvConfig}, - server::debug::{BottommostLevelCompaction, Debugger, RegionInfo}, + config::{ConfigController, TikvConfig}, + server::{ + debug::{BottommostLevelCompaction, Debugger, RegionInfo}, + KvEngineFactoryBuilder, + }, }; use tikv_util::escape; @@ -46,9 +45,8 @@ pub const LOCK_FILE_ERROR: &str = "IO error: While lock file"; type MvccInfoStream = Pin, MvccInfo), String>>>>; pub fn new_debug_executor( - cfg: &TiKvConfig, + cfg: 
&TikvConfig, data_dir: Option<&str>, - skip_paranoid_checks: bool, host: Option<&str>, mgr: Arc, ) -> Box { @@ -58,38 +56,31 @@ pub fn new_debug_executor( // TODO: perhaps we should allow user skip specifying data path. let data_dir = data_dir.unwrap(); - let kv_path = cfg.infer_kv_engine_path(Some(data_dir)).unwrap(); let key_manager = data_key_manager_from_config(&cfg.security.encryption, &cfg.storage.data_dir) .unwrap() .map(Arc::new); - let cache = cfg.storage.block_cache.build_shared_cache(); - let shared_block_cache = cache.is_some(); + let cache = cfg + .storage + .block_cache + .build_shared_cache(cfg.storage.engine); let env = cfg - .build_shared_rocks_env(key_manager.clone(), None /*io_rate_limiter*/) + .build_shared_rocks_env(key_manager.clone(), None /* io_rate_limiter */) .unwrap(); - let mut kv_db_opts = cfg.rocksdb.build_opt(); - kv_db_opts.set_env(env.clone()); - kv_db_opts.set_paranoid_checks(!skip_paranoid_checks); - let kv_cfs_opts = cfg - .rocksdb - .build_cf_opts(&cache, None, cfg.storage.api_version()); - let kv_path = PathBuf::from(kv_path).canonicalize().unwrap(); - let kv_path = kv_path.to_str().unwrap(); - let kv_db = match new_engine_opt(kv_path, kv_db_opts, kv_cfs_opts) { + let factory = KvEngineFactoryBuilder::new(env.clone(), cfg, cache) + .lite(true) + .build(); + let kv_db = match factory.create_shared_db(data_dir) { Ok(db) => db, Err(e) => handle_engine_error(e), }; - let mut kv_db = RocksEngine::from_db(Arc::new(kv_db)); - kv_db.set_shared_block_cache(shared_block_cache); let cfg_controller = ConfigController::default(); if !cfg.raft_engine.enable { - let mut raft_db_opts = cfg.raftdb.build_opt(); - raft_db_opts.set_env(env); - let raft_db_cf_opts = cfg.raftdb.build_cf_opts(&cache); + let raft_db_opts = cfg.raftdb.build_opt(env, None); + let raft_db_cf_opts = cfg.raftdb.build_cf_opts(factory.block_cache()); let raft_path = cfg.infer_raft_db_path(Some(data_dir)).unwrap(); if !db_exist(&raft_path) { error!("raft db not exists: {}", 
raft_path); @@ -99,8 +90,6 @@ pub fn new_debug_executor( Ok(db) => db, Err(e) => handle_engine_error(e), }; - let mut raft_db = RocksEngine::from_db(Arc::new(raft_db)); - raft_db.set_shared_block_cache(shared_block_cache); let debugger = Debugger::new(Engines::new(kv_db, raft_db), cfg_controller); Box::new(debugger) as Box } else { @@ -110,7 +99,7 @@ pub fn new_debug_executor( error!("raft engine not exists: {}", config.dir); tikv_util::logger::exit_process_gracefully(-1); } - let raft_db = RaftLogEngine::new(config, key_manager, None /*io_rate_limiter*/).unwrap(); + let raft_db = RaftLogEngine::new(config, key_manager, None /* io_rate_limiter */).unwrap(); let debugger = Debugger::new(Engines::new(kv_db, raft_db), cfg_controller); Box::new(debugger) as Box } @@ -156,17 +145,38 @@ pub trait DebugExecutor { println!("total region size: {}", convert_gbmb(total_size as u64)); } - fn dump_region_info(&self, region_ids: Option>, skip_tombstone: bool) { + fn dump_region_info( + &self, + region_ids: Option>, + start_key: &[u8], + end_key: &[u8], + limit: usize, + skip_tombstone: bool, + ) { let region_ids = region_ids.unwrap_or_else(|| self.get_all_regions_in_store()); let mut region_objects = serde_json::map::Map::new(); for region_id in region_ids { + if limit > 0 && region_objects.len() >= limit { + break; + } let r = self.get_region_info(region_id); if skip_tombstone { let region_state = r.region_local_state.as_ref(); if region_state.map_or(false, |s| s.get_state() == PeerState::Tombstone) { - return; + continue; } } + let region = r + .region_local_state + .as_ref() + .map(|s| s.get_region().clone()) + .unwrap(); + if !check_intersect_of_range( + &build_key_range(region.get_start_key(), region.get_end_key(), false), + &build_key_range(start_key, end_key, false), + ) { + continue; + } let region_object = json!({ "region_id": region_id, "region_local_state": r.region_local_state.map(|s| { @@ -223,7 +233,7 @@ pub trait DebugExecutor { ); } - fn dump_raft_log(&self, 
region: u64, index: u64) { + fn dump_raft_log(&self, region: u64, index: u64, binary: bool) { let idx_key = keys::raft_log_key(region, index); println!("idx_key: {}", escape(&idx_key)); println!("region: {}", region); @@ -238,6 +248,11 @@ pub trait DebugExecutor { return; } + if binary { + println!("data: \n{}", hex::encode_upper(&data)); + return; + } + match entry.get_entry_type() { EntryType::EntryNormal => { let mut msg = RaftCmdRequest::default(); @@ -364,10 +379,10 @@ pub trait DebugExecutor { region: u64, to_host: Option<&str>, to_data_dir: Option<&str>, - to_config: &TiKvConfig, + to_config: &TikvConfig, mgr: Arc, ) { - let rhs_debug_executor = new_debug_executor(to_config, to_data_dir, false, to_host, mgr); + let rhs_debug_executor = new_debug_executor(to_config, to_data_dir, to_host, mgr); let r1 = self.get_region_info(region); let r2 = rhs_debug_executor.get_region_info(region); @@ -469,7 +484,7 @@ pub trait DebugExecutor { fn compact( &self, address: Option<&str>, - db: DBType, + db: DbType, cf: &str, from: Option>, to: Option>, @@ -492,7 +507,7 @@ pub trait DebugExecutor { fn compact_region( &self, address: Option<&str>, - db: DBType, + db: DbType, cf: &str, region_id: u64, threads: u32, @@ -609,7 +624,7 @@ pub trait DebugExecutor { fn do_compaction( &self, - db: DBType, + db: DbType, cf: &str, from: &[u8], to: &[u8], @@ -654,7 +669,7 @@ impl DebugExecutor for DebugClient { fn get_value_by_key(&self, cf: &str, key: Vec) -> Vec { let mut req = GetRequest::default(); - req.set_db(DBType::Kv); + req.set_db(DbType::Kv); req.set_cf(cf.to_owned()); req.set_key(key); self.get(&req) @@ -723,7 +738,7 @@ impl DebugExecutor for DebugClient { fn do_compaction( &self, - db: DBType, + db: DbType, cf: &str, from: &[u8], to: &[u8], @@ -863,7 +878,7 @@ impl DebugExecutor for Debugger { } fn get_value_by_key(&self, cf: &str, key: Vec) -> Vec { - self.get(DBType::Kv, cf, &key) + self.get(DbType::Kv, cf, &key) .unwrap_or_else(|e| perror_and_exit("Debugger::get", e)) } @@ 
-871,7 +886,7 @@ impl DebugExecutor for Debugger { self.region_size(region, cfs) .unwrap_or_else(|e| perror_and_exit("Debugger::region_size", e)) .into_iter() - .map(|(cf, size)| (cf.to_owned(), size as usize)) + .map(|(cf, size)| (cf.to_owned(), size)) .collect() } @@ -907,7 +922,7 @@ impl DebugExecutor for Debugger { fn do_compaction( &self, - db: DBType, + db: DbType, cf: &str, from: &[u8], to: &[u8], @@ -1090,8 +1105,8 @@ impl DebugExecutor for Debugger { fn handle_engine_error(err: EngineError) -> ! { error!("error while open kvdb: {}", err); - if let EngineError::Engine(msg) = err { - if msg.starts_with(LOCK_FILE_ERROR) { + if let EngineError::Engine(s) = err { + if s.state().contains(LOCK_FILE_ERROR) { error!( "LOCK file conflict indicates TiKV process is running. \ Do NOT delete the LOCK file and force the command to run. \ diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 8ada0c7a426..f547a2cee3a 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -23,8 +23,8 @@ use std::{ }; use encryption_export::{ - create_backend, data_key_manager_from_config, encryption_method_from_db_encryption_method, - DataKeyManager, DecrypterReader, Iv, + create_backend, data_key_manager_from_config, from_engine_encryption_method, DataKeyManager, + DecrypterReader, Iv, }; use engine_rocks::get_env; use engine_traits::EncryptionKeyManager; @@ -33,7 +33,7 @@ use futures::executor::block_on; use gag::BufferRedirect; use grpcio::{CallOption, ChannelBuilder, Environment}; use kvproto::{ - debugpb::{Db as DBType, *}, + debugpb::{Db as DbType, *}, encryptionpb::EncryptionMethod, kvrpcpb::SplitRegionRequest, raft_serverpb::SnapshotMeta, @@ -41,11 +41,12 @@ use kvproto::{ }; use pd_client::{Config as PdConfig, PdClient, RpcClient}; use protobuf::Message; +use raft_log_engine::ManagedFileSystem; use regex::Regex; use security::{SecurityConfig, SecurityManager}; use structopt::{clap::ErrorKind, StructOpt}; -use tikv::{config::TiKvConfig, 
server::debug::BottommostLevelCompaction}; -use tikv_util::{escape, run_and_wait_child_process, unescape}; +use tikv::{config::TikvConfig, server::debug::BottommostLevelCompaction}; +use tikv_util::{escape, run_and_wait_child_process, sys::thread::StdThreadBuildWrapper, unescape}; use txn_types::Key; use crate::{cmd::*, executor::*, util::*}; @@ -58,14 +59,16 @@ fn main() { // Initialize configuration and security manager. let cfg_path = opt.config.as_ref(); - let cfg = cfg_path.map_or_else( + let mut cfg = cfg_path.map_or_else( || { - let mut cfg = TiKvConfig::default(); - cfg.log.level = tikv_util::logger::get_level_by_string("warn").unwrap(); + let mut cfg = TikvConfig::default(); + cfg.log.level = tikv_util::logger::get_level_by_string("warn") + .unwrap() + .into(); cfg }, |path| { - let s = fs::read_to_string(&path).unwrap(); + let s = fs::read_to_string(path).unwrap(); toml::from_str(&s).unwrap() }, ); @@ -99,10 +102,19 @@ fn main() { match args[0].as_str() { "ldb" => run_ldb_command(args, &cfg), "sst_dump" => run_sst_dump_command(args, &cfg), - "raft-engine-ctl" => run_raft_engine_ctl_command(args), _ => Opt::clap().print_help().unwrap(), } } + Cmd::RaftEngineCtl { args } => { + let key_manager = + data_key_manager_from_config(&cfg.security.encryption, &cfg.storage.data_dir) + .expect("data_key_manager_from_config should success"); + let file_system = Arc::new(ManagedFileSystem::new( + key_manager.map(|m| Arc::new(m)), + None, + )); + raft_engine_ctl::run_command(args, file_system); + } Cmd::BadSsts { manifest, pd } => { let data_dir = opt.data_dir.as_deref(); assert!(data_dir.is_some(), "--data-dir must be specified"); @@ -139,7 +151,7 @@ fn main() { let infile1 = Path::new(infile).canonicalize().unwrap(); let file_info = key_manager.get_file(infile1.to_str().unwrap()).unwrap(); - let mthd = encryption_method_from_db_encryption_method(file_info.method); + let mthd = from_engine_encryption_method(file_info.method); if mthd == EncryptionMethod::Plaintext { 
println!( "{} is not encrypted, skip to decrypt it into {}", @@ -157,7 +169,7 @@ fn main() { .unwrap(); let iv = Iv::from_slice(&file_info.iv).unwrap(); - let f = File::open(&infile).unwrap(); + let f = File::open(infile).unwrap(); let mut reader = DecrypterReader::new(f, mthd, &file_info.key, iv).unwrap(); io::copy(&mut reader, &mut outf).unwrap(); @@ -184,6 +196,19 @@ fn main() { DataKeyManager::dump_file_dict(&cfg.storage.data_dir, path.as_deref()).unwrap(); } }, + Cmd::CleanupEncryptionMeta {} => { + let key_manager = + match data_key_manager_from_config(&cfg.security.encryption, &cfg.storage.data_dir) + .expect("data_key_manager_from_config should success") + { + Some(mgr) => mgr, + None => { + println!("Encryption is disabled"); + return; + } + }; + key_manager.retain_encrypted_files(|fname| Path::new(fname).exists()) + } Cmd::CompactCluster { db, cf, @@ -193,7 +218,7 @@ fn main() { bottommost, } => { let pd_client = get_pd_rpc_client(opt.pd, Arc::clone(&mgr)); - let db_type = if db == "kv" { DBType::Kv } else { DBType::Raft }; + let db_type = if db == "kv" { DbType::Kv } else { DbType::Raft }; let cfs = cf.iter().map(|s| s.as_ref()).collect(); let from_key = from.map(|k| unescape(&k)); let to_key = to.map(|k| unescape(&k)); @@ -224,9 +249,8 @@ fn main() { .exit(); } - let skip_paranoid_checks = opt.skip_paranoid_checks; - let debug_executor = - new_debug_executor(&cfg, data_dir, skip_paranoid_checks, host, Arc::clone(&mgr)); + cfg.rocksdb.paranoid_checks = Some(!opt.skip_paranoid_checks); + let debug_executor = new_debug_executor(&cfg, data_dir, host, Arc::clone(&mgr)); match cmd { Cmd::Print { cf, key } => { @@ -234,7 +258,12 @@ fn main() { debug_executor.dump_value(&cf, key); } Cmd::Raft { cmd: subcmd } => match subcmd { - RaftCmd::Log { region, index, key } => { + RaftCmd::Log { + region, + index, + key, + binary, + } => { let (id, index) = if let Some(key) = key.as_deref() { keys::decode_raft_log_key(&unescape(key)).unwrap() } else { @@ -242,14 +271,25 
@@ fn main() { let index = index.unwrap(); (id, index) }; - debug_executor.dump_raft_log(id, index); + debug_executor.dump_raft_log(id, index, binary); } RaftCmd::Region { regions, skip_tombstone, + start, + end, + limit, .. } => { - debug_executor.dump_region_info(regions, skip_tombstone); + let start_key = from_hex(&start).unwrap(); + let end_key = from_hex(&end).unwrap(); + debug_executor.dump_region_info( + regions, + &start_key, + &end_key, + limit, + skip_tombstone, + ); } }, Cmd::Size { region, cf } => { @@ -307,8 +347,8 @@ fn main() { } => { let to_data_dir = to_data_dir.as_deref(); let to_host = to_host.as_deref(); - let to_config = to_config.map_or_else(TiKvConfig::default, |path| { - let s = fs::read_to_string(&path).unwrap(); + let to_config = to_config.map_or_else(TikvConfig::default, |path| { + let s = fs::read_to_string(path).unwrap(); toml::from_str(&s).unwrap() }); debug_executor.diff_region(region, to_host, to_data_dir, &to_config, mgr); @@ -322,7 +362,7 @@ fn main() { threads, bottommost, } => { - let db_type = if db == "kv" { DBType::Kv } else { DBType::Raft }; + let db_type = if db == "kv" { DbType::Kv } else { DbType::Raft }; let from_key = from.map(|k| unescape(&k)); let to_key = to.map(|k| unescape(&k)); let bottommost = BottommostLevelCompaction::from(Some(bottommost.as_ref())); @@ -479,6 +519,7 @@ fn main() { Cmd::Cluster {} => { debug_executor.dump_cluster_info(); } + Cmd::ResetToVersion { version } => debug_executor.reset_to_version(version), _ => { unreachable!() } @@ -582,9 +623,9 @@ fn split_region(pd_client: &RpcClient, mgr: Arc, region_id: u64 fn compact_whole_cluster( pd_client: &RpcClient, - cfg: &TiKvConfig, + cfg: &TikvConfig, mgr: Arc, - db_type: DBType, + db_type: DbType, cfs: Vec<&str>, from: Option>, to: Option>, @@ -604,9 +645,9 @@ fn compact_whole_cluster( let cfs: Vec = cfs.iter().map(|cf| cf.to_string()).collect(); let h = thread::Builder::new() .name(format!("compact-{}", addr)) - .spawn(move || { + .spawn_wrapper(move 
|| { tikv_alloc::add_thread_memory_accessor(); - let debug_executor = new_debug_executor(&cfg, None, false, Some(&addr), mgr); + let debug_executor = new_debug_executor(&cfg, None, Some(&addr), mgr); for cf in cfs { debug_executor.compact( Some(&addr), @@ -645,27 +686,24 @@ fn read_fail_file(path: &str) -> Vec<(String, String)> { list } -fn run_ldb_command(args: Vec, cfg: &TiKvConfig) { +fn build_rocks_opts(cfg: &TikvConfig) -> engine_rocks::RocksDbOptions { let key_manager = data_key_manager_from_config(&cfg.security.encryption, &cfg.storage.data_dir) .unwrap() .map(Arc::new); - let env = get_env(key_manager, None /*io_rate_limiter*/).unwrap(); - let mut opts = cfg.rocksdb.build_opt(); - opts.set_env(env); - - engine_rocks::raw::run_ldb_tool(&args, &opts); + let env = get_env(key_manager, None /* io_rate_limiter */).unwrap(); + let resource = cfg.rocksdb.build_resources(env); + cfg.rocksdb.build_opt(&resource, cfg.storage.engine) } -fn run_sst_dump_command(args: Vec, cfg: &TiKvConfig) { - let opts = cfg.rocksdb.build_opt(); - engine_rocks::raw::run_sst_dump_tool(&args, &opts); +fn run_ldb_command(args: Vec, cfg: &TikvConfig) { + engine_rocks::raw::run_ldb_tool(&args, &build_rocks_opts(cfg)); } -fn run_raft_engine_ctl_command(args: Vec) { - raft_engine_ctl::run_command(args); +fn run_sst_dump_command(args: Vec, cfg: &TikvConfig) { + engine_rocks::raw::run_sst_dump_tool(&args, &build_rocks_opts(cfg)); } -fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, cfg: &TiKvConfig) { +fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, cfg: &TikvConfig) { let db = &cfg.infer_kv_engine_path(Some(data_dir)).unwrap(); println!( "\nstart to print bad ssts; data_dir:{}; db:{}", @@ -681,7 +719,7 @@ fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, let stderr = BufferRedirect::stderr().unwrap(); let stdout = BufferRedirect::stdout().unwrap(); - let opts = cfg.rocksdb.build_opt(); + let opts = 
build_rocks_opts(cfg); match run_and_wait_child_process(|| engine_rocks::raw::run_sst_dump_tool(&args, &opts)) { Ok(code) => { @@ -713,7 +751,9 @@ fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, for line in corruptions.lines() { println!("--------------------------------------------------------"); // The corruption format may like this: + // ```text // /path/to/db/057155.sst is corrupted: Corruption: block checksum mismatch: expected 3754995957, got 708533950 in /path/to/db/057155.sst offset 3126049 size 22724 + // ``` println!("corruption info:\n{}", line); let r = Regex::new(r"/\w*\.sst").unwrap(); @@ -773,8 +813,10 @@ fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, println!("\nsst meta:"); // The output may like this: + // ```text // --------------- Column family "write" (ID 2) -------------- // 63:132906243[3555338 .. 3555338]['7A311B40EFCC2CB4C5911ECF3937D728DED26AE53FA5E61BE04F23F2BE54EACC73' seq:3555338, type:1 .. '7A313030302E25CD5F57252E' seq:3555338, type:1] at level 0 + // ``` let column_r = Regex::new(r"--------------- (.*) --------------\n(.*)").unwrap(); if let Some(m) = column_r.captures(&output) { println!( @@ -826,7 +868,8 @@ fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, println!("unexpected key {}", log_wrappers::Value(&start)); } } else { - // it is expected when the sst is output of a compaction and the sst isn't added to manifest yet. + // it is expected when the sst is output of a compaction and the sst isn't added + // to manifest yet. 
println!( "sst {} is not found in manifest: {}", sst_file_number, output diff --git a/cmd/tikv-ctl/src/util.rs b/cmd/tikv-ctl/src/util.rs index c776f16f83d..0e67c905e8d 100644 --- a/cmd/tikv-ctl/src/util.rs +++ b/cmd/tikv-ctl/src/util.rs @@ -2,15 +2,16 @@ use std::{borrow::ToOwned, error::Error, str, str::FromStr, u64}; +use kvproto::kvrpcpb::KeyRange; use server::setup::initial_logger; -use tikv::config::TiKvConfig; +use tikv::config::TikvConfig; const LOG_DIR: &str = "./ctl-engine-info-log"; #[allow(clippy::field_reassign_with_default)] pub fn init_ctl_logger(level: &str) { - let mut cfg = TiKvConfig::default(); - cfg.log.level = slog::Level::from_str(level).unwrap(); + let mut cfg = TikvConfig::default(); + cfg.log.level = slog::Level::from_str(level).unwrap().into(); cfg.rocksdb.info_log_dir = LOG_DIR.to_owned(); cfg.raftdb.info_log_dir = LOG_DIR.to_owned(); initial_logger(&cfg); @@ -62,8 +63,27 @@ pub fn perror_and_exit(prefix: &str, e: E) -> ! { tikv_util::logger::exit_process_gracefully(-1); } +// Check if region's `key_range` intersects with `key_range_limit`. 
+pub fn check_intersect_of_range(key_range: &KeyRange, key_range_limit: &KeyRange) -> bool { + if !key_range.get_end_key().is_empty() + && !key_range_limit.get_start_key().is_empty() + && key_range.get_end_key() <= key_range_limit.get_start_key() + { + return false; + } + if !key_range_limit.get_end_key().is_empty() + && !key_range.get_start_key().is_empty() + && key_range_limit.get_end_key() < key_range.get_start_key() + { + return false; + } + true +} + #[cfg(test)] mod tests { + use raftstore::store::util::build_key_range; + use super::*; #[test] @@ -73,4 +93,42 @@ mod tests { assert_eq!(from_hex("0x74").unwrap(), result); assert_eq!(from_hex("0X74").unwrap(), result); } + + #[test] + fn test_included_region_in_range() { + // To avoid unfolding the code when `make format` is called + fn range(start: &[u8], end: &[u8]) -> KeyRange { + build_key_range(start, end, false) + } + let mut region = range(&[0x02], &[0x05]); + // region absolutely in range + assert!(check_intersect_of_range(&region, &range(&[0x02], &[0x05]))); + assert!(check_intersect_of_range(&region, &range(&[0x01], &[]))); + assert!(check_intersect_of_range(&region, &range(&[0x02], &[]))); + assert!(check_intersect_of_range(&region, &range(&[], &[]))); + assert!(check_intersect_of_range(&region, &range(&[0x02], &[0x06]))); + assert!(check_intersect_of_range(&region, &range(&[0x01], &[0x05]))); + assert!(check_intersect_of_range(&region, &range(&[], &[0x05]))); + // region intersects with range + assert!(check_intersect_of_range(&region, &range(&[0x04], &[0x05]))); + assert!(check_intersect_of_range(&region, &range(&[0x04], &[]))); + assert!(check_intersect_of_range(&region, &range(&[0x01], &[0x03]))); + assert!(check_intersect_of_range(&region, &range(&[], &[0x03]))); + assert!(check_intersect_of_range(&region, &range(&[], &[0x02]))); // region is left-closed and right-open interval + // range absolutely in region also need to return true + assert!(check_intersect_of_range(&region, &range(&[0x03], &[0x04]))); + // region not intersects
with range + assert!(!check_intersect_of_range(&region, &range(&[0x05], &[]))); // region is left-closed and right-open interval + assert!(!check_intersect_of_range(&region, &range(&[0x06], &[]))); + assert!(!check_intersect_of_range(&region, &range(&[], &[0x01]))); + // check last region + region = range(&[0x02], &[]); + assert!(check_intersect_of_range(&region, &range(&[0x02], &[0x05]))); + assert!(check_intersect_of_range(&region, &range(&[0x02], &[]))); + assert!(check_intersect_of_range(&region, &range(&[0x01], &[0x05]))); + assert!(check_intersect_of_range(&region, &range(&[], &[0x05]))); + assert!(check_intersect_of_range(&region, &range(&[], &[0x02]))); + assert!(check_intersect_of_range(&region, &range(&[], &[]))); + assert!(!check_intersect_of_range(&region, &range(&[], &[0x01]))); + } } diff --git a/cmd/tikv-server/Cargo.toml b/cmd/tikv-server/Cargo.toml index e2f594cd8ad..4bba926a68e 100644 --- a/cmd/tikv-server/Cargo.toml +++ b/cmd/tikv-server/Cargo.toml @@ -7,6 +7,7 @@ publish = false [features] default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] +trace-tablet-lifetime = ["tikv/trace-tablet-lifetime"] tcmalloc = ["server/tcmalloc"] jemalloc = ["server/jemalloc"] mimalloc = ["server/mimalloc"] @@ -32,8 +33,9 @@ pprof-fp = ["tikv/pprof-fp"] [dependencies] clap = "2.32" -server = { path = "../../components/server", default-features = false } -tikv = { path = "../../", default-features = false } +serde_json = { version = "1.0", features = ["preserve_order"] } +server = { workspace = true } +tikv = { workspace = true } toml = "0.5" [build-dependencies] diff --git a/cmd/tikv-server/src/main.rs b/cmd/tikv-server/src/main.rs index 4cb68c6e020..e64afdf1868 100644 --- a/cmd/tikv-server/src/main.rs +++ b/cmd/tikv-server/src/main.rs @@ -5,8 +5,12 @@ use std::{path::Path, process}; use clap::{crate_authors, App, Arg}; +use serde_json::{Map, Value}; use server::setup::{ensure_no_unrecognized_config, validate_and_persist_config}; -use
tikv::config::TiKvConfig; +use tikv::{ + config::{to_flatten_config_info, TikvConfig}, + storage::config::EngineType, +}; fn main() { let build_timestamp = option_env!("TIKV_BUILD_TIME"); @@ -32,6 +36,15 @@ fn main() { .takes_value(false) .help("Check config file validity and exit"), ) + .arg( + Arg::with_name("config-info") + .required(false) + .long("config-info") + .takes_value(true) + .value_name("FORMAT") + .possible_values(&["json"]) + .help("print configuration information with specified format") + ) .arg( Arg::with_name("log-level") .short("L") @@ -147,7 +160,7 @@ fn main() { .get_matches(); if matches.is_present("print-sample-config") { - let config = TiKvConfig::default(); + let config = TikvConfig::default(); println!("{}", toml::to_string_pretty(&config).unwrap()); process::exit(0); } @@ -157,9 +170,9 @@ fn main() { let mut config = matches .value_of_os("config") - .map_or_else(TiKvConfig::default, |path| { + .map_or_else(TikvConfig::default, |path| { let path = Path::new(path); - TiKvConfig::from_file( + TikvConfig::from_file( path, if is_config_check { Some(&mut unrecognized_keys) @@ -186,5 +199,26 @@ fn main() { process::exit(0) } - server::server::run_tikv(config); + let is_config_info = matches.is_present("config-info"); + if is_config_info { + let config_infos = to_flatten_config_info(&config); + let mut result = Map::new(); + result.insert("Component".into(), "TiKV Server".into()); + result.insert("Version".into(), tikv::tikv_build_version().into()); + result.insert("Parameters".into(), Value::Array(config_infos)); + println!("{}", serde_json::to_string_pretty(&result).unwrap()); + process::exit(0); + } + + // engine config needs to be validated + // so that it can adjust the engine type before too late + if let Err(e) = config.storage.validate_engine_type() { + println!("invalid storage.engine configuration: {}", e); + process::exit(1) + } + + match config.storage.engine { + EngineType::RaftKv => server::server::run_tikv(config), + 
EngineType::RaftKv2 => server::server2::run_tikv(config), + } } diff --git a/components/api_version/Cargo.toml b/components/api_version/Cargo.toml index b6ce4bf54d5..c80607145bd 100644 --- a/components/api_version/Cargo.toml +++ b/components/api_version/Cargo.toml @@ -9,14 +9,15 @@ testexport = [] [dependencies] bitflags = "1.0.1" -codec = { path = "../codec", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } -match_template = { path = "../match_template" } +codec = { workspace = true } +engine_traits = { workspace = true } +kvproto = { workspace = true } +log_wrappers = { workspace = true } +match-template = "0.0.1" thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } -txn_types = { path = "../txn_types", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } +txn_types = { workspace = true } [dev-dependencies] -panic_hook = { path = "../panic_hook" } +panic_hook = { workspace = true } diff --git a/components/api_version/src/api_v1.rs b/components/api_version/src/api_v1.rs index 9267d1397c7..1530124d245 100644 --- a/components/api_version/src/api_v1.rs +++ b/components/api_version/src/api_v1.rs @@ -1,5 +1,7 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
+use tikv_util::box_err; + use super::*; impl KvFormat for ApiV1 { @@ -43,28 +45,18 @@ impl KvFormat for ApiV1 { ) -> Result { match src_api { ApiVersion::V1 | ApiVersion::V1ttl => Ok(Key::from_encoded_slice(key)), - ApiVersion::V2 => { - debug_assert_eq!(ApiV2::parse_key_mode(key), KeyMode::Raw); - let (mut user_key, _) = ApiV2::decode_raw_key(&Key::from_encoded_slice(key), true)?; - user_key.remove(0); // remove first byte `RAW_KEY_PREFIX` - Ok(Self::encode_raw_key_owned(user_key, None)) - } + ApiVersion::V2 => Err(box_err!("unsupported conversion from v2 to v1")), /* reject apiv2 -> apiv1 conversion */ } } fn convert_raw_user_key_range_version_from( src_api: ApiVersion, - mut start_key: Vec, - mut end_key: Vec, - ) -> (Vec, Vec) { + start_key: Vec, + end_key: Vec, + ) -> Result<(Vec, Vec)> { match src_api { - ApiVersion::V1 | ApiVersion::V1ttl => (start_key, end_key), - ApiVersion::V2 => { - // TODO: check raw key range after check_api_version_range is refactored. - start_key.remove(0); - end_key.remove(0); - (start_key, end_key) - } + ApiVersion::V1 | ApiVersion::V1ttl => Ok((start_key, end_key)), + ApiVersion::V2 => Err(box_err!("unsupported conversion from v2 to v1")), /* reject apiv2 -> apiv1 conversion */ } } } diff --git a/components/api_version/src/api_v1ttl.rs b/components/api_version/src/api_v1ttl.rs index ce42a023273..2a2df6bfb33 100644 --- a/components/api_version/src/api_v1ttl.rs +++ b/components/api_version/src/api_v1ttl.rs @@ -1,9 +1,12 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
use engine_traits::Result; -use tikv_util::codec::{ - number::{self, NumberEncoder}, - Error, +use tikv_util::{ + box_err, + codec::{ + number::{self, NumberEncoder}, + Error, + }, }; use super::*; @@ -67,28 +70,18 @@ impl KvFormat for ApiV1Ttl { ) -> Result { match src_api { ApiVersion::V1 | ApiVersion::V1ttl => Ok(Key::from_encoded_slice(key)), - ApiVersion::V2 => { - debug_assert_eq!(ApiV2::parse_key_mode(key), KeyMode::Raw); - let (mut user_key, _) = ApiV2::decode_raw_key(&Key::from_encoded_slice(key), true)?; - user_key.remove(0); // remove first byte `RAW_KEY_PREFIX` - Ok(Self::encode_raw_key_owned(user_key, None)) - } + ApiVersion::V2 => Err(box_err!("unsupported conversion from v2 to v1ttl")), /* reject apiv2 -> apiv1ttl conversion */ } } fn convert_raw_user_key_range_version_from( src_api: ApiVersion, - mut start_key: Vec, - mut end_key: Vec, - ) -> (Vec, Vec) { + start_key: Vec, + end_key: Vec, + ) -> Result<(Vec, Vec)> { match src_api { - ApiVersion::V1 | ApiVersion::V1ttl => (start_key, end_key), - ApiVersion::V2 => { - // TODO: check raw key range after check_api_version_range is refactored. - start_key.remove(0); - end_key.remove(0); - (start_key, end_key) - } + ApiVersion::V1 | ApiVersion::V1ttl => Ok((start_key, end_key)), + ApiVersion::V2 => Err(box_err!("unsupported conversion from v2 to v1ttl")), /* reject apiv2 -> apiv1ttl conversion */ } } } diff --git a/components/api_version/src/api_v2.rs b/components/api_version/src/api_v2.rs index d12926cb39b..a56d5deac30 100644 --- a/components/api_version/src/api_v2.rs +++ b/components/api_version/src/api_v2.rs @@ -16,6 +16,8 @@ pub const RAW_KEY_PREFIX_END: u8 = RAW_KEY_PREFIX + 1; pub const TXN_KEY_PREFIX: u8 = b'x'; pub const TIDB_META_KEY_PREFIX: u8 = b'm'; pub const TIDB_TABLE_KEY_PREFIX: u8 = b't'; +pub const DEFAULT_KEY_SPACE_ID: [u8; 3] = [0, 0, 0]; // reserve 3 bytes for key space id. 
+pub const DEFAULT_KEY_SPACE_ID_END: [u8; 3] = [0, 0, 1]; pub const TIDB_RANGES: &[(&[u8], &[u8])] = &[ (&[TIDB_META_KEY_PREFIX], &[TIDB_META_KEY_PREFIX + 1]), @@ -48,7 +50,7 @@ impl KvFormat for ApiV2 { match key[0] { RAW_KEY_PREFIX => KeyMode::Raw, TXN_KEY_PREFIX => KeyMode::Txn, - TIDB_META_KEY_PREFIX | TIDB_TABLE_KEY_PREFIX => KeyMode::TiDB, + TIDB_META_KEY_PREFIX | TIDB_TABLE_KEY_PREFIX => KeyMode::Tidb, _ => KeyMode::Unknown, } } @@ -141,8 +143,8 @@ impl KvFormat for ApiV2 { } // Note: `user_key` may not be `KeyMode::Raw`. - // E.g., `raw_xxx_range` interfaces accept an exclusive end key just beyond the scope of raw keys. - // The validity is ensured by client & Storage interfaces. + // E.g. `raw_xxx_range` interfaces accept an exclusive end key just beyond the + // scope of raw keys. The validity is ensured by client & Storage interfaces. fn encode_raw_key(user_key: &[u8], ts: Option) -> Key { let encoded_key = Key::from_raw(user_key); if let Some(ts) = ts { @@ -154,13 +156,14 @@ impl KvFormat for ApiV2 { } // Note: `user_key` may not be `KeyMode::Raw`. - // E.g., `raw_xxx_range` interfaces accept an exclusive end key just beyond the scope of raw keys. - // The validity is ensured by client & Storage interfaces. + // E.g. `raw_xxx_range` interfaces accept an exclusive end key just beyond the + // scope of raw keys. The validity is ensured by client & Storage interfaces. fn encode_raw_key_owned(mut user_key: Vec, ts: Option) -> Key { let src_len = user_key.len(); let encoded_len = MemComparableByteCodec::encoded_len(src_len); - // always reserve more U64_SIZE for ts, as it's likely to "append_ts" later, especially in raw write procedures. + // always reserve more U64_SIZE for ts, as it's likely to "append_ts" later, + // especially in raw write procedures. 
user_key.reserve(encoded_len - src_len + number::U64_SIZE); user_key.resize(encoded_len, 0u8); MemComparableByteCodec::encode_all_in_place(&mut user_key, src_len); @@ -182,9 +185,7 @@ impl KvFormat for ApiV2 { ) -> Result { match src_api { ApiVersion::V1 | ApiVersion::V1ttl => { - let mut apiv2_key = Vec::with_capacity(ApiV2::get_encode_len(key.len() + 1)); - apiv2_key.push(RAW_KEY_PREFIX); - apiv2_key.extend(key); + let apiv2_key = ApiV2::add_prefix(key, &DEFAULT_KEY_SPACE_ID); Ok(Self::encode_raw_key_owned(apiv2_key, ts)) } ApiVersion::V2 => Ok(Key::from_encoded_slice(key)), @@ -195,18 +196,18 @@ impl KvFormat for ApiV2 { src_api: ApiVersion, mut start_key: Vec, mut end_key: Vec, - ) -> (Vec, Vec) { + ) -> Result<(Vec, Vec)> { match src_api { ApiVersion::V1 | ApiVersion::V1ttl => { - start_key.insert(0, RAW_KEY_PREFIX); + start_key = ApiV2::add_prefix(&start_key, &DEFAULT_KEY_SPACE_ID); if end_key.is_empty() { - end_key.insert(0, RAW_KEY_PREFIX_END); + end_key = ApiV2::add_prefix(&end_key, &DEFAULT_KEY_SPACE_ID_END); } else { - end_key.insert(0, RAW_KEY_PREFIX); + end_key = ApiV2::add_prefix(&end_key, &DEFAULT_KEY_SPACE_ID); } - (start_key, end_key) + Ok((start_key, end_key)) } - ApiVersion::V2 => (start_key, end_key), + ApiVersion::V2 => Ok((start_key, end_key)), } } } @@ -235,12 +236,21 @@ impl ApiV2 { Ok(Key::split_on_ts_for(key)?) } + pub fn add_prefix(key: &[u8], key_space: &[u8]) -> Vec { + let mut apiv2_key = + Vec::with_capacity(ApiV2::get_encode_len(key.len() + key_space.len() + 1)); + apiv2_key.push(RAW_KEY_PREFIX); + apiv2_key.extend(key_space); // Reserved 3 bytes for key space id. + apiv2_key.extend(key); + apiv2_key + } + pub const ENCODED_LOGICAL_DELETE: [u8; 1] = [ValueMeta::DELETE_FLAG.bits]; } // Note: `encoded_bytes` may not be `KeyMode::Raw`. -// E.g., backup service accept an exclusive end key just beyond the scope of raw keys. -// The validity is ensured by client & Storage interfaces. 
+// E.g., backup service accept an exclusive end key just beyond the scope of raw +// keys. The validity is ensured by client & Storage interfaces. #[inline] fn is_valid_encoded_bytes(mut encoded_bytes: &[u8], with_ts: bool) -> bool { bytes::decode_bytes(&mut encoded_bytes, false).is_ok() @@ -252,8 +262,8 @@ fn is_valid_encoded_key(encoded_key: &Key, with_ts: bool) -> bool { is_valid_encoded_bytes(encoded_key.as_encoded(), with_ts) } -/// TimeStamp::zero is not acceptable, as such entries can not be retrieved by RawKV MVCC. -/// See `RawMvccSnapshot::seek_first_key_value_cf`. +/// TimeStamp::zero is not acceptable, as such entries can not be retrieved by +/// RawKV MVCC. See `RawMvccSnapshot::seek_first_key_value_cf`. #[inline] fn is_valid_ts(ts: TimeStamp) -> bool { !ts.is_zero() diff --git a/components/api_version/src/keyspace.rs b/components/api_version/src/keyspace.rs new file mode 100644 index 00000000000..4b263822a1b --- /dev/null +++ b/components/api_version/src/keyspace.rs @@ -0,0 +1,163 @@ +use std::fmt::Debug; + +use engine_traits::{Error, Result}; +use tikv_util::box_err; + +use super::*; + +const KEYSPACE_PREFIX_LEN: usize = 4; + +pub trait KvPair { + fn key(&self) -> &[u8]; + fn value(&self) -> &[u8]; + fn kv(&self) -> (&[u8], &[u8]) { + (self.key(), self.value()) + } +} + +impl KvPair for (Vec, Vec) { + fn key(&self) -> &[u8] { + &self.0 + } + fn value(&self) -> &[u8] { + &self.1 + } +} + +pub trait Keyspace { + type KvPair: KvPair = (Vec, Vec); + fn make_kv_pair(p: (Vec, Vec)) -> Result; + fn parse_keyspace(key: &[u8]) -> Result<(Option, &[u8])> { + Ok((None, key)) + } +} + +#[derive(PartialEq, Clone, Copy, Debug)] +pub struct KeyspaceId(u32); + +impl From for KeyspaceId { + fn from(id: u32) -> Self { + Self(id) + } +} + +impl Keyspace for ApiV1 { + fn make_kv_pair(p: (Vec, Vec)) -> Result { + Ok(p) + } +} + +impl Keyspace for ApiV1Ttl { + fn make_kv_pair(p: (Vec, Vec)) -> Result { + Ok(p) + } +} + +impl Keyspace for ApiV2 { + type KvPair = 
KeyspaceKv; + + fn make_kv_pair(p: (Vec, Vec)) -> Result { + let (k, v) = p; + let (keyspace, _) = Self::parse_keyspace(&k)?; + Ok(KeyspaceKv { + k, + v, + keyspace: keyspace.unwrap(), + }) + } + + fn parse_keyspace(key: &[u8]) -> Result<(Option, &[u8])> { + let mode = ApiV2::parse_key_mode(key); + if key.len() < KEYSPACE_PREFIX_LEN || (mode != KeyMode::Raw && mode != KeyMode::Txn) { + return Err(Error::Other(box_err!( + "invalid API V2 key: {}", + log_wrappers::Value(key) + ))); + } + let id = u32::from_be_bytes([0, key[1], key[2], key[3]]); + Ok((Some(KeyspaceId::from(id)), &key[KEYSPACE_PREFIX_LEN..])) + } +} + +pub struct KeyspaceKv { + k: Vec, + v: Vec, + keyspace: KeyspaceId, +} + +impl KvPair for KeyspaceKv { + fn key(&self) -> &[u8] { + &self.k[KEYSPACE_PREFIX_LEN..] + } + + fn value(&self) -> &[u8] { + &self.v + } +} + +impl KeyspaceKv { + pub fn keyspace(&self) -> KeyspaceId { + self.keyspace + } +} + +impl PartialEq<(Vec, Vec)> for KeyspaceKv { + fn eq(&self, other: &(Vec, Vec)) -> bool { + self.kv() == (&other.0, &other.1) + } +} + +impl PartialEq for KeyspaceKv { + fn eq(&self, other: &Self) -> bool { + self.k == other.k && self.v == other.v + } +} + +impl Debug for KeyspaceKv { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("KeyspaceKv") + .field("key", &log_wrappers::Value(self.key())) + .field("value", &log_wrappers::Value(self.value())) + .field("keyspace", &self.keyspace()) + .finish() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_v1_parse_keyspace() { + let k = b"t123_111"; + let (keyspace, key) = ApiV1::parse_keyspace(k).unwrap(); + assert_eq!(None, keyspace); + assert_eq!(k, key); + + let (keyspace, key) = ApiV1Ttl::parse_keyspace(k).unwrap(); + assert_eq!(None, keyspace); + assert_eq!(k, key); + } + + #[test] + fn test_v2_parse_keyspace() { + let ok = vec![ + (b"x\x00\x00\x01t123_114", 1, b"t123_114"), + (b"r\x00\x00\x01t123_112", 1, b"t123_112"), + (b"x\x01\x00\x00t213_112", 
0x010000, b"t213_112"), + (b"r\x01\x00\x00t123_113", 0x010000, b"t123_113"), + ]; + + for (key, id, user_key) in ok { + let (keyspace, key) = ApiV2::parse_keyspace(key).unwrap(); + assert_eq!(Some(KeyspaceId::from(id)), keyspace); + assert_eq!(user_key, key); + } + + let err: Vec<&[u8]> = vec![b"t123_111", b"s\x00\x00", b"r\x00\x00"]; + + for key in err { + ApiV2::parse_keyspace(key).unwrap_err(); + } + } +} diff --git a/components/api_version/src/lib.rs b/components/api_version/src/lib.rs index b57b1dfae45..879751e7b62 100644 --- a/components/api_version/src/lib.rs +++ b/components/api_version/src/lib.rs @@ -1,30 +1,36 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. #![feature(min_specialization)] +#![feature(associated_type_defaults)] mod api_v1; mod api_v1ttl; pub mod api_v2; +pub mod keyspace; use engine_traits::Result; use kvproto::kvrpcpb::ApiVersion; pub use match_template::match_template; use txn_types::{Key, TimeStamp}; -pub trait KvFormat: Clone + Copy + 'static + Send + Sync { +use crate::keyspace::Keyspace; + +pub trait KvFormat: Keyspace + Clone + Copy + 'static + Send + Sync { const TAG: ApiVersion; /// Corresponding TAG of client requests. For test only. #[cfg(any(test, feature = "testexport"))] const CLIENT_TAG: ApiVersion; const IS_TTL_ENABLED: bool; - /// Parse the key prefix and infer key mode. It's safe to parse either raw key or encoded key. + /// Parse the key prefix and infer key mode. It's safe to parse either raw + /// key or encoded key. fn parse_key_mode(key: &[u8]) -> KeyMode; fn parse_range_mode(range: (Option<&[u8]>, Option<&[u8]>)) -> KeyMode; /// Parse from the bytes from storage. fn decode_raw_value(bytes: &[u8]) -> Result>; - /// This is equivalent to `decode_raw_value()` but returns the owned user value. + /// This is equivalent to `decode_raw_value()` but returns the owned user + /// value. 
fn decode_raw_value_owned(mut bytes: Vec) -> Result>> { let (len, expire_ts, is_delete) = { let raw_value = Self::decode_raw_value(&bytes)?; @@ -47,8 +53,8 @@ pub trait KvFormat: Clone + Copy + 'static + Send + Sync { /// This is equivalent to `encode_raw_value` but reduced an allocation. fn encode_raw_value_owned(value: RawValue>) -> Vec; - /// Parse from the txn_types::Key from storage. Default implementation for API V1|V1TTL. - /// Return: (user key, optional timestamp) + /// Parse from the txn_types::Key from storage. Default implementation for + /// API V1|V1TTL. Return: (user key, optional timestamp) fn decode_raw_key(encoded_key: &Key, _with_ts: bool) -> Result<(Vec, Option)> { Ok((encoded_key.as_encoded().clone(), None)) } @@ -59,7 +65,8 @@ pub trait KvFormat: Clone + Copy + 'static + Send + Sync { ) -> Result<(Vec, Option)> { Ok((encoded_key.into_encoded(), None)) } - /// Encode the user key & optional timestamp into txn_types::Key. Default implementation for API V1|V1TTL. + /// Encode the user key & optional timestamp into txn_types::Key. Default + /// implementation for API V1|V1TTL. fn encode_raw_key(user_key: &[u8], _ts: Option) -> Key { Key::from_encoded_slice(user_key) } @@ -80,7 +87,7 @@ pub trait KvFormat: Clone + Copy + 'static + Send + Sync { src_api: ApiVersion, start_key: Vec, end_key: Vec, - ) -> (Vec, Vec); + ) -> Result<(Vec, Vec)>; /// Convert the encoded value from src_api version to Self::TAG version fn convert_raw_encoded_value_version_from( @@ -138,7 +145,8 @@ macro_rules! match_template_api_version { }} } -/// Dispatch an expression with type `kvproto::kvrpcpb::ApiVersion` to corresponding concrete type of `KvFormat` +/// Dispatch an expression with type `kvproto::kvrpcpb::ApiVersion` to +/// corresponding concrete type of `KvFormat` /// /// For example, the following code /// @@ -172,7 +180,7 @@ macro_rules! dispatch_api_version { } /// The key mode inferred from the key prefix. 
-#[derive(Debug, Clone, Copy, Eq, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq)] pub enum KeyMode { /// Raw key. Raw, @@ -184,7 +192,7 @@ pub enum KeyMode { /// TiDB, but instead, it means that the key matches the definition of /// TiDB key in API V2, therefore, the key is treated as TiDB data in /// order to fulfill compatibility. - TiDB, + Tidb, /// Unrecognised key mode. Unknown, } @@ -197,8 +205,8 @@ pub enum KeyMode { /// /// ### ApiVersion::V1ttl /// -/// 8 bytes representing the unix timestamp in seconds for expiring time will be append -/// to the value of all RawKV kv pairs. +/// 8 bytes representing the unix timestamp in seconds for expiring time will be +/// append to the value of all RawKV kv pairs. /// /// ```text /// ------------------------------------------------------------ @@ -221,8 +229,8 @@ pub enum KeyMode { /// ``` /// /// As shown in the example below, the least significant bit of the meta flag -/// indicates whether the value contains 8 bytes expire ts at the very left to the -/// meta flags. +/// indicates whether the value contains 8 bytes expire ts at the very left to +/// the meta flags. /// /// ```text /// -------------------------------------------------------------------------------- @@ -231,11 +239,12 @@ pub enum KeyMode { /// | 0x12 0x34 0x56 | 0x00 0x00 0x00 0x00 0x00 0x00 0xff 0xff | 0x01 (0b00000001) | /// -------------------------------------------------------------------------------- /// ``` -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq)] pub struct RawValue> { /// The user value. pub user_value: T, - /// The unix timestamp in seconds indicating the point of time that this key will be deleted. + /// The unix timestamp in seconds indicating the point of time that this key + /// will be deleted. 
pub expire_ts: Option, /// Logical deletion flag in ApiV2, should be `false` in ApiV1 and ApiV1Ttl pub is_delete: bool, @@ -244,10 +253,13 @@ pub struct RawValue> { impl> RawValue { #[inline] pub fn is_valid(&self, current_ts: u64) -> bool { - !self.is_delete - && self - .expire_ts - .map_or(true, |expire_ts| expire_ts > current_ts) + !self.is_delete & !self.is_ttl_expired(current_ts) + } + + #[inline] + pub fn is_ttl_expired(&self, current_ts: u64) -> bool { + self.expire_ts + .map_or(false, |expire_ts| expire_ts <= current_ts) } } @@ -266,8 +278,8 @@ mod tests { ); assert_eq!(ApiV2::parse_key_mode(&[RAW_KEY_PREFIX]), KeyMode::Raw); assert_eq!(ApiV2::parse_key_mode(&[TXN_KEY_PREFIX]), KeyMode::Txn); - assert_eq!(ApiV2::parse_key_mode(&b"t_a"[..]), KeyMode::TiDB); - assert_eq!(ApiV2::parse_key_mode(&b"m"[..]), KeyMode::TiDB); + assert_eq!(ApiV2::parse_key_mode(&b"t_a"[..]), KeyMode::Tidb); + assert_eq!(ApiV2::parse_key_mode(&b"m"[..]), KeyMode::Tidb); assert_eq!(ApiV2::parse_key_mode(&b"ot"[..]), KeyMode::Unknown); } @@ -284,19 +296,19 @@ mod tests { ); assert_eq!( ApiV2::parse_range_mode((Some(b"t_a"), Some(b"t_z"))), - KeyMode::TiDB + KeyMode::Tidb ); assert_eq!( ApiV2::parse_range_mode((Some(b"t"), Some(b"u"))), - KeyMode::TiDB + KeyMode::Tidb ); assert_eq!( ApiV2::parse_range_mode((Some(b"m"), Some(b"n"))), - KeyMode::TiDB + KeyMode::Tidb ); assert_eq!( ApiV2::parse_range_mode((Some(b"m_a"), Some(b"m_z"))), - KeyMode::TiDB + KeyMode::Tidb ); assert_eq!( ApiV2::parse_range_mode((Some(b"x\0a"), Some(b"x\0z"))), @@ -482,22 +494,25 @@ mod tests { #[test] fn test_value_valid() { let cases = vec![ - // expire_ts, is_delete, expect_is_valid - (None, false, true), - (None, true, false), - (Some(5), false, false), - (Some(5), true, false), - (Some(100), false, true), - (Some(100), true, false), + // expire_ts, is_delete, expect_is_valid, expect_ttl_expired + (None, false, true, false), + (None, true, false, false), + (Some(5), false, false, true), + (Some(5), true, 
false, true), + (Some(100), false, true, false), + (Some(100), true, false, false), ]; - for (idx, (expire_ts, is_delete, expect_is_valid)) in cases.into_iter().enumerate() { + for (idx, (expire_ts, is_delete, expect_is_valid, ttl_expired)) in + cases.into_iter().enumerate() + { let raw_value = RawValue { user_value: b"value", expire_ts, is_delete, }; assert_eq!(raw_value.is_valid(10), expect_is_valid, "case {}", idx); + assert_eq!(raw_value.is_ttl_expired(10), ttl_expired, "case {}", idx); } } @@ -633,8 +648,8 @@ mod tests { .clone() .into_iter() .map(|key| { - let mut v2_key = key; - v2_key.insert(0, RAW_KEY_PREFIX); + let mut v2_key = vec![RAW_KEY_PREFIX, 0, 0, 0]; + v2_key.extend(key); ApiV2::encode_raw_key_owned(v2_key, Some(TimeStamp::from(timestamp))).into_encoded() }) .collect(); @@ -642,8 +657,6 @@ mod tests { let test_cases = vec![ (ApiVersion::V1, ApiVersion::V2, &apiv1_keys, &apiv2_keys), (ApiVersion::V1ttl, ApiVersion::V2, &apiv1_keys, &apiv2_keys), - (ApiVersion::V2, ApiVersion::V1, &apiv2_keys, &apiv1_keys), - (ApiVersion::V2, ApiVersion::V1ttl, &apiv2_keys, &apiv1_keys), ]; for i in 0..apiv1_keys.len() { for (src_api_ver, dst_api_ver, src_data, dst_data) in test_cases.clone() { @@ -731,14 +744,14 @@ mod tests { .clone() .into_iter() .map(|(start_key, end_key)| { - let mut v2_start_key = start_key; - let mut v2_end_key = end_key; - v2_start_key.insert(0, RAW_KEY_PREFIX); - if v2_end_key.is_empty() { - v2_end_key.insert(0, RAW_KEY_PREFIX_END); + let mut v2_start_key = vec![RAW_KEY_PREFIX, 0, 0, 0]; // key space takes 3 bytes. + let mut v2_end_key = if end_key.is_empty() { + vec![RAW_KEY_PREFIX, 0, 0, 1] } else { - v2_end_key.insert(0, RAW_KEY_PREFIX); - } + vec![RAW_KEY_PREFIX, 0, 0, 0] // key space takes 3 bytes. 
+ }; + v2_start_key.extend(start_key); + v2_end_key.extend(end_key); (v2_start_key, v2_end_key) }) .collect(); @@ -756,18 +769,6 @@ mod tests { &apiv1_key_ranges, &apiv2_key_ranges, ), - ( - ApiVersion::V2, - ApiVersion::V1, - &apiv2_key_ranges, - &apiv1_key_ranges, - ), - ( - ApiVersion::V2, - ApiVersion::V1ttl, - &apiv2_key_ranges, - &apiv1_key_ranges, - ), ]; for (src_api_ver, dst_api_ver, src_data, dst_data) in test_cases { for i in 0..apiv1_key_ranges.len() { @@ -775,7 +776,7 @@ mod tests { let (src_start, src_end) = src_data[i].clone(); API::convert_raw_user_key_range_version_from(src_api_ver, src_start, src_end) }); - assert_eq!(dst_key_range, dst_data[i]); + assert_eq!(dst_key_range.unwrap(), dst_data[i]); } } } diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index f14c0aa3c39..005849391e9 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -8,9 +8,11 @@ default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] test-engine-kv-rocksdb = ["tikv/test-engine-kv-rocksdb"] test-engine-raft-raft-engine = ["tikv/test-engine-raft-raft-engine"] test-engines-rocksdb = ["tikv/test-engines-rocksdb"] -failpoints = ["tikv/failpoints", "fail/failpoints", "fail"] +failpoints = ["tikv/failpoints", "fail/failpoints"] backup-stream-debug = [] +metastore-etcd = ["tonic", "etcd-client"] + [[test]] name = "integration" path = "tests/mod.rs" @@ -19,59 +21,73 @@ test = true harness = true [dependencies] +async-compression = { version = "0.3.14", features = ["tokio", "zstd"] } async-trait = { version = "0.1" } bytes = "1" +cfg-if = "1" chrono = "0.4" -concurrency_manager = { path = "../concurrency_manager" } +concurrency_manager = { workspace = true } crossbeam = "0.8" crossbeam-channel = "0.5" dashmap = "5" -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -error_code = { path = "../error_code" } 
-etcd-client = { version = "0.7", features = ["pub-response-field", "tls"] } -external_storage = { path = "../external_storage", default-features = false } -external_storage_export = { path = "../external_storage/export", default-features = false } -fail = { version = "0.5", optional = true } - -file_system = { path = "../file_system" } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } +error_code = { workspace = true } +# We cannot update the etcd-client to latest version because of the cyclic requirement. +# Also we need wait until https://github.com/etcdv3/etcd-client/pull/43/files to be merged. +etcd-client = { git = "https://github.com/pingcap/etcd-client", rev = "41d393c32a7a7c728550cee1d9a138dafe6f3e27", features = ["pub-response-field", "tls-openssl-vendored"], optional = true } +external_storage = { workspace = true } +external_storage_export = { workspace = true } +fail = "0.5" +file_system = { workspace = true } futures = "0.3" +futures-io = "0.3" +grpcio = { workspace = true } hex = "0.4" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +# Fixing ahash cyclic dep: https://github.com/tkaitchuck/ahash/issues/95 +indexmap = "=1.6.2" +kvproto = { workspace = true } lazy_static = "1.4" -log_wrappers = { path = "../log_wrappers" } -online_config = { path = "../online_config" } +log_wrappers = { workspace = true } +online_config = { workspace = true } openssl = "0.10" -pd_client = { path = "../pd_client" } +pd_client = { workspace = true } +pin-project = "1.0" prometheus = { version = "0.13", default-features = false, features = ["nightly"] } +prometheus-static-metric = "0.5" protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raftstore = { path = "../raftstore", default-features = false } +raftstore = { workspace = true } +rand = "0.8.0" regex = "1" -resolved_ts = { path = "../resolved_ts" } -slog = { version = "2.3", features = 
["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +resolved_ts = { workspace = true } +security = { path = "../security" } +slog = { workspace = true } +slog-global = { workspace = true } thiserror = "1" -tidb_query_datatype = { path = "../tidb_query_datatype", default-features = false } -tikv = { path = "../../", default-features = false } -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util" } +tidb_query_datatype = { workspace = true } +tikv = { workspace = true } +tikv_alloc = { workspace = true } +tikv_kv = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread", "macros", "time", "sync"] } tokio-stream = "0.1" tokio-util = { version = "0.7", features = ["compat"] } -tonic = "0.5" -txn_types = { path = "../txn_types", default-features = false } +tonic = { version = "0.8", optional = true } +txn_types = { workspace = true } uuid = "0.8" -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [dev-dependencies] async-trait = "0.1" -engine_panic = { path = "../engine_panic" } -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +engine_panic = { workspace = true } +grpcio = { workspace = true } hex = "0.4" -rand = "0.8.0" +protobuf = { version = "2.8", features = ["bytes"] } tempdir = "0.3" -test_raftstore = { path = "../test_raftstore", default-features = false } -test_util = { path = "../test_util", default-features = false } +tempfile = "3.0" +test_pd = { workspace = true } +test_raftstore = { workspace = true } +test_util = { workspace = true } url = "2" walkdir = "2" diff --git a/components/backup-stream/src/checkpoint_manager.rs b/components/backup-stream/src/checkpoint_manager.rs new file mode 100644 index 00000000000..d32c2ea7c00 --- /dev/null 
+++ b/components/backup-stream/src/checkpoint_manager.rs @@ -0,0 +1,806 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{cell::RefCell, collections::HashMap, sync::Arc, time::Duration}; + +use futures::{ + channel::mpsc::{self as async_mpsc, Receiver, Sender}, + future::BoxFuture, + FutureExt, SinkExt, StreamExt, +}; +use grpcio::{RpcStatus, RpcStatusCode, WriteFlags}; +use kvproto::{ + errorpb::{Error as PbError, *}, + logbackuppb::{FlushEvent, SubscribeFlushEventResponse}, + metapb::Region, +}; +use pd_client::PdClient; +use tikv_util::{box_err, defer, info, time::Instant, warn, worker::Scheduler}; +use txn_types::TimeStamp; +use uuid::Uuid; + +use crate::{ + annotate, + errors::{Error, Result}, + future, + metadata::{store::MetaStore, Checkpoint, CheckpointProvider, MetadataClient}, + metrics, + subscription_track::ResolveResult, + try_send, RegionCheckpointOperation, Task, +}; + +/// A manager for maintaining the last flush ts. +/// This information is provided for the `advancer` in checkpoint V3, +/// which involved a central node (typically TiDB) for collecting all regions' +/// checkpoint then advancing the global checkpoint. +#[derive(Default)] +pub struct CheckpointManager { + checkpoint_ts: HashMap, + resolved_ts: HashMap, + manager_handle: Option>, +} + +impl std::fmt::Debug for CheckpointManager { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("CheckpointManager") + .field("checkpoints", &self.checkpoint_ts) + .field("resolved-ts", &self.resolved_ts) + .finish() + } +} + +enum SubscriptionOp { + Add(Subscription), + Emit(Box<[FlushEvent]>), + #[cfg(test)] + Inspect(Box), +} + +pub struct SubscriptionManager { + subscribers: HashMap, + input: Receiver, +} + +impl SubscriptionManager { + pub async fn main_loop(mut self) { + info!("subscription manager started!"); + defer! 
{ info!("subscription manager exit.") } + while let Some(msg) = self.input.next().await { + match msg { + SubscriptionOp::Add(sub) => { + let uid = Uuid::new_v4(); + info!("log backup adding new subscriber"; "id" => %uid); + self.subscribers.insert(uid, sub); + } + SubscriptionOp::Emit(events) => { + self.emit_events(events).await; + } + #[cfg(test)] + SubscriptionOp::Inspect(f) => { + f(&self); + } + } + } + // NOTE: Maybe close all subscription streams here. + } + + async fn emit_events(&mut self, events: Box<[FlushEvent]>) { + let mut canceled = vec![]; + info!("log backup sending events"; "event_len" => %events.len(), "downstream" => %self.subscribers.len()); + for (id, sub) in &mut self.subscribers { + let send_all = async { + for es in events.chunks(1024) { + let mut resp = SubscribeFlushEventResponse::new(); + resp.set_events(es.to_vec().into()); + sub.feed((resp, WriteFlags::default())).await?; + } + sub.flush().await + }; + + if let Err(err) = send_all.await { + canceled.push(*id); + Error::from(err).report("sending subscription"); + } + } + + for c in canceled { + self.remove_subscription(&c).await; + } + } + + async fn remove_subscription(&mut self, id: &Uuid) { + match self.subscribers.remove(id) { + Some(sub) => { + info!("client is gone, removing subscription"; "id" => %id); + // The stream is an endless stream -- we don't need to close it. + drop(sub); + } + None => { + warn!("BUG: the subscriber has been removed before we are going to remove it."; "id" => %id); + } + } + } +} + +// Note: can we make it more generic...? +#[cfg(not(test))] +pub type Subscription = + grpcio::ServerStreamingSink; + +#[cfg(test)] +pub type Subscription = tests::MockSink; + +/// The result of getting a checkpoint. +/// The possibility of failed to getting checkpoint is pretty high: +/// because there is a gap between region leader change and flushing. 
+#[derive(Debug)] +pub enum GetCheckpointResult { + Ok { + region: Region, + checkpoint: TimeStamp, + }, + NotFound { + id: RegionIdWithVersion, + err: PbError, + }, + EpochNotMatch { + region: Region, + err: PbError, + }, +} + +impl GetCheckpointResult { + /// create an "ok" variant with region. + pub fn ok(region: Region, checkpoint: TimeStamp) -> Self { + Self::Ok { region, checkpoint } + } + + fn not_found(id: RegionIdWithVersion) -> Self { + Self::NotFound { + id, + err: not_leader(id.region_id), + } + } + + /// create a epoch not match variant with region + fn epoch_not_match(provided: RegionIdWithVersion, real: &Region) -> Self { + Self::EpochNotMatch { + region: real.clone(), + err: epoch_not_match( + provided.region_id, + provided.region_epoch_version, + real.get_region_epoch().get_version(), + ), + } + } +} + +impl CheckpointManager { + pub fn spawn_subscription_mgr(&mut self) -> future![()] { + let (tx, rx) = async_mpsc::channel(1024); + let sub = SubscriptionManager { + subscribers: Default::default(), + input: rx, + }; + self.manager_handle = Some(tx); + sub.main_loop() + } + + pub fn resolve_regions(&mut self, region_and_checkpoint: Vec) { + for res in region_and_checkpoint { + self.do_update(res.region, res.checkpoint); + } + } + + pub fn flush(&mut self) { + info!("log backup checkpoint manager flushing."; "resolved_ts_len" => %self.resolved_ts.len(), "resolved_ts" => ?self.get_resolved_ts()); + self.checkpoint_ts = std::mem::take(&mut self.resolved_ts); + // Clippy doesn't know this iterator borrows `self.checkpoint_ts` :( + #[allow(clippy::needless_collect)] + let items = self + .checkpoint_ts + .values() + .cloned() + .map(|x| (x.region, x.checkpoint)) + .collect::>(); + self.notify(items.into_iter()); + } + + /// update a region checkpoint in need. 
+ #[cfg(test)] + fn update_region_checkpoint(&mut self, region: &Region, checkpoint: TimeStamp) { + Self::update_ts(&mut self.checkpoint_ts, region.clone(), checkpoint) + } + + fn update_ts( + container: &mut HashMap, + region: Region, + checkpoint: TimeStamp, + ) { + let e = container.entry(region.get_id()); + let ver = region.get_region_epoch().get_version(); + // A hacky way to allow the two closures move out the region. + // It is safe given the two closures would only be called once. + let r = RefCell::new(Some(region)); + e.and_modify(|old_cp| { + let old_ver = old_cp.region.get_region_epoch().get_version(); + let checkpoint_is_newer = old_cp.checkpoint < checkpoint; + if old_ver < ver || (old_ver == ver && checkpoint_is_newer) { + *old_cp = LastFlushTsOfRegion { + checkpoint, + region: r.borrow_mut().take().expect( + "unreachable: `and_modify` and `or_insert_with` called at the same time.", + ), + }; + } + }) + .or_insert_with(|| LastFlushTsOfRegion { + checkpoint, + region: r + .borrow_mut() + .take() + .expect("unreachable: `and_modify` and `or_insert_with` called at the same time."), + }); + } + + pub fn add_subscriber(&mut self, sub: Subscription) -> BoxFuture<'static, Result<()>> { + let mgr = self.manager_handle.as_ref().cloned(); + let initial_data = self + .checkpoint_ts + .values() + .map(|v| FlushEvent { + start_key: v.region.start_key.clone(), + end_key: v.region.end_key.clone(), + checkpoint: v.checkpoint.into_inner(), + ..Default::default() + }) + .collect::>(); + + // NOTE: we cannot send the real error into the client directly because once + // we send the subscription into the sink, we cannot fetch it again :( + async move { + let mgr = mgr.ok_or(Error::Other(box_err!("subscription manager not get ready"))); + let mut mgr = match mgr { + Ok(mgr) => mgr, + Err(err) => { + sub.fail(RpcStatus::with_message( + RpcStatusCode::UNAVAILABLE, + "subscription manager not get ready.".to_owned(), + )) + .await + .map_err(|err| { + annotate!(err, "failed 
to send request to subscriber manager") + })?; + return Err(err); + } + }; + mgr.send(SubscriptionOp::Add(sub)) + .await + .map_err(|err| annotate!(err, "failed to send request to subscriber manager"))?; + mgr.send(SubscriptionOp::Emit(initial_data)) + .await + .map_err(|err| { + annotate!(err, "failed to send initial data to subscriber manager") + })?; + Ok(()) + } + .boxed() + } + + fn notify(&mut self, items: impl Iterator) { + if let Some(mgr) = self.manager_handle.as_mut() { + let r = items + .map(|(r, ts)| { + let mut f = FlushEvent::new(); + f.set_checkpoint(ts.into_inner()); + f.set_start_key(r.start_key); + f.set_end_key(r.end_key); + f + }) + .collect::>(); + let event_size = r.len(); + let res = mgr.try_send(SubscriptionOp::Emit(r)); + // Note: perhaps don't batch in the channel but batch in the receiver side? + // If so, we can control the memory usage better. + if let Err(err) = res { + warn!("the channel is full, dropping some events."; "length" => %event_size, "err" => %err); + } + } + } + + fn do_update(&mut self, region: Region, checkpoint: TimeStamp) { + Self::update_ts(&mut self.resolved_ts, region, checkpoint) + } + + /// get checkpoint from a region. + pub fn get_from_region(&self, region: RegionIdWithVersion) -> GetCheckpointResult { + let checkpoint = self.checkpoint_ts.get(®ion.region_id); + if checkpoint.is_none() { + return GetCheckpointResult::not_found(region); + } + let checkpoint = checkpoint.unwrap(); + if checkpoint.region.get_region_epoch().get_version() != region.region_epoch_version { + return GetCheckpointResult::epoch_not_match(region, &checkpoint.region); + } + GetCheckpointResult::ok(checkpoint.region.clone(), checkpoint.checkpoint) + } + + /// get all checkpoints stored. 
+ pub fn get_all(&self) -> Vec { + self.checkpoint_ts.values().cloned().collect() + } + + pub fn get_resolved_ts(&self) -> Option { + self.resolved_ts.values().map(|x| x.checkpoint).min() + } + + #[cfg(test)] + fn sync_with_subs_mgr( + &mut self, + f: impl FnOnce(&SubscriptionManager) -> T + Send + 'static, + ) -> T { + use std::sync::Mutex; + + let (tx, rx) = std::sync::mpsc::sync_channel(1); + let t = Arc::new(Mutex::new(None)); + let tr = Arc::clone(&t); + self.manager_handle + .as_mut() + .unwrap() + .try_send(SubscriptionOp::Inspect(Box::new(move |x| { + *tr.lock().unwrap() = Some(f(x)); + tx.send(()).unwrap(); + }))) + .unwrap(); + rx.recv().unwrap(); + let mut t = t.lock().unwrap(); + t.take().unwrap() + } +} + +fn not_leader(r: u64) -> PbError { + let mut err = PbError::new(); + let mut nl = NotLeader::new(); + nl.set_region_id(r); + err.set_not_leader(nl); + err.set_message( + format!("the region {} isn't in the region_manager of log backup, maybe not leader or not flushed yet.", r)); + err +} + +fn epoch_not_match(id: u64, sent: u64, real: u64) -> PbError { + let mut err = PbError::new(); + let en = EpochNotMatch::new(); + err.set_epoch_not_match(en); + err.set_message(format!( + "the region {} has recorded version {}, but you sent {}", + id, real, sent, + )); + err +} + +#[derive(Debug, PartialEq, Hash, Clone, Copy)] +/// A simple region id, but versioned. +pub struct RegionIdWithVersion { + pub region_id: u64, + pub region_epoch_version: u64, +} + +impl RegionIdWithVersion { + pub fn new(id: u64, version: u64) -> Self { + Self { + region_id: id, + region_epoch_version: version, + } + } +} + +#[derive(Debug, Clone)] +pub struct LastFlushTsOfRegion { + pub region: Region, + pub checkpoint: TimeStamp, +} + +// Allow some type to +#[async_trait::async_trait] +pub trait FlushObserver: Send + 'static { + /// The callback when the flush has advanced the resolver. + async fn before(&mut self, checkpoints: Vec); + /// The callback when the flush is done. 
(Files are fully written to + /// external storage.) + async fn after(&mut self, task: &str, rts: u64) -> Result<()>; + /// The optional callback to rewrite the resolved ts of this flush. + /// Because the default method (collect all leader resolved ts in the store, + /// and use the minimal TS.) may lead to resolved ts rolling back, if we + /// desire a stronger consistency, we can rewrite a safer resolved ts here. + /// Note the new resolved ts cannot be greater than the old resolved ts. + async fn rewrite_resolved_ts( + &mut self, + #[allow(unused_variables)] _task: &str, + ) -> Option { + None + } +} + +pub struct BasicFlushObserver { + pd_cli: Arc, + store_id: u64, +} + +impl BasicFlushObserver { + pub fn new(pd_cli: Arc, store_id: u64) -> Self { + Self { pd_cli, store_id } + } +} + +#[async_trait::async_trait] +impl FlushObserver for BasicFlushObserver { + async fn before(&mut self, _checkpoints: Vec) {} + + async fn after(&mut self, task: &str, rts: u64) -> Result<()> { + if let Err(err) = self + .pd_cli + .update_service_safe_point( + format!("backup-stream-{}-{}", task, self.store_id), + TimeStamp::new(rts.saturating_sub(1)), + // Add a service safe point for 2 hours. + // We make it the same duration as we meet fatal errors because TiKV may be + // SIGKILL'ed after it meets fatal error and before it successfully updated the + // fatal error safepoint. + // TODO: We'd better make the coordinator, who really + // calculates the checkpoint to register service safepoint. + Duration::from_secs(60 * 60 * 2), + ) + .await + { + Error::from(err).report("failed to update service safe point!"); + // don't give up? + } + + // Currently, we only support one task at the same time, + // so use the task as label would be ok. + metrics::STORE_CHECKPOINT_TS + .with_label_values(&[task]) + .set(rts as _); + Ok(()) + } +} + +pub struct CheckpointV3FlushObserver { + /// We should modify the rts (the local rts isn't right.) 
+ /// This should be a BasicFlushObserver or something likewise. + baseline: O, + sched: Scheduler, + meta_cli: MetadataClient, + + checkpoints: Vec, + global_checkpoint_cache: HashMap, + start_time: Instant, +} + +impl CheckpointV3FlushObserver { + pub fn new(sched: Scheduler, meta_cli: MetadataClient, baseline: O) -> Self { + Self { + sched, + meta_cli, + checkpoints: vec![], + // We almost always have only one entry. + global_checkpoint_cache: HashMap::with_capacity(1), + baseline, + start_time: Instant::now(), + } + } +} + +impl CheckpointV3FlushObserver +where + S: MetaStore + 'static, + O: FlushObserver + Send, +{ + async fn get_checkpoint(&mut self, task: &str) -> Result { + let cp = match self.global_checkpoint_cache.get(task) { + Some(cp) => *cp, + None => { + let global_checkpoint = self.meta_cli.global_checkpoint_of_task(task).await?; + self.global_checkpoint_cache + .insert(task.to_owned(), global_checkpoint); + global_checkpoint + } + }; + Ok(cp) + } +} + +#[async_trait::async_trait] +impl FlushObserver for CheckpointV3FlushObserver +where + S: MetaStore + 'static, + O: FlushObserver + Send, +{ + async fn before(&mut self, checkpoints: Vec) { + self.checkpoints = checkpoints; + } + + async fn after(&mut self, task: &str, _rts: u64) -> Result<()> { + let resolve_task = Task::RegionCheckpointsOp(RegionCheckpointOperation::Resolved { + checkpoints: std::mem::take(&mut self.checkpoints), + start_time: self.start_time, + }); + let flush_task = Task::RegionCheckpointsOp(RegionCheckpointOperation::Flush); + try_send!(self.sched, resolve_task); + try_send!(self.sched, flush_task); + + let global_checkpoint = self.get_checkpoint(task).await?; + info!("getting global checkpoint from cache for updating."; "checkpoint" => ?global_checkpoint); + self.baseline + .after(task, global_checkpoint.ts.into_inner()) + .await?; + Ok(()) + } + + async fn rewrite_resolved_ts(&mut self, task: &str) -> Option { + let global_checkpoint = self + .get_checkpoint(task) + .await + 
.map_err(|err| err.report("failed to get resolved ts for rewriting")) + .ok()?; + info!("getting global checkpoint for updating."; "checkpoint" => ?global_checkpoint); + matches!(global_checkpoint.provider, CheckpointProvider::Global) + .then(|| global_checkpoint.ts) + } +} + +#[cfg(test)] +pub mod tests { + use std::{ + assert_matches, + collections::HashMap, + sync::{Arc, Mutex, RwLock}, + time::Duration, + }; + + use futures::{future::ok, Sink}; + use grpcio::{RpcStatus, RpcStatusCode}; + use kvproto::{logbackuppb::SubscribeFlushEventResponse, metapb::*}; + use pd_client::{PdClient, PdFuture}; + use txn_types::TimeStamp; + + use super::{BasicFlushObserver, FlushObserver, RegionIdWithVersion}; + use crate::{ + subscription_track::{CheckpointType, ResolveResult}, + GetCheckpointResult, + }; + + fn region(id: u64, version: u64, conf_version: u64) -> Region { + let mut r = Region::new(); + let mut e = RegionEpoch::new(); + e.set_version(version); + e.set_conf_ver(conf_version); + r.set_id(id); + r.set_region_epoch(e); + r + } + + #[derive(Clone)] + pub struct MockSink(Arc>); + + impl MockSink { + fn with_fail_once(code: RpcStatusCode) -> Self { + let mut failed = false; + let inner = MockSinkInner { + items: Vec::default(), + closed: false, + on_error: Box::new(move || { + if failed { + RpcStatusCode::OK + } else { + failed = true; + code + } + }), + }; + Self(Arc::new(Mutex::new(inner))) + } + + fn trivial() -> Self { + let inner = MockSinkInner { + items: Vec::default(), + closed: false, + on_error: Box::new(|| RpcStatusCode::OK), + }; + Self(Arc::new(Mutex::new(inner))) + } + + pub async fn fail(&self, status: RpcStatus) -> crate::errors::Result<()> { + panic!("failed in a case should never fail: {}", status); + } + } + + struct MockSinkInner { + items: Vec, + closed: bool, + on_error: Box grpcio::RpcStatusCode + Send>, + } + + impl Sink<(SubscribeFlushEventResponse, grpcio::WriteFlags)> for MockSink { + type Error = grpcio::Error; + + fn poll_ready( + self: 
std::pin::Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + Ok(()).into() + } + + fn start_send( + self: std::pin::Pin<&mut Self>, + item: (SubscribeFlushEventResponse, grpcio::WriteFlags), + ) -> Result<(), Self::Error> { + let mut guard = self.0.lock().unwrap(); + let code = (guard.on_error)(); + if code != RpcStatusCode::OK { + return Err(grpcio::Error::RpcFailure(RpcStatus::new(code))); + } + guard.items.push(item.0); + Ok(()) + } + + fn poll_flush( + self: std::pin::Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + Ok(()).into() + } + + fn poll_close( + self: std::pin::Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + let mut guard = self.0.lock().unwrap(); + guard.closed = true; + Ok(()).into() + } + } + + fn simple_resolve_result() -> ResolveResult { + let mut region = Region::new(); + region.set_id(42); + ResolveResult { + region, + checkpoint: 42.into(), + checkpoint_type: CheckpointType::MinTs, + } + } + + #[test] + fn test_rpc_sub() { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(1) + .build() + .unwrap(); + let mut mgr = super::CheckpointManager::default(); + rt.spawn(mgr.spawn_subscription_mgr()); + + let trivial_sink = MockSink::trivial(); + rt.block_on(mgr.add_subscriber(trivial_sink.clone())) + .unwrap(); + + mgr.resolve_regions(vec![simple_resolve_result()]); + mgr.flush(); + mgr.sync_with_subs_mgr(|_| {}); + assert_eq!(trivial_sink.0.lock().unwrap().items.len(), 1); + } + + #[test] + fn test_rpc_failure() { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(1) + .build() + .unwrap(); + let mut mgr = super::CheckpointManager::default(); + rt.spawn(mgr.spawn_subscription_mgr()); + + let error_sink = MockSink::with_fail_once(RpcStatusCode::INTERNAL); + rt.block_on(mgr.add_subscriber(error_sink.clone())).unwrap(); + + mgr.resolve_regions(vec![simple_resolve_result()]); + mgr.flush(); + 
assert_eq!(mgr.sync_with_subs_mgr(|item| { item.subscribers.len() }), 0); + let sink = error_sink.0.lock().unwrap(); + assert_eq!(sink.items.len(), 0); + // The stream shouldn't be closed when exit by a failure. + assert_eq!(sink.closed, false); + } + + #[test] + fn test_flush() { + let mut mgr = super::CheckpointManager::default(); + mgr.do_update(region(1, 32, 8), TimeStamp::new(8)); + mgr.do_update(region(2, 34, 8), TimeStamp::new(15)); + mgr.do_update(region(2, 35, 8), TimeStamp::new(16)); + mgr.do_update(region(2, 35, 8), TimeStamp::new(14)); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::NotFound { .. }); + + mgr.flush(); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok { checkpoint , .. } if checkpoint.into_inner() == 8); + let r = mgr.get_from_region(RegionIdWithVersion::new(2, 35)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok { checkpoint , .. } if checkpoint.into_inner() == 16); + mgr.flush(); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::NotFound { .. }); + } + + #[test] + fn test_mgr() { + let mut mgr = super::CheckpointManager::default(); + mgr.update_region_checkpoint(®ion(1, 32, 8), TimeStamp::new(8)); + mgr.update_region_checkpoint(®ion(2, 34, 8), TimeStamp::new(15)); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 8); + let r = mgr.get_from_region(RegionIdWithVersion::new(2, 33)); + assert_matches::assert_matches!(r, GetCheckpointResult::EpochNotMatch { .. }); + let r = mgr.get_from_region(RegionIdWithVersion::new(3, 44)); + assert_matches::assert_matches!(r, GetCheckpointResult::NotFound { .. 
}); + + mgr.update_region_checkpoint(®ion(1, 30, 8), TimeStamp::new(16)); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 8); + + mgr.update_region_checkpoint(®ion(1, 30, 8), TimeStamp::new(16)); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 8); + mgr.update_region_checkpoint(®ion(1, 32, 8), TimeStamp::new(16)); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 16); + mgr.update_region_checkpoint(®ion(1, 33, 8), TimeStamp::new(24)); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 33)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 24); + } + + pub struct MockPdClient { + safepoint: RwLock>, + } + + impl PdClient for MockPdClient { + fn update_service_safe_point( + &self, + name: String, + safepoint: TimeStamp, + _ttl: Duration, + ) -> PdFuture<()> { + // let _ = self.safepoint.insert(name, safepoint); + self.safepoint.write().unwrap().insert(name, safepoint); + + Box::pin(ok(())) + } + } + + impl MockPdClient { + fn new() -> Self { + Self { + safepoint: RwLock::new(HashMap::default()), + } + } + + fn get_service_safe_point(&self, name: String) -> Option { + self.safepoint.read().unwrap().get(&name).copied() + } + } + + #[tokio::test] + async fn test_after() { + let store_id = 1; + let pd_cli = Arc::new(MockPdClient::new()); + let mut flush_observer = BasicFlushObserver::new(pd_cli.clone(), store_id); + let task = String::from("test"); + let rts = 12345; + + let r = flush_observer.after(&task, rts).await; + assert_eq!(r.is_ok(), true); + + let serivce_id = format!("backup-stream-{}-{}", task, store_id); + let r = 
pd_cli.get_service_safe_point(serivce_id).unwrap(); + assert_eq!(r.into_inner(), rts - 1); + } +} diff --git a/components/backup-stream/src/config.rs b/components/backup-stream/src/config.rs index dfee838c333..03afa47dd97 100644 --- a/components/backup-stream/src/config.rs +++ b/components/backup-stream/src/config.rs @@ -1,26 +1,40 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use online_config::{ConfigChange, ConfigManager}; -use tikv_util::worker::Scheduler; +use std::sync::{Arc, RwLock}; + +use online_config::{ConfigChange, ConfigManager, OnlineConfig}; +use tikv::config::BackupStreamConfig; +use tikv_util::{info, worker::Scheduler}; use crate::endpoint::Task; -pub struct BackupStreamConfigManager(pub Scheduler); +#[derive(Clone)] +pub struct BackupStreamConfigManager { + pub scheduler: Scheduler, + pub config: Arc>, +} + +impl BackupStreamConfigManager { + pub fn new(scheduler: Scheduler, cfg: BackupStreamConfig) -> Self { + let config = Arc::new(RwLock::new(cfg)); + Self { scheduler, config } + } +} impl ConfigManager for BackupStreamConfigManager { fn dispatch( &mut self, change: ConfigChange, ) -> std::result::Result<(), Box> { - self.0.schedule(Task::ChangeConfig(change))?; - Ok(()) - } -} + info!( + "log backup config changed"; + "change" => ?change, + ); + let mut cfg = self.config.as_ref().write().unwrap(); + cfg.update(change)?; + cfg.validate()?; -impl std::ops::Deref for BackupStreamConfigManager { - type Target = Scheduler; - - fn deref(&self) -> &Self::Target { - &self.0 + self.scheduler.schedule(Task::ChangeConfig(cfg.clone()))?; + Ok(()) } } diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 470ee53bb87..c88b36da8db 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -1,11 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - fmt, - marker::PhantomData, - path::PathBuf, - sync::{atomic::Ordering, Arc}, - time::Duration, + collections::HashSet, fmt, marker::PhantomData, path::PathBuf, sync::Arc, time::Duration, }; use concurrency_manager::ConcurrencyManager; @@ -16,20 +12,19 @@ use kvproto::{ brpb::{StreamBackupError, StreamBackupTaskInfo}, metapb::Region, }; -use online_config::ConfigChange; use pd_client::PdClient; -use raft::StateRole; use raftstore::{ coprocessor::{CmdBatch, ObserveHandle, RegionInfoProvider}, - router::RaftStoreRouter, - store::fsm::ChangeObserver, + router::CdcHandle, }; +use resolved_ts::{resolve_by_raft, LeadershipResolver}; use tikv::config::BackupStreamConfig; use tikv_util::{ box_err, config::ReadableDuration, debug, defer, info, - time::Instant, + sys::thread::ThreadBuildWrapper, + time::{Instant, Limiter}, warn, worker::{Runnable, Scheduler}, HandyRwLock, @@ -37,49 +32,72 @@ use tikv_util::{ use tokio::{ io::Result as TokioResult, runtime::{Handle, Runtime}, + sync::oneshot, }; use tokio_stream::StreamExt; use txn_types::TimeStamp; -use yatp::task::callback::Handle as YatpHandle; use super::metrics::HANDLE_EVENT_DURATION_HISTOGRAM; use crate::{ annotate, + checkpoint_manager::{ + BasicFlushObserver, CheckpointManager, CheckpointV3FlushObserver, FlushObserver, + GetCheckpointResult, RegionIdWithVersion, Subscription, + }, errors::{Error, Result}, event_loader::{InitialDataLoader, PendingMemoryQuota}, + future, metadata::{store::MetaStore, MetadataClient, MetadataEvent, StreamTask}, metrics::{self, TaskStatus}, observer::BackupStreamObserver, - router::{ApplyEvents, Router, FLUSH_STORAGE_INTERVAL}, - subscription_track::SubscriptionTracer, + router::{ApplyEvents, Router, TaskSelector}, + subscription_manager::{RegionSubscriptionManager, ResolvedRegions}, + subscription_track::{Ref, RefMut, ResolveResult, SubscriptionTracer}, try_send, - utils::{self, StopWatch}, + utils::{self, CallbackWaitGroup, StopWatch, Work}, }; const SLOW_EVENT_THRESHOLD: 
f64 = 120.0; +/// CHECKPOINT_SAFEPOINT_TTL_IF_ERROR specifies the safe point TTL(24 hour) if +/// task has fatal error. +const CHECKPOINT_SAFEPOINT_TTL_IF_ERROR: u64 = 24; +/// The timeout for tick updating the checkpoint. +/// Generally, it would take ~100ms. +/// 5s would be enough for it. +const TICK_UPDATE_TIMEOUT: Duration = Duration::from_secs(5); pub struct Endpoint { - meta_client: MetadataClient, + // Note: those fields are more like a shared context between components. + // For now, we copied them everywhere, maybe we'd better extract them into a + // context type. + pub(crate) meta_client: MetadataClient, + pub(crate) scheduler: Scheduler, + pub(crate) store_id: u64, + pub(crate) regions: R, + pub(crate) engine: PhantomData, + pub(crate) router: RT, + pub(crate) pd_client: Arc, + pub(crate) subs: SubscriptionTracer, + pub(crate) concurrency_manager: ConcurrencyManager, + range_router: Router, - scheduler: Scheduler, observer: BackupStreamObserver, pool: Runtime, - store_id: u64, - regions: R, - engine: PhantomData, - router: RT, - pd_client: Arc, - subs: SubscriptionTracer, - concurrency_manager: ConcurrencyManager, initial_scan_memory_quota: PendingMemoryQuota, - scan_pool: ScanPool, + initial_scan_throughput_quota: Limiter, + region_operator: RegionSubscriptionManager, + failover_time: Option, + // We holds the config before, even it is useless for now, + // however probably it would be useful in the future. 
+ config: BackupStreamConfig, + checkpoint_mgr: CheckpointManager, } impl Endpoint where R: RegionInfoProvider + 'static + Clone, E: KvEngine, - RT: RaftStoreRouter + 'static, + RT: CdcHandle + 'static, PDC: PdClient + 'static, S: MetaStore + 'static, { @@ -93,17 +111,17 @@ where router: RT, pd_client: Arc, concurrency_manager: ConcurrencyManager, + resolver: BackupStreamResolver, ) -> Self { crate::metrics::STREAM_ENABLED.inc(); - let pool = create_tokio_runtime(config.io_threads, "backup-stream") + let pool = create_tokio_runtime((config.num_threads / 2).max(1), "backup-stream") .expect("failed to create tokio runtime for backup stream worker."); - let scan_pool = create_scan_pool(config.num_threads); let meta_client = MetadataClient::new(store, store_id); let range_router = Router::new( PathBuf::from(config.temp_path.clone()), scheduler.clone(), - config.temp_file_size_limit_per_task.0, + config.file_size_limit.0, config.max_flush_interval.0, ); @@ -122,8 +140,36 @@ where let initial_scan_memory_quota = PendingMemoryQuota::new(config.initial_scan_pending_memory_quota.0 as _); + let limit = if config.initial_scan_rate_limit.0 > 0 { + config.initial_scan_rate_limit.0 as f64 + } else { + f64::INFINITY + }; + let initial_scan_throughput_quota = Limiter::new(limit); info!("the endpoint of stream backup started"; "path" => %config.temp_path); - Endpoint { + let subs = SubscriptionTracer::default(); + + let (region_operator, op_loop) = RegionSubscriptionManager::start( + InitialDataLoader::new( + router.clone(), + accessor.clone(), + range_router.clone(), + subs.clone(), + scheduler.clone(), + initial_scan_memory_quota.clone(), + pool.handle().clone(), + initial_scan_throughput_quota.clone(), + ), + observer.clone(), + meta_client.clone(), + pd_client.clone(), + ((config.num_threads + 1) / 2).max(1), + resolver, + ); + pool.spawn(op_loop); + let mut checkpoint_mgr = CheckpointManager::default(); + pool.spawn(checkpoint_mgr.spawn_subscription_mgr()); + let ep = Endpoint 
{ meta_client, range_router, scheduler, @@ -134,11 +180,17 @@ where engine: PhantomData, router, pd_client, - subs: Default::default(), + subs, concurrency_manager, initial_scan_memory_quota, - scan_pool, - } + initial_scan_throughput_quota, + region_operator, + failover_time: None, + config, + checkpoint_mgr, + }; + ep.pool.spawn(ep.min_ts_worker()); + ep } } @@ -147,59 +199,74 @@ where S: MetaStore + 'static, R: RegionInfoProvider + Clone + 'static, E: KvEngine, - RT: RaftStoreRouter + 'static, + RT: CdcHandle + 'static, PDC: PdClient + 'static, { fn get_meta_client(&self) -> MetadataClient { self.meta_client.clone() } - fn on_fatal_error(&self, task: String, err: Box) { - // Let's pause the task first. - self.unload_task(&task); + fn on_fatal_error(&self, select: TaskSelector, err: Box) { err.report_fatal(); - metrics::update_task_status(TaskStatus::Error, &task); - - let meta_cli = self.get_meta_client(); - let pdc = self.pd_client.clone(); - let store_id = self.store_id; - let sched = self.scheduler.clone(); - let safepoint_name = self.pause_guard_id_for_task(&task); - let safepoint_ttl = self.pause_guard_duration(); - self.pool.block_on(async move { - let err_fut = async { - let safepoint = meta_cli.global_progress_of_task(&task).await?; - pdc.update_service_safe_point( - safepoint_name, - TimeStamp::new(safepoint), - safepoint_ttl, - ) - .await?; - meta_cli.pause(&task).await?; - let mut last_error = StreamBackupError::new(); - last_error.set_error_code(err.error_code().code.to_owned()); - last_error.set_error_message(err.to_string()); - last_error.set_store_id(store_id); - last_error.set_happen_at(TimeStamp::physical_now()); - meta_cli.report_last_error(&task, last_error).await?; - Result::Ok(()) - }; - if let Err(err_report) = err_fut.await { - err_report.report(format_args!("failed to upload error {}", err_report)); - // Let's retry reporting after 5s. 
- tokio::task::spawn(async move { - tokio::time::sleep(Duration::from_secs(5)).await; - try_send!(sched, Task::FatalError(task, err)); - }); - } - }) + let tasks = self + .pool + .block_on(self.range_router.select_task(select.reference())); + warn!("fatal error reporting"; "selector" => ?select, "selected" => ?tasks, "err" => %err); + for task in tasks { + // Let's pause the task first. + self.unload_task(&task); + metrics::update_task_status(TaskStatus::Error, &task); + + let meta_cli = self.get_meta_client(); + let pdc = self.pd_client.clone(); + let store_id = self.store_id; + let sched = self.scheduler.clone(); + let safepoint_name = self.pause_guard_id_for_task(&task); + let safepoint_ttl = self.pause_guard_duration(); + let code = err.error_code().code.to_owned(); + let msg = err.to_string(); + self.pool.block_on(async move { + let err_fut = async { + let safepoint = meta_cli.global_progress_of_task(&task).await?; + pdc.update_service_safe_point( + safepoint_name, + TimeStamp::new(safepoint.saturating_sub(1)), + safepoint_ttl, + ) + .await?; + meta_cli.pause(&task).await?; + let mut last_error = StreamBackupError::new(); + last_error.set_error_code(code); + last_error.set_error_message(msg.clone()); + last_error.set_store_id(store_id); + last_error.set_happen_at(TimeStamp::physical_now()); + meta_cli.report_last_error(&task, last_error).await?; + Result::Ok(()) + }; + if let Err(err_report) = err_fut.await { + err_report.report(format_args!("failed to upload error {}", err_report)); + // Let's retry reporting after 5s. + tokio::task::spawn(async move { + tokio::time::sleep(Duration::from_secs(5)).await; + try_send!( + sched, + Task::FatalError( + TaskSelector::ByName(task.to_owned()), + Box::new(annotate!(err_report, "origin error: {}", msg)) + ) + ); + }); + } + }); + } } async fn starts_flush_ticks(router: Router) { loop { - // check every 15s. 
- // TODO: maybe use global timer handle in the `tikv_utils::timer` (instead of enabling timing in the current runtime)? - tokio::time::sleep(Duration::from_secs(FLUSH_STORAGE_INTERVAL / 20)).await; + // check every 5s. + // TODO: maybe use global timer handle in the `tikv_utils::timer` (instead of + // enabling timing in the current runtime)? + tokio::time::sleep(Duration::from_secs(5)).await; debug!("backup stream trigger flush tick"); router.tick().await; } @@ -210,12 +277,29 @@ where meta_client: MetadataClient, scheduler: Scheduler, ) -> Result<()> { - let tasks = meta_client.get_tasks().await?; + let tasks; + loop { + let r = meta_client.get_tasks().await; + match r { + Ok(t) => { + tasks = t; + break; + } + Err(e) => { + e.report("failed to get backup stream task"); + tokio::time::sleep(Duration::from_secs(5)).await; + continue; + } + } + } + for task in tasks.inner { info!("backup stream watch task"; "task" => ?task); if task.is_paused { continue; } + // We have meet task upon store start, we must in a failover. 
+ scheduler.schedule(Task::MarkFailover(Instant::now()))?; // move task to schedule scheduler.schedule(Task::WatchTask(TaskOp::AddTask(task)))?; } @@ -252,19 +336,21 @@ where let mut watcher = match watcher { Ok(w) => w, Err(e) => { - e.report("failed to start watch pause"); + e.report("failed to start watch task"); tokio::time::sleep(Duration::from_secs(5)).await; continue; } }; + info!("start watching the task changes."; "from_rev" => %revision_new); loop { if let Some(event) = watcher.stream.next().await { - info!("backup stream watch event from etcd"; "event" => ?event); + info!("backup stream watch task from etcd"; "event" => ?event); let revision = meta_client.get_reversion().await; if let Ok(r) = revision { revision_new = r; + info!("update the revision"; "revision" => revision_new); } match event { @@ -279,7 +365,7 @@ where tokio::time::sleep(Duration::from_secs(2)).await; break; } - _ => panic!("BUG: invalid event {:?}", event), + _ => warn!("BUG: invalid event"; "event" => ?event), } } else { tokio::time::sleep(Duration::from_secs(1)).await; @@ -306,13 +392,15 @@ where continue; } }; + info!("start watching the pausing events."; "from_rev" => %revision_new); loop { if let Some(event) = watcher.stream.next().await { - info!("backup stream watch event from etcd"; "event" => ?event); + info!("backup stream watch pause from etcd"; "event" => ?event); let revision = meta_client.get_reversion().await; if let Ok(r) = revision { revision_new = r; + info!("update the revision"; "revision" => revision_new); } match event { @@ -327,7 +415,7 @@ where tokio::time::sleep(Duration::from_secs(2)).await; break; } - _ => panic!("BUG: invalid event {:?}", event), + _ => warn!("BUG: invalid event"; "event" => ?event), } } else { tokio::time::sleep(Duration::from_secs(1)).await; @@ -337,7 +425,13 @@ where } } - /// Convert a batch of events to the cmd batch, and update the resolver status. 
+ fn flush_observer(&self) -> impl FlushObserver { + let basic = BasicFlushObserver::new(self.pd_client.clone(), self.store_id); + CheckpointV3FlushObserver::new(self.scheduler.clone(), self.meta_client.clone(), basic) + } + + /// Convert a batch of events to the cmd batch, and update the resolver + /// status. fn record_batch(subs: SubscriptionTracer, batch: CmdBatch) -> Option { let region_id = batch.region_id; let mut resolver = match subs.get_subscription_of(region_id) { @@ -347,7 +441,9 @@ where return None; } }; - // Stale data is accpetable, while stale locks may block the checkpoint advancing. + // Stale data is acceptable, while stale locks may block the checkpoint + // advancing. + // ```text // Let L be the instant some key locked, U be the instant it unlocked, // +---------*-------L-----------U--*-------------+ // ^ ^----(1)----^ ^ We get the snapshot for initial scanning at here. @@ -356,6 +452,7 @@ where // ...note that (1) is the last cmd batch of first observing, so the unlock event would never be sent to us. 
// ...then the lock would get an eternal life in the resolver :| // (Before we refreshing the resolver for this region again) + // ``` if batch.pitr_id != resolver.value().handle.id { debug!("stale command"; "region_id" => %region_id, "now" => ?resolver.value().handle.id, "remote" => ?batch.pitr_id); return None; @@ -365,8 +462,8 @@ where Some(kvs) } - fn backup_batch(&self, batch: CmdBatch) { - let mut sw = StopWatch::new(); + fn backup_batch(&self, batch: CmdBatch, work: Work) { + let mut sw = StopWatch::by_now(); let router = self.range_router.clone(); let sched = self.scheduler.clone(); @@ -395,7 +492,8 @@ where } HANDLE_EVENT_DURATION_HISTOGRAM .with_label_values(&["save_to_temp_file"]) - .observe(time_cost) + .observe(time_cost); + drop(work) }); } @@ -409,6 +507,7 @@ where self.scheduler.clone(), self.initial_scan_memory_quota.clone(), self.pool.handle().clone(), + self.initial_scan_throughput_quota.clone(), ) } @@ -449,20 +548,22 @@ where "end_key" => utils::redact(&end_key), ); } - self.spawn_at_scan_pool(move || { - let range_init_result = init.initialize_range(start_key.clone(), end_key.clone()); - match range_init_result { - Ok(()) => { - info!("backup stream success to initialize"; + // Assuming the `region info provider` would read region info form `StoreMeta` + // directly and this would be fast. If this gets slow, maybe make it async + // again. (Will that bring race conditions? say `Start` handled after + // `ResfreshResolver` of some region.) 
+ let range_init_result = init.initialize_range(start_key.clone(), end_key.clone()); + match range_init_result { + Ok(()) => { + info!("backup stream success to initialize"; "start_key" => utils::redact(&start_key), "end_key" => utils::redact(&end_key), "take" => ?start.saturating_elapsed(),) - } - Err(e) => { - e.report("backup stream initialize failed"); - } } - }); + Err(e) => { + e.report("backup stream initialize failed"); + } + } Ok(()) } @@ -502,50 +603,43 @@ where }), ); self.pool.block_on(async move { - let task_name = task.info.get_name(); - match cli.ranges_of_task(task_name).await { - Ok(ranges) => { - info!( - "register backup stream ranges"; - "task" => ?task, - "ranges-count" => ranges.inner.len(), - ); - let ranges = ranges - .inner - .into_iter() - .map(|(start_key, end_key)| { - (utils::wrap_key(start_key), utils::wrap_key(end_key)) - }) - .collect::>(); - if let Err(err) = range_router - .register_task(task.clone(), ranges.clone()) - .await - { - err.report(format!( - "failed to register backup stream task {}", - task.info.name - )); - return; - } - - for (start_key, end_key) in ranges { - let init = init.clone(); - - self.observe_and_scan_region(init, &task, start_key, end_key) - .await - .unwrap(); - } - info!( - "finish register backup stream ranges"; - "task" => ?task, - ); - } - Err(e) => { - e.report(format!( - "failed to register backup stream task {} to router: ranges not found", - task.info.get_name() - )); + let task_clone = task.clone(); + let run = async move { + let task_name = task.info.get_name(); + let ranges = cli.ranges_of_task(task_name).await?; + info!( + "register backup stream ranges"; + "task" => ?task, + "ranges-count" => ranges.inner.len(), + ); + let ranges = ranges + .inner + .into_iter() + .map(|(start_key, end_key)| { + (utils::wrap_key(start_key), utils::wrap_key(end_key)) + }) + .collect::>(); + range_router + .register_task(task.clone(), ranges.clone(), self.config.file_size_limit.0) + .await?; + + for (start_key, 
end_key) in ranges { + let init = init.clone(); + + self.observe_and_scan_region(init, &task, start_key, end_key) + .await? } + info!( + "finish register backup stream ranges"; + "task" => ?task, + ); + Result::Ok(()) + }; + if let Err(e) = run.await { + e.report(format!( + "failed to register backup stream task {} to router: ranges not found", + task_clone.info.get_name() + )); } }); metrics::update_task_status(TaskStatus::Running, &task_name); @@ -556,7 +650,7 @@ where } fn pause_guard_duration(&self) -> Duration { - ReadableDuration::hours(24).0 + ReadableDuration::hours(CHECKPOINT_SAFEPOINT_TTL_IF_ERROR).0 } pub fn on_pause(&self, task: &str) { @@ -587,15 +681,24 @@ where pub fn on_unregister(&self, task: &str) -> Option { let info = self.unload_task(task); - - // reset the checkpoint ts of the task so it won't mislead the metrics. - metrics::STORE_CHECKPOINT_TS - .with_label_values(&[task]) - .set(0); + self.remove_metrics_after_unregister(task); info } - /// unload a task from memory: this would stop observe the changes required by the task temporarily. + fn remove_metrics_after_unregister(&self, task: &str) { + // remove metrics of the task so it won't mislead the metrics. + let _ = metrics::STORE_CHECKPOINT_TS + .remove_label_values(&[task]) + .map_err( + |err| info!("failed to remove checkpoint ts metric"; "task" => task, "err" => %err), + ); + let _ = metrics::remove_task_status_metric(task).map_err( + |err| info!("failed to remove checkpoint ts metric"; "task" => task, "err" => %err), + ); + } + + /// unload a task from memory: this would stop observe the changes required + /// by the task temporarily. fn unload_task(&self, task: &str) -> Option { let router = self.range_router.clone(); @@ -606,357 +709,176 @@ where self.pool.block_on(router.unregister_task(task)) } - /// try advance the resolved ts by the pd tso. 
- async fn try_resolve( - cm: &ConcurrencyManager, - pd_client: Arc, - resolvers: SubscriptionTracer, - ) -> TimeStamp { - let pd_tso = pd_client - .get_tso() - .await - .map_err(|err| Error::from(err).report("failed to get tso from pd")) - .unwrap_or_default(); - cm.update_max_ts(pd_tso); - let min_ts = cm.global_min_lock_ts().unwrap_or(TimeStamp::max()); - let tso = Ord::min(pd_tso, min_ts); - let ts = resolvers.resolve_with(tso); - resolvers.warn_if_gap_too_huge(ts); - ts - } - - async fn flush_for_task( - task: String, - store_id: u64, - router: Router, - pd_cli: Arc, - resolvers: SubscriptionTracer, - meta_cli: MetadataClient, - concurrency_manager: ConcurrencyManager, - ) { - let start = Instant::now_coarse(); - // NOTE: Maybe push down the resolve step to the router? - // Or if there are too many duplicated `Flush` command, we may do some useless works. - let new_rts = Self::try_resolve(&concurrency_manager, pd_cli.clone(), resolvers).await; - #[cfg(feature = "failpoints")] - fail::fail_point!("delay_on_flush"); - metrics::FLUSH_DURATION - .with_label_values(&["resolve_by_now"]) - .observe(start.saturating_elapsed_secs()); - if let Some(rts) = router.do_flush(&task, store_id, new_rts).await { - info!("flushing and refreshing checkpoint ts."; - "checkpoint_ts" => %rts, - "task" => %task, - ); - if rts == 0 { - // We cannot advance the resolved ts for now. - return; - } - let in_flight = crate::observer::IN_FLIGHT_START_OBSERVE_MESSAGE.load(Ordering::SeqCst); - if in_flight > 0 { - warn!("inflight leader detected, skipping advancing resolved ts"; "in_flight" => %in_flight); - return; - } - if let Err(err) = pd_cli - .update_service_safe_point( - format!("backup-stream-{}-{}", task, store_id), - TimeStamp::new(rts), - // Add a service safe point for 30 mins (6x the default flush interval). - // It would probably be safe. 
- Duration::from_secs(1800), - ) + fn prepare_min_ts(&self) -> future![TimeStamp] { + let pd_cli = self.pd_client.clone(); + let cm = self.concurrency_manager.clone(); + async move { + let pd_tso = pd_cli + .get_tso() .await - { - Error::from(err).report("failed to update service safe point!"); - // don't give up? - } - if let Err(err) = meta_cli.step_task(&task, rts).await { - err.report(format!("on flushing task {}", task)); - // we can advance the progress at next time. - // return early so we won't be mislead by the metrics. - return; - } - metrics::STORE_CHECKPOINT_TS - // Currently, we only support one task at the same time, - // so use the task as label would be ok. - .with_label_values(&[task.as_str()]) - .set(rts as _) + .map_err(|err| Error::from(err).report("failed to get tso from pd")) + .unwrap_or_default(); + cm.update_max_ts(pd_tso); + let min_ts = cm.global_min_lock_ts().unwrap_or(TimeStamp::max()); + Ord::min(pd_tso, min_ts) } } - pub fn on_force_flush(&self, task: String, store_id: u64) { - let router = self.range_router.clone(); - let cli = self.meta_client.clone(); - let pd_cli = self.pd_client.clone(); - let resolvers = self.subs.clone(); - let cm = self.concurrency_manager.clone(); - self.pool.spawn(async move { - let info = router.get_task_info(&task).await; - // This should only happen in testing, it would be to unwrap... 
- let _ = info.unwrap().set_flushing_status_cas(false, true); - Self::flush_for_task(task, store_id, router, pd_cli, resolvers, cli, cm).await; - }); + fn get_resolved_regions(&self, min_ts: TimeStamp) -> future![Result] { + let (tx, rx) = oneshot::channel(); + let op = self.region_operator.clone(); + async move { + let req = ObserveOp::ResolveRegions { + callback: Box::new(move |rs| { + let _ = tx.send(rs); + }), + min_ts, + }; + op.request(req).await; + rx.await + .map_err(|err| annotate!(err, "failed to send request for resolve regions")) + } } - pub fn on_flush(&self, task: String, store_id: u64) { + fn do_flush(&self, task: String, min_ts: TimeStamp) -> future![Result<()>] { + let get_rts = self.get_resolved_regions(min_ts); let router = self.range_router.clone(); - let cli = self.meta_client.clone(); - let pd_cli = self.pd_client.clone(); - let resolvers = self.subs.clone(); - let cm = self.concurrency_manager.clone(); - self.pool.spawn(Self::flush_for_task( - task, store_id, router, pd_cli, resolvers, cli, cm, - )); - } - - /// Start observe over some region. - /// This would modify some internal state, and delegate the task to InitialLoader::observe_over. 
- fn observe_over(&self, region: &Region, handle: ObserveHandle) -> Result<()> { - let init = self.make_initial_loader(); - let region_id = region.get_id(); - self.subs.register_region(region, handle.clone(), None); - init.observe_over_with_retry(region, || { - ChangeObserver::from_pitr(region_id, handle.clone()) - })?; - Ok(()) - } - - fn observe_over_with_initial_data_from_checkpoint( - &self, - region: &Region, - task: String, - handle: ObserveHandle, - ) -> Result<()> { - let init = self.make_initial_loader(); - - let meta_cli = self.meta_client.clone(); - let last_checkpoint = TimeStamp::new( - self.pool - .block_on(meta_cli.global_progress_of_task(&task))?, - ); - self.subs - .register_region(region, handle.clone(), Some(last_checkpoint)); - - let region_id = region.get_id(); - let snap = init.observe_over_with_retry(region, move || { - ChangeObserver::from_pitr(region_id, handle.clone()) - })?; - let region = region.clone(); - - // we should not spawn initial scanning tasks to the tokio blocking pool - // beacuse it is also used for converting sync File I/O to async. (for now!) - // In that condition, if we blocking for some resouces(for example, the `MemoryQuota`) - // at the block threads, we may meet some ghosty deadlock. 
- self.spawn_at_scan_pool(move || { - let begin = Instant::now_coarse(); - match init.do_initial_scan(®ion, last_checkpoint, snap) { - Ok(stat) => { - info!("initial scanning of leader transforming finished!"; "takes" => ?begin.saturating_elapsed(), "region" => %region.get_id(), "from_ts" => %last_checkpoint); - utils::record_cf_stat("lock", &stat.lock); - utils::record_cf_stat("write", &stat.write); - utils::record_cf_stat("default", &stat.data); + let store_id = self.store_id; + let mut flush_ob = self.flush_observer(); + async move { + let mut resolved = get_rts.await?; + let mut new_rts = resolved.global_checkpoint(); + fail::fail_point!("delay_on_flush"); + flush_ob.before(resolved.take_resolve_result()).await; + if let Some(rewritten_rts) = flush_ob.rewrite_resolved_ts(&task).await { + info!("rewriting resolved ts"; "old" => %new_rts, "new" => %rewritten_rts); + new_rts = rewritten_rts.min(new_rts); + } + if let Some(rts) = router.do_flush(&task, store_id, new_rts).await { + info!("flushing and refreshing checkpoint ts."; + "checkpoint_ts" => %rts, + "task" => %task, + ); + if rts == 0 { + // We cannot advance the resolved ts for now. + return Ok(()); } - Err(err) => err.report(format!("during initial scanning of region {:?}", region)), + flush_ob.after(&task, rts).await? } + Ok(()) + } + } + + pub fn on_force_flush(&self, task: String) { + self.pool.block_on(async move { + let info = self.range_router.get_task_info(&task).await; + // This should only happen in testing, it would be to unwrap... + let _ = info.unwrap().set_flushing_status_cas(false, true); + let mts = self.prepare_min_ts().await; + try_send!(self.scheduler, Task::FlushWithMinTs(task, mts)); }); - Ok(()) } - // spawn a task at the scan pool. 
- fn spawn_at_scan_pool(&self, task: impl FnOnce() + Send + 'static) { - self.scan_pool.spawn(move |_: &mut YatpHandle<'_>| { - tikv_alloc::add_thread_memory_accessor(); - let _io_guard = file_system::WithIOType::new(file_system::IOType::Replication); - task(); - tikv_alloc::remove_thread_memory_accessor(); + pub fn on_flush(&self, task: String) { + self.pool.block_on(async move { + let mts = self.prepare_min_ts().await; + info!("min_ts prepared for flushing"; "min_ts" => %mts); + try_send!(self.scheduler, Task::FlushWithMinTs(task, mts)); }) } - fn find_task_by_region(&self, r: &Region) -> Option { - self.range_router - .find_task_by_range(&r.start_key, &r.end_key) + fn on_flush_with_min_ts(&self, task: String, min_ts: TimeStamp) { + self.pool.spawn(self.do_flush(task, min_ts).map(|r| { + if let Err(err) = r { + err.report("during updating flush status") + } + })); } - /// Modify observe over some region. - /// This would register the region to the RaftStore. - pub fn on_modify_observe(&self, op: ObserveOp) { - info!("backup stream: on_modify_observe"; "op" => ?op); - match op { - ObserveOp::Start { - region, - needs_initial_scanning, - } => { - #[cfg(feature = "failpoints")] - fail::fail_point!("delay_on_start_observe"); - self.start_observe(region, needs_initial_scanning); - metrics::INITIAL_SCAN_REASON - .with_label_values(&["leader-changed"]) - .inc(); - crate::observer::IN_FLIGHT_START_OBSERVE_MESSAGE.fetch_sub(1, Ordering::SeqCst); - } - ObserveOp::Stop { ref region } => { - self.subs.deregister_region(region, |_, _| true); - } - ObserveOp::CheckEpochAndStop { ref region } => { - self.subs.deregister_region(region, |old, new| { - raftstore::store::util::compare_region_epoch( - old.meta.get_region_epoch(), - new, - true, - true, - false, - ) - .map_err(|err| warn!("check epoch and stop failed."; "err" => %err)) - .is_ok() - }); + fn update_global_checkpoint(&self, task: String) -> future![()] { + let meta_client = self.meta_client.clone(); + let router = 
self.range_router.clone(); + let store_id = self.store_id; + async move { + #[cfg(feature = "failpoints")] + { + // fail-rs doesn't support async code blocks now. + // let's borrow the feature name and do it ourselves :3 + if std::env::var("LOG_BACKUP_UGC_SLEEP_AND_RETURN").is_ok() { + tokio::time::sleep(Duration::from_secs(100)).await; + return; + } } - ObserveOp::RefreshResolver { ref region } => { - let need_refresh_all = !self.subs.try_update_region(region); - - if need_refresh_all { - let canceled = self.subs.deregister_region(region, |_, _| true); - let handle = ObserveHandle::new(); - if canceled { - let for_task = self.find_task_by_region(region).unwrap_or_else(|| { - panic!( - "BUG: the region {:?} is register to no task but being observed", - region - ) - }); - metrics::INITIAL_SCAN_REASON - .with_label_values(&["region-changed"]) - .inc(); - if let Err(e) = self.observe_over_with_initial_data_from_checkpoint( - region, - for_task, - handle.clone(), - ) { - try_send!( - self.scheduler, - Task::ModifyObserve(ObserveOp::NotifyFailToStartObserve { - region: region.clone(), - handle, - err: Box::new(e) - }) + let ts = meta_client.global_progress_of_task(&task).await; + match ts { + Ok(global_checkpoint) => { + let r = router + .update_global_checkpoint(&task, global_checkpoint, store_id) + .await; + match r { + Ok(true) => { + if let Err(err) = meta_client + .set_storage_checkpoint(&task, global_checkpoint) + .await + { + warn!("backup stream failed to set global checkpoint."; + "task" => ?task, + "global-checkpoint" => global_checkpoint, + "err" => ?err, + ); + } + } + Ok(false) => { + debug!("backup stream no need update global checkpoint."; + "task" => ?task, + "global-checkpoint" => global_checkpoint, + ); + } + Err(e) => { + warn!("backup stream failed to update global checkpoint."; + "task" => ?task, + "err" => ?e ); } } } - } - ObserveOp::NotifyFailToStartObserve { - region, - handle, - err, - } => { - info!("retry observe region"; "region" => 
%region.get_id(), "err" => %err); - // No need for retrying observe canceled. - if err.error_code() == error_code::backup_stream::OBSERVE_CANCELED { - return; - } - match self.retry_observe(region, handle) { - Ok(()) => {} - Err(e) => { - try_send!( - self.scheduler, - Task::FatalError( - format!("While retring to observe region, origin error is {}", err), - Box::new(e) - ) - ); - } + Err(e) => { + warn!("backup stream failed to get global checkpoint."; + "task" => ?task, + "err" => ?e + ); } } } } - fn start_observe(&self, region: Region, needs_initial_scanning: bool) { - let handle = ObserveHandle::new(); - let result = if needs_initial_scanning { - match self.find_task_by_region(&region) { - None => { - warn!( - "the region {:?} is register to no task but being observed (start_key = {}; end_key = {}; task_stat = {:?}): maybe stale, aborting", - region, - utils::redact(&region.get_start_key()), - utils::redact(&region.get_end_key()), - self.range_router - ); - return; - } - - Some(for_task) => self.observe_over_with_initial_data_from_checkpoint( - &region, - for_task, - handle.clone(), - ), - } - } else { - self.observe_over(&region, handle.clone()) - }; + fn on_update_global_checkpoint(&self, task: String) { + let _guard = self.pool.handle().enter(); + let result = self.pool.block_on(tokio::time::timeout( + TICK_UPDATE_TIMEOUT, + self.update_global_checkpoint(task), + )); if let Err(err) = result { - try_send!( - self.scheduler, - Task::ModifyObserve(ObserveOp::NotifyFailToStartObserve { - region, - handle, - err: Box::new(err) - }) - ); + warn!("log backup update global checkpoint timed out"; "err" => %err) } } - fn retry_observe(&self, region: Region, handle: ObserveHandle) -> Result<()> { - let (tx, rx) = crossbeam::channel::bounded(1); - self.regions - .find_region_by_id( - region.get_id(), - Box::new(move |item| { - tx.send(item) - .expect("BUG: failed to send to newly created channel."); - }), - ) - .map_err(|err| { - annotate!( - err, - "failed to send request to region 
info accessor, server maybe too too too busy. (region id = {})", - region.get_id() - ) - })?; - let new_region_info = rx - .recv() - .map_err(|err| annotate!(err, "BUG?: unexpected channel message dropped."))?; - if new_region_info.is_none() { - metrics::SKIP_RETRY - .with_label_values(&["region-absent"]) - .inc(); - return Ok(()); - } - let new_region_info = new_region_info.unwrap(); - if new_region_info.role != StateRole::Leader { - metrics::SKIP_RETRY.with_label_values(&["not-leader"]).inc(); - return Ok(()); - } - let removed = self.subs.deregister_region(&region, |old, _| { - let should_remove = old.handle().id == handle.id; - if !should_remove { - warn!("stale retry command"; "region" => ?region, "handle" => ?handle, "old_handle" => ?old.handle()); - } - should_remove - }); - if !removed { - metrics::SKIP_RETRY - .with_label_values(&["stale-command"]) - .inc(); - return Ok(()); - } - metrics::INITIAL_SCAN_REASON - .with_label_values(&["retry"]) - .inc(); - self.start_observe(region, true); - Ok(()) + fn on_update_change_config(&mut self, cfg: BackupStreamConfig) { + info!( + "update log backup config"; + "config" => ?cfg, + ); + self.range_router.udpate_config(&cfg); + self.config = cfg; } - pub fn run_task(&self, task: Task) { + /// Modify observe over some region. + /// This would register the region to the RaftStore. 
+ pub fn on_modify_observe(&self, op: ObserveOp) { + self.pool.block_on(self.region_operator.request(op)); + } + + pub fn run_task(&mut self, task: Task) { debug!("run backup stream task"; "task" => ?task, "store_id" => %self.store_id); let now = Instant::now_coarse(); let label = task.label(); @@ -967,12 +889,12 @@ where match task { Task::WatchTask(op) => self.handle_watch_task(op), Task::BatchEvent(events) => self.do_backup(events), - Task::Flush(task) => self.on_flush(task, self.store_id), + Task::Flush(task) => self.on_flush(task), Task::ModifyObserve(op) => self.on_modify_observe(op), - Task::ForceFlush(task) => self.on_force_flush(task, self.store_id), + Task::ForceFlush(task) => self.on_force_flush(task), Task::FatalError(task, err) => self.on_fatal_error(task, err), - Task::ChangeConfig(_) => { - warn!("change config online isn't supported for now.") + Task::ChangeConfig(cfg) => { + self.on_update_change_config(cfg); } Task::Sync(cb, mut cond) => { if cond(&self.range_router) { @@ -985,28 +907,117 @@ where }); } } + Task::MarkFailover(t) => self.failover_time = Some(t), + Task::FlushWithMinTs(task, min_ts) => self.on_flush_with_min_ts(task, min_ts), + Task::RegionCheckpointsOp(s) => self.handle_region_checkpoints_op(s), + Task::UpdateGlobalCheckpoint(task) => self.on_update_global_checkpoint(task), } } - pub fn do_backup(&self, events: Vec) { - for batch in events { - self.backup_batch(batch) + fn min_ts_worker(&self) -> future![()] { + let sched = self.scheduler.clone(); + let interval = self.config.min_ts_interval.0; + async move { + loop { + tokio::time::sleep(interval).await; + try_send!( + sched, + Task::RegionCheckpointsOp(RegionCheckpointOperation::PrepareMinTsForResolve) + ); + } } } -} -type ScanPool = yatp::ThreadPool; + pub fn handle_region_checkpoints_op(&mut self, op: RegionCheckpointOperation) { + match op { + RegionCheckpointOperation::Resolved { + checkpoints, + start_time, + } => { + self.checkpoint_mgr.resolve_regions(checkpoints); + 
metrics::MIN_TS_RESOLVE_DURATION.observe(start_time.saturating_elapsed_secs()); + } + RegionCheckpointOperation::Flush => { + self.checkpoint_mgr.flush(); + } + RegionCheckpointOperation::Get(g, cb) => { + let _guard = self.pool.handle().enter(); + match g { + RegionSet::Universal => cb(self + .checkpoint_mgr + .get_all() + .into_iter() + .map(|c| GetCheckpointResult::ok(c.region.clone(), c.checkpoint)) + .collect()), + RegionSet::Regions(rs) => cb(rs + .iter() + .map(|(id, version)| { + self.checkpoint_mgr + .get_from_region(RegionIdWithVersion::new(*id, *version)) + }) + .collect()), + } + } + RegionCheckpointOperation::Subscribe(sub) => { + let fut = self.checkpoint_mgr.add_subscriber(sub); + self.pool.spawn(async move { + if let Err(err) = fut.await { + err.report("adding subscription"); + } + }); + } + RegionCheckpointOperation::PrepareMinTsForResolve => { + if self.observer.is_hibernating() { + metrics::MISC_EVENTS.skip_resolve_no_subscription.inc(); + return; + } + let min_ts = self.pool.block_on(self.prepare_min_ts()); + let start_time = Instant::now(); + // We need to reschedule the `Resolve` task to queue, because the subscription + // is asynchronous -- there may be transactions committed before + // the min_ts we prepared but haven't been observed yet. + try_send!( + self.scheduler, + Task::RegionCheckpointsOp(RegionCheckpointOperation::Resolve { + min_ts, + start_time + }) + ); + } + RegionCheckpointOperation::Resolve { min_ts, start_time } => { + let sched = self.scheduler.clone(); + try_send!( + self.scheduler, + Task::ModifyObserve(ObserveOp::ResolveRegions { + callback: Box::new(move |mut resolved| { + let t = + Task::RegionCheckpointsOp(RegionCheckpointOperation::Resolved { + checkpoints: resolved.take_resolve_result(), + start_time, + }); + try_send!(sched, t); + }), + min_ts + }) + ); + } + } + } -/// Create a yatp pool for doing initial scanning. 
-fn create_scan_pool(num_threads: usize) -> ScanPool { - yatp::Builder::new("log-backup-scan") - .max_thread_count(num_threads) - .build_callback_pool() + pub fn do_backup(&self, events: Vec) { + let wg = CallbackWaitGroup::new(); + for batch in events { + self.backup_batch(batch, wg.clone().work()); + } + self.pool.block_on(wg.wait()) + } } /// Create a standard tokio runtime /// (which allows io and time reactor, involve thread memory accessor), fn create_tokio_runtime(thread_count: usize, thread_name: &str) -> TokioResult { + info!("create tokio runtime for backup stream"; "thread_name" => thread_name, "thread-count" => thread_count); + tokio::runtime::Builder::new_multi_thread() .thread_name(thread_name) // Maybe make it more configurable? @@ -1016,36 +1027,114 @@ fn create_tokio_runtime(thread_count: usize, thread_name: &str) -> TokioResult { + // for raftstore-v1, we use LeadershipResolver to check leadership of a region. + V1(LeadershipResolver), + // for raftstore-v2, it has less regions. we use CDCHandler to check leadership of a region. + V2(RT, PhantomData), +} + +impl BackupStreamResolver +where + RT: CdcHandle + 'static, + EK: KvEngine, +{ + pub async fn resolve(&mut self, regions: Vec, min_ts: TimeStamp) -> Vec { + match self { + BackupStreamResolver::V1(x) => x.resolve(regions, min_ts).await, + BackupStreamResolver::V2(x, _) => { + let x = x.clone(); + resolve_by_raft(regions, min_ts, x).await + } + } + } +} + +#[derive(Debug)] +pub enum RegionSet { + /// The universal set. + Universal, + /// A subset. 
+ Regions(HashSet<(u64, u64)>), +} + +pub enum RegionCheckpointOperation { + Flush, + PrepareMinTsForResolve, + Resolve { + min_ts: TimeStamp, + start_time: Instant, + }, + Resolved { + checkpoints: Vec, + start_time: Instant, + }, + Get(RegionSet, Box) + Send>), + Subscribe(Subscription), +} + +impl fmt::Debug for RegionCheckpointOperation { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Flush => f.debug_tuple("Flush").finish(), + Self::Get(arg0, _) => f.debug_tuple("Get").field(arg0).finish(), + + Self::Subscribe(_) => f.debug_tuple("Subscription").finish(), + Self::Resolved { checkpoints, .. } => { + f.debug_tuple("Resolved").field(checkpoints).finish() + } + Self::PrepareMinTsForResolve => f.debug_tuple("PrepareMinTsForResolve").finish(), + Self::Resolve { min_ts, .. } => { + f.debug_struct("Resolve").field("min_ts", min_ts).finish() + } + } + } +} + pub enum Task { WatchTask(TaskOp), BatchEvent(Vec), - ChangeConfig(ConfigChange), - /// Flush the task with name. - Flush(String), + ChangeConfig(BackupStreamConfig), /// Change the observe status of some region. ModifyObserve(ObserveOp), /// Convert status of some task into `flushing` and do flush then. ForceFlush(String), /// FatalError pauses the task and set the error. - FatalError(String, Box), + FatalError(TaskSelector, Box), /// Run the callback when see this message. Only for test usage. - /// NOTE: Those messages for testing are not guared by `#[cfg(test)]` for now, because - /// the integration test would not enable test config when compiling (why?) + /// NOTE: Those messages for testing are not guarded by `#[cfg(test)]` for + /// now, because the integration test would not enable test config when + /// compiling (why?) Sync( // Run the closure if ... Box, // This returns `true`. Box bool + Send>, ), + /// Mark the store as a failover store. + /// This would prevent store from updating its checkpoint ts for a while. 
+ /// Because we are not sure whether the regions in the store have new leader + /// -- we keep a safe checkpoint so they can choose a safe `from_ts` for + /// initial scanning. + MarkFailover(Instant), + /// Flush the task with name. + Flush(String), + /// Execute the flush with the calculated `min_ts`. + /// This is an internal command only issued by the `Flush` task. + FlushWithMinTs(String, TimeStamp), + /// The command for getting region checkpoints. + RegionCheckpointsOp(RegionCheckpointOperation), + /// update global-checkpoint-ts to storage. + UpdateGlobalCheckpoint(String), } #[derive(Debug)] @@ -1056,19 +1145,21 @@ pub enum TaskOp { ResumeTask(String), } -#[derive(Debug)] +/// The callback for resolving region. +type ResolveRegionsCallback = Box; + pub enum ObserveOp { Start { region: Region, - // if `true`, would scan and sink change from the global checkpoint ts. - // Note: maybe we'd better make it Option to make it more generic, - // but that needs the `observer` know where the checkpoint is, which is a little dirty... - needs_initial_scanning: bool, }, Stop { region: Region, }, - CheckEpochAndStop { + /// Destroy the region subscription. + /// Unlike `Stop`, this will assume the region would never go back. + /// For now, the effect of "never go back" is that we won't try to hint + /// other store the checkpoint ts of this region. 
+ Destroy { region: Region, }, RefreshResolver { @@ -1078,9 +1169,54 @@ pub enum ObserveOp { region: Region, handle: ObserveHandle, err: Box, + has_failed_for: u8, + }, + ResolveRegions { + callback: ResolveRegionsCallback, + min_ts: TimeStamp, }, } +impl std::fmt::Debug for ObserveOp { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Start { region } => f + .debug_struct("Start") + .field("region", &utils::debug_region(region)) + .finish(), + Self::Stop { region } => f + .debug_struct("Stop") + .field("region", &utils::debug_region(region)) + .finish(), + Self::Destroy { region } => f + .debug_struct("Destroy") + .field("region", &utils::debug_region(region)) + .finish(), + Self::RefreshResolver { region } => f + .debug_struct("RefreshResolver") + .field("region", &utils::debug_region(region)) + .finish(), + Self::NotifyFailToStartObserve { + region, + handle, + err, + has_failed_for, + } => f + .debug_struct("NotifyFailToStartObserve") + .field("region", &utils::debug_region(region)) + .field("handle", handle) + .field("err", err) + .field("has_failed_for", has_failed_for) + .finish(), + Self::ResolveRegions { min_ts, .. } => f + .debug_struct("ResolveRegions") + .field("min_ts", min_ts) + .field("callback", &format_args!("fn {{ .. }}")) + .finish(), + } + } +} + impl fmt::Debug for Task { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { @@ -1097,6 +1233,19 @@ impl fmt::Debug for Task { f.debug_tuple("FatalError").field(task).field(err).finish() } Self::Sync(..) 
=> f.debug_tuple("Sync").finish(), + Self::MarkFailover(t) => f + .debug_tuple("MarkFailover") + .field(&format_args!("{:?} ago", t.saturating_elapsed())) + .finish(), + Self::FlushWithMinTs(arg0, arg1) => f + .debug_tuple("FlushWithMinTs") + .field(arg0) + .field(arg1) + .finish(), + Self::RegionCheckpointsOp(s) => f.debug_tuple("GetRegionCheckpoints").field(s).finish(), + Self::UpdateGlobalCheckpoint(task) => { + f.debug_tuple("UpdateGlobalCheckpoint").field(task).finish() + } } } } @@ -1122,13 +1271,18 @@ impl Task { Task::ModifyObserve(o) => match o { ObserveOp::Start { .. } => "modify_observe.start", ObserveOp::Stop { .. } => "modify_observe.stop", - ObserveOp::CheckEpochAndStop { .. } => "modify_observe.check_epoch_and_stop", + ObserveOp::Destroy { .. } => "modify_observe.destroy", ObserveOp::RefreshResolver { .. } => "modify_observe.refresh_resolver", ObserveOp::NotifyFailToStartObserve { .. } => "modify_observe.retry", + ObserveOp::ResolveRegions { .. } => "modify_observe.resolve", }, - Task::ForceFlush(_) => "force_flush", + Task::ForceFlush(..) => "force_flush", Task::FatalError(..) => "fatal_error", Task::Sync(..) => "sync", + Task::MarkFailover(_) => "mark_failover", + Task::FlushWithMinTs(..) => "flush_with_min_ts", + Task::RegionCheckpointsOp(..) => "get_checkpoints", + Task::UpdateGlobalCheckpoint(..) 
=> "update_global_checkpoint", } } } @@ -1138,7 +1292,7 @@ where S: MetaStore + 'static, R: RegionInfoProvider + Clone + 'static, E: KvEngine, - RT: RaftStoreRouter + 'static, + RT: CdcHandle + 'static, PDC: PdClient + 'static, { type Task = Task; @@ -1147,3 +1301,51 @@ where self.run_task(task) } } + +#[cfg(test)] +mod test { + use engine_rocks::RocksEngine; + use raftstore::{ + coprocessor::region_info_accessor::MockRegionInfoProvider, router::CdcRaftRouter, + }; + use test_raftstore::MockRaftStoreRouter; + use tikv_util::worker::dummy_scheduler; + + use crate::{ + checkpoint_manager::tests::MockPdClient, endpoint, endpoint::Endpoint, metadata::test, Task, + }; + + #[tokio::test] + async fn test_start() { + let cli = test::test_meta_cli(); + let (sched, mut rx) = dummy_scheduler(); + let task = test::simple_task("simple_3"); + cli.insert_task_with_range(&task, &[]).await.unwrap(); + + fail::cfg("failed_to_get_tasks", "1*return").unwrap(); + Endpoint::< + _, + MockRegionInfoProvider, + RocksEngine, + CdcRaftRouter, + MockPdClient, + >::start_and_watch_tasks(cli, sched) + .await + .unwrap(); + fail::remove("failed_to_get_tasks"); + + let _t1 = rx.recv().unwrap(); + let t2 = rx.recv().unwrap(); + + match t2 { + Task::WatchTask(t) => match t { + endpoint::TaskOp::AddTask(t) => { + assert_eq!(t.info, task.info); + assert!(!t.is_paused); + } + _ => panic!("not match TaskOp type"), + }, + _ => panic!("not match Task type {:?}", t2), + } + } +} diff --git a/components/backup-stream/src/errors.rs b/components/backup-stream/src/errors.rs index a4d4515c213..c3cc91da9ff 100644 --- a/components/backup-stream/src/errors.rs +++ b/components/backup-stream/src/errors.rs @@ -5,7 +5,9 @@ use std::{ }; use error_code::ErrorCodeExt; +#[cfg(feature = "metastore-etcd")] use etcd_client::Error as EtcdError; +use grpcio::Error as GrpcError; use kvproto::{errorpb::Error as StoreError, metapb::*}; use pd_client::Error as PdError; use protobuf::ProtobufError; @@ -18,8 +20,11 @@ use 
crate::{endpoint::Task, metrics}; #[derive(ThisError, Debug)] pub enum Error { + #[error("gRPC meet error {0}")] + Grpc(#[from] GrpcError), + #[cfg(feature = "metastore-etcd")] #[error("Etcd meet error {0}")] - Etcd(#[from] EtcdError), + Etcd(#[from] EtcdErrorExt), #[error("Protobuf meet error {0}")] Protobuf(#[from] ProtobufError), #[error("No such task {task_name:?}")] @@ -49,10 +54,29 @@ pub enum Error { Other(#[from] Box), } +#[cfg(feature = "metastore-etcd")] +impl From for Error { + fn from(value: EtcdError) -> Self { + Self::Etcd(value.into()) + } +} + +#[cfg(feature = "metastore-etcd")] +#[derive(ThisError, Debug)] +pub enum EtcdErrorExt { + #[error("{0}")] + Normal(#[from] EtcdError), + #[error("the watch canceled")] + WatchCanceled, + #[error("the required revision has been compacted, current is {current}")] + RevisionCompacted { current: i64 }, +} + impl ErrorCodeExt for Error { fn error_code(&self) -> error_code::ErrorCode { use error_code::backup_stream::*; match self { + #[cfg(feature = "metastore-etcd")] Error::Etcd(_) => ETCD, Error::Protobuf(_) => PROTO, Error::NoSuchTask { .. } => NO_SUCH_TASK, @@ -66,6 +90,7 @@ impl ErrorCodeExt for Error { Error::Other(_) => OTHER, Error::RaftStore(_) => RAFTSTORE, Error::ObserveCanceled(..) => OBSERVE_CANCELED, + Error::Grpc(_) => GRPC, } } } @@ -115,12 +140,31 @@ where } } +pub trait ReportableResult { + fn report_if_err(self, context: impl ToString); +} + +impl ReportableResult for StdResult<(), E> +where + Error: From, +{ + #[inline(always)] + fn report_if_err(self, context: impl ToString) { + if let Err(err) = self { + Error::from(err).report(context.to_string()) + } + } +} + /// Like `errors.Annotate` in Go. /// Wrap an unknown error with [`Error::Other`]. #[macro_export(crate)] macro_rules! 
annotate { ($inner: expr, $message: expr) => { - Error::Other(tikv_util::box_err!("{}: {}", $message, $inner)) + { + use tikv_util::box_err; + $crate::errors::Error::Other(box_err!("{}: {}", $message, $inner)) + } }; ($inner: expr, $format: literal, $($args: expr),+) => { annotate!($inner, format_args!($format, $($args),+)) @@ -129,14 +173,14 @@ macro_rules! annotate { impl Error { pub fn report(&self, context: impl Display) { - warn!("backup stream meet error"; "context" => %context, "err" => %self); + warn!("backup stream meet error"; "context" => %context, "err" => %self, "verbose_err" => ?self); metrics::STREAM_ERROR .with_label_values(&[self.kind()]) .inc() } pub fn report_fatal(&self) { - error!(%self; "backup stream meet fatal error"); + error!(%self; "backup stream meet fatal error"; "verbose" => ?self, ); metrics::STREAM_FATAL_ERROR .with_label_values(&[self.kind()]) .inc() @@ -282,8 +326,9 @@ mod test { b.iter(|| { let result: Result<()> = Ok(()); let lucky_number = rand::random::(); - let result = result.context_with(|| format!("lucky: the number is {}", lucky_number)); - assert!(result.is_ok()); + result + .context_with(|| format!("lucky: the number is {}", lucky_number)) + .unwrap(); }) } } diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index d791ce6a825..1b663c0e982 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -1,18 +1,14 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{ - marker::PhantomData, - sync::{atomic::Ordering, Arc}, - time::Duration, -}; +use std::{marker::PhantomData, sync::Arc, time::Duration}; use engine_traits::{KvEngine, CF_DEFAULT, CF_WRITE}; use futures::executor::block_on; use kvproto::{kvrpcpb::ExtraOp, metapb::Region, raft_cmdpb::CmdType}; use raftstore::{ - coprocessor::RegionInfoProvider, - router::RaftStoreRouter, - store::{fsm::ChangeObserver, Callback, SignificantMsg}, + coprocessor::{ObserveHandle, RegionInfoProvider}, + router::CdcHandle, + store::{fsm::ChangeObserver, Callback}, }; use tikv::storage::{ kv::StatisticsSummary, @@ -20,8 +16,15 @@ use tikv::storage::{ txn::{EntryBatch, TxnEntry, TxnEntryScanner}, Snapshot, Statistics, }; -use tikv_util::{box_err, time::Instant, warn, worker::Scheduler}; -use tokio::sync::{OwnedSemaphorePermit, Semaphore}; +use tikv_util::{ + box_err, + time::{Instant, Limiter}, + worker::Scheduler, +}; +use tokio::{ + runtime::Handle, + sync::{OwnedSemaphorePermit, Semaphore}, +}; use txn_types::{Key, Lock, TimeStamp}; use crate::{ @@ -30,13 +33,13 @@ use crate::{ errors::{ContextualResultExt, Error, Result}, metrics, router::{ApplyEvent, ApplyEvents, Router}, - subscription_track::{SubscriptionTracer, TwoPhaseResolver}, + subscription_track::{Ref, RefMut, SubscriptionTracer, TwoPhaseResolver}, try_send, utils::{self, RegionPager}, Task, }; -const MAX_GET_SNAPSHOT_RETRY: usize = 3; +const MAX_GET_SNAPSHOT_RETRY: usize = 5; #[derive(Clone)] pub struct PendingMemoryQuota(Arc); @@ -59,7 +62,7 @@ impl PendingMemoryQuota { pub fn pending(&self, size: usize) -> PendingMemory { PendingMemory( - tokio::runtime::Handle::current() + Handle::current() .block_on(self.0.clone().acquire_many_owned(size as _)) .expect("BUG: the semaphore is closed unexpectedly."), ) @@ -69,8 +72,12 @@ impl PendingMemoryQuota { /// EventLoader transforms data from the snapshot into ApplyEvent. pub struct EventLoader { scanner: DeltaScanner, + // pooling the memory. 
+ entry_batch: EntryBatch, } + +const ENTRY_BATCH_SIZE: usize = 1024; + impl EventLoader { pub fn load_from( snapshot: S, @@ -81,8 +88,8 @@ impl EventLoader { let region_id = region.get_id(); let scanner = ScannerBuilder::new(snapshot, to_ts) .range( - Some(Key::from_encoded_slice(&region.start_key)), - Some(Key::from_encoded_slice(&region.end_key)), + (!region.start_key.is_empty()).then(|| Key::from_encoded_slice(&region.start_key)), + (!region.end_key.is_empty()).then(|| Key::from_encoded_slice(&region.end_key)), ) .hint_min_ts(Some(from_ts)) .fill_cache(false) @@ -93,20 +100,32 @@ impl EventLoader { from_ts, to_ts, region_id ))?; - Ok(Self { scanner }) + Ok(Self { + scanner, + entry_batch: EntryBatch::with_capacity(ENTRY_BATCH_SIZE), + }) } - /// scan a batch of events from the snapshot. Tracking the locks at the same time. - /// note: maybe make something like [`EntryBatch`] for reducing allocation. - fn scan_batch( + /// Scan a batch of events from the snapshot, and save them into the + /// internal buffer. + fn fill_entries(&mut self) -> Result { + assert!( + self.entry_batch.is_empty(), + "EventLoader: the entry batch isn't empty when filling entries, which is error-prone, please call `omit_entries` first. (len = {})", + self.entry_batch.len() + ); + self.scanner.scan_entries(&mut self.entry_batch)?; + Ok(self.scanner.take_statistics()) + } + + /// Drain the internal buffer, converting them to the [`ApplyEvents`], + /// and tracking the locks at the same time. 
+ fn emit_entries_to( &mut self, - batch_size: usize, result: &mut ApplyEvents, resolver: &mut TwoPhaseResolver, - ) -> Result { - let mut b = EntryBatch::with_capacity(batch_size); - self.scanner.scan_entries(&mut b)?; - for entry in b.drain() { + ) -> Result<()> { + for entry in self.entry_batch.drain() { match entry { TxnEntry::Prewrite { default: (key, value), @@ -129,7 +148,9 @@ impl EventLoader { ) })?; debug!("meet lock during initial scanning."; "key" => %utils::redact(&lock_at), "ts" => %lock.ts); - resolver.track_phase_one_lock(lock.ts, lock_at) + if utils::should_track_lock(&lock) { + resolver.track_phase_one_lock(lock.ts, lock_at); + } } TxnEntry::Commit { default, write, .. } => { result.push(ApplyEvent { @@ -149,25 +170,29 @@ impl EventLoader { } } } - Ok(self.scanner.take_statistics()) + Ok(()) } } /// The context for loading incremental data between range. /// Like [`cdc::Initializer`], but supports initialize over range. /// Note: maybe we can merge those two structures? +/// Note': maybe extract more fields to trait so it would be easier to test. #[derive(Clone)] pub struct InitialDataLoader { - router: RT, - regions: R, // Note: maybe we can make it an abstract thing like `EventSink` with // method `async (KvEvent) -> Result<()>`? - sink: Router, - tracing: SubscriptionTracer, - scheduler: Scheduler, - quota: PendingMemoryQuota, - handle: tokio::runtime::Handle, + pub(crate) sink: Router, + pub(crate) tracing: SubscriptionTracer, + pub(crate) scheduler: Scheduler, + // Note: this is only for `init_range`, maybe make it an argument? + pub(crate) regions: R, + // Note: Maybe move those fields about initial scanning into some trait? 
+ pub(crate) router: RT, + pub(crate) quota: PendingMemoryQuota, + pub(crate) limit: Limiter, + pub(crate) handle: Handle, _engine: PhantomData, } @@ -175,7 +200,7 @@ impl InitialDataLoader where E: KvEngine, R: RegionInfoProvider + Clone + 'static, - RT: RaftStoreRouter, + RT: CdcHandle, { pub fn new( router: RT, @@ -184,7 +209,8 @@ where tracing: SubscriptionTracer, sched: Scheduler, quota: PendingMemoryQuota, - handle: tokio::runtime::Handle, + handle: Handle, + limiter: Limiter, ) -> Self { Self { router, @@ -195,6 +221,7 @@ where _engine: PhantomData, quota, handle, + limit: limiter, } } @@ -205,7 +232,8 @@ where ) -> Result { let mut last_err = None; for _ in 0..MAX_GET_SNAPSHOT_RETRY { - let r = self.observe_over(region, cmd()); + let c = cmd(); + let r = self.observe_over(region, c); match r { Ok(s) => { return Ok(s); @@ -215,24 +243,29 @@ where Error::RaftRequest(pbe) => { !(pbe.has_epoch_not_match() || pbe.has_not_leader() - || pbe.get_message().contains("stale observe id")) + || pbe.get_message().contains("stale observe id") + || pbe.has_region_not_found()) } Error::RaftStore(raftstore::Error::RegionNotFound(_)) | Error::RaftStore(raftstore::Error::NotLeader(..)) => false, _ => true, }; + e.report(format_args!( + "during getting initial snapshot for region {:?}; can retry = {}", + region, can_retry + )); last_err = match last_err { None => Some(e), Some(err) => Some(Error::Contextual { - context: format!("and error {}", e), - inner_error: Box::new(err), + context: format!("and error {}", err), + inner_error: Box::new(e), }), }; if !can_retry { break; } - std::thread::sleep(Duration::from_millis(500)); + std::thread::sleep(Duration::from_secs(1)); continue; } } @@ -245,41 +278,43 @@ where /// and return the current snapshot of that region. fn observe_over(&self, region: &Region, cmd: ChangeObserver) -> Result { // There are 2 ways for getting the initial snapshot of a region: - // 1. 
the BR method: use the interface in the RaftKv interface, read the key-values directly. - // 2. the CDC method: use the raftstore message `SignificantMsg::CaptureChange` to - // register the region to CDC observer and get a snapshot at the same time. - // Registering the observer to the raftstore is necessary because we should only listen events from leader. - // In CDC, the change observer is per-delegate(i.e. per-region), we can create the command per-region here too. + // - the BR method: use the interface in the RaftKv interface, read the + // key-values directly. + // - the CDC method: use the raftstore message `SignificantMsg::CaptureChange` + // to register the region to CDC observer and get a snapshot at the same time. + // Registering the observer to the raftstore is necessary because we should only + // listen events from leader. In CDC, the change observer is + // per-delegate(i.e. per-region), we can create the command per-region here too. let (callback, fut) = tikv_util::future::paired_future_callback::>(); + self.router - .significant_send( - region.id, - SignificantMsg::CaptureChange { - cmd, - region_epoch: region.get_region_epoch().clone(), - callback: Callback::Read(Box::new(|snapshot| { - if snapshot.response.get_header().has_error() { - callback(Err(Error::RaftRequest( - snapshot.response.get_header().get_error().clone(), - ))); - return; - } - if let Some(snap) = snapshot.snapshot { - callback(Ok(snap)); - return; - } - callback(Err(Error::Other(box_err!( - "PROBABLY BUG: the response contains neither error nor snapshot" - )))) - })), - }, + .capture_change( + region.get_id(), + region.get_region_epoch().clone(), + cmd, + Callback::read(Box::new(|snapshot| { + if snapshot.response.get_header().has_error() { + callback(Err(Error::RaftRequest( + snapshot.response.get_header().get_error().clone(), + ))); + return; + } + if let Some(snap) = snapshot.snapshot { + callback(Ok(snap)); + return; + } + callback(Err(Error::Other(box_err!( + "PROBABLY 
BUG: the response contains neither error nor snapshot" + )))) + })), ) .context(format_args!( "failed to register the observer to region {}", region.get_id() ))?; + let snap = block_on(fut) .map_err(|err| { annotate!( @@ -297,17 +332,19 @@ where Ok(snap) } - pub fn with_resolver( + fn with_resolver( &self, region: &Region, + handle: &ObserveHandle, f: impl FnOnce(&mut TwoPhaseResolver) -> Result, ) -> Result { - Self::with_resolver_by(&self.tracing, region, f) + Self::with_resolver_by(&self.tracing, region, handle, f) } - pub fn with_resolver_by( + fn with_resolver_by( tracing: &SubscriptionTracer, region: &Region, + handle: &ObserveHandle, f: impl FnOnce(&mut TwoPhaseResolver) -> Result, ) -> Result { let region_id = region.get_id(); @@ -315,19 +352,26 @@ where .get_subscription_of(region_id) .ok_or_else(|| Error::Other(box_err!("observer for region {} canceled", region_id))) .and_then(|v| { + // NOTE: once we have compared the observer handle, perhaps we can remove this + // check because epoch version changed implies observer handle changed. raftstore::store::util::compare_region_epoch( region.get_region_epoch(), &v.value().meta, - // No need for checking conf version because conf change won't cancel the observation. + // No need for checking conf version because conf change won't cancel the + // observation. false, true, false, )?; + if v.value().handle().id != handle.id { + return Err(box_err!("stale observe handle {:?}, should be {:?}, perhaps new initial scanning starts", + handle.id, v.value().handle().id)); + } Ok(v) }) .map_err(|err| Error::Contextual { - // Both when we cannot find the region in the track and - // the epoch has changed means that we should cancel the current turn of initial scanning. + // Both when we cannot find the region in the track and the epoch has changed means + // that we should cancel the current turn of initial scanning. 
inner_error: Box::new(Error::ObserveCanceled( region_id, region.get_region_epoch().clone(), @@ -340,16 +384,33 @@ where fn scan_and_async_send( &self, region: &Region, + handle: &ObserveHandle, mut event_loader: EventLoader, join_handles: &mut Vec>, ) -> Result { let mut stats = StatisticsSummary::default(); let start = Instant::now(); loop { + fail::fail_point!("scan_and_async_send", |msg| Err(Error::Other(box_err!( + "{:?}", msg + )))); let mut events = ApplyEvents::with_capacity(1024, region.id); - let stat = - self.with_resolver(region, |r| event_loader.scan_batch(1024, &mut events, r))?; - if events.is_empty() { + // Note: the call of `fill_entries` is the only step which would read the disk. + // we only need to record the disk throughput of this. + let (stat, disk_read) = + utils::with_record_read_throughput(|| event_loader.fill_entries()); + // We must use the size of entry batch here to check whether we have progress. + // Or we may exit too early if there are only records: + // - can be inlined to `write` CF (hence it won't be written to default CF) + // - are prewritten. (hence it will only contains `Prewrite` records). 
+ // In this condition, ALL records generate no ApplyEvent(only lock change), + // and we would exit after the first run of loop :( + let no_progress = event_loader.entry_batch.is_empty(); + let stat = stat?; + self.with_resolver(region, handle, |r| { + event_loader.emit_entries_to(&mut events, r) + })?; + if no_progress { metrics::INITIAL_SCAN_DURATION.observe(start.saturating_elapsed_secs()); return Ok(stats.stat); } @@ -359,8 +420,10 @@ where let event_size = events.size(); let sched = self.scheduler.clone(); let permit = self.quota.pending(event_size); + self.limit.blocking_consume(disk_read as _); debug!("sending events to router"; "size" => %event_size, "region" => %region_id); metrics::INCREMENTAL_SCAN_SIZE.observe(event_size as f64); + metrics::INCREMENTAL_SCAN_DISK_READ.inc_by(disk_read as f64); metrics::HEAP_MEMORY.add(event_size as _); join_handles.push(tokio::spawn(async move { utils::handle_on_event_result(&sched, sink.on_events(events).await); @@ -374,42 +437,39 @@ where pub fn do_initial_scan( &self, region: &Region, + // We are using this handle for checking whether the initial scan is stale. + handle: ObserveHandle, start_ts: TimeStamp, snap: impl Snapshot, ) -> Result { let _guard = self.handle.enter(); - // It is ok to sink more data than needed. So scan to +inf TS for convenance. - let event_loader = EventLoader::load_from(snap, start_ts, TimeStamp::max(), region)?; let tr = self.tracing.clone(); let region_id = region.get_id(); let mut join_handles = Vec::with_capacity(8); - let stats = self.scan_and_async_send(region, event_loader, &mut join_handles); - - // we should mark phase one as finished whether scan successed. - // TODO: use an `WaitGroup` with asynchronous support. 
- let r = region.clone(); - tokio::spawn(async move { - for h in join_handles { - if let Err(err) = h.await { - warn!("failed to join task."; "err" => %err); - } - } - let result = Self::with_resolver_by(&tr, &r, |r| { - r.phase_one_done(); - Ok(()) - }); - if let Err(err) = result { - err.report(format_args!( - "failed to finish phase 1 for region {:?}", - region_id - )); - } - }); - stats + + // It is ok to sink more data than needed. So scan to +inf TS for convenance. + let event_loader = EventLoader::load_from(snap, start_ts, TimeStamp::max(), region)?; + let stats = self.scan_and_async_send(region, &handle, event_loader, &mut join_handles)?; + + Handle::current() + .block_on(futures::future::try_join_all(join_handles)) + .map_err(|err| annotate!(err, "tokio runtime failed to join consuming threads"))?; + + Self::with_resolver_by(&tr, region, &handle, |r| { + r.phase_one_done(); + Ok(()) + }) + .context(format_args!( + "failed to finish phase 1 for region {:?}", + region_id + ))?; + + Ok(stats) } - /// initialize a range: it simply scan the regions with leader role and send them to [`initialize_region`]. + /// initialize a range: it simply scan the regions with leader role and send + /// them to [`initialize_region`]. pub fn initialize_range(&self, start_key: Vec, end_key: Vec) -> Result<()> { let mut pager = RegionPager::scan_from(self.regions.clone(), start_key, end_key); loop { @@ -419,22 +479,65 @@ where break; } for r in regions { - // Note: Even we did the initial scanning, and blocking resolved ts from advancing, - // if the next_backup_ts was updated in some extreme condition, there is still little chance to lost data: - // For example, if a region cannot elect the leader for long time. (say, net work partition) - // At that time, we have nowhere to record the lock status of this region. 
- let success = try_send!( + // Note: Even we did the initial scanning, and blocking resolved ts from + // advancing, if the next_backup_ts was updated in some extreme condition, there + // is still little chance to lost data: For example, if a region cannot elect + // the leader for long time. (say, net work partition) At that time, we have + // nowhere to record the lock status of this region. + try_send!( self.scheduler, - Task::ModifyObserve(ObserveOp::Start { - region: r.region, - needs_initial_scanning: true - }) + Task::ModifyObserve(ObserveOp::Start { region: r.region }) ); - if success { - crate::observer::IN_FLIGHT_START_OBSERVE_MESSAGE.fetch_add(1, Ordering::SeqCst); - } } } Ok(()) } } + +#[cfg(test)] +mod tests { + use futures::executor::block_on; + use kvproto::metapb::*; + use tikv::storage::{txn::tests::*, TestEngineBuilder}; + use tikv_kv::SnapContext; + use txn_types::TimeStamp; + + use super::EventLoader; + use crate::{ + router::ApplyEvents, subscription_track::TwoPhaseResolver, + utils::with_record_read_throughput, + }; + + #[test] + fn test_disk_read() { + let mut engine = TestEngineBuilder::new().build_without_cache().unwrap(); + for i in 0..100 { + let owned_key = format!("{:06}", i); + let key = owned_key.as_bytes(); + let owned_value = [i as u8; 512]; + let value = owned_value.as_slice(); + must_prewrite_put(&mut engine, key, value, key, i * 2); + must_commit(&mut engine, key, i * 2, i * 2 + 1); + } + // let compact the memtable to disk so we can see the disk read. 
+ engine.get_rocksdb().as_inner().compact_range(None, None); + + let mut r = Region::new(); + r.set_id(42); + r.set_start_key(b"".to_vec()); + r.set_end_key(b"".to_vec()); + + let snap = block_on(async { tikv_kv::snapshot(&mut engine, SnapContext::default()).await }) + .unwrap(); + let mut loader = + EventLoader::load_from(snap, TimeStamp::zero(), TimeStamp::max(), &r).unwrap(); + + let (r, data_load) = with_record_read_throughput(|| loader.fill_entries()); + r.unwrap(); + let mut events = ApplyEvents::with_capacity(1024, 42); + let mut res = TwoPhaseResolver::new(42, None); + loader.emit_entries_to(&mut events, &mut res).unwrap(); + assert_ne!(events.len(), 0); + assert_ne!(data_load, 0); + } +} diff --git a/components/backup-stream/src/lib.rs b/components/backup-stream/src/lib.rs index a19b4b4fc2f..ac7ab1f718f 100644 --- a/components/backup-stream/src/lib.rs +++ b/components/backup-stream/src/lib.rs @@ -1,17 +1,27 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +#![feature(slice_group_by)] #![feature(result_flattening)] #![feature(assert_matches)] #![feature(test)] +mod checkpoint_manager; pub mod config; mod endpoint; pub mod errors; mod event_loader; pub mod metadata; -mod metrics; +pub(crate) mod metrics; pub mod observer; pub mod router; +mod service; +mod subscription_manager; mod subscription_track; -mod utils; +// Publish it for integration test. +// Perhaps we'd better move some of then into `tikv_util`. 
+pub mod utils; -pub use endpoint::{Endpoint, ObserveOp, Task}; +pub use checkpoint_manager::GetCheckpointResult; +pub use endpoint::{ + BackupStreamResolver, Endpoint, ObserveOp, RegionCheckpointOperation, RegionSet, Task, +}; +pub use service::Service; diff --git a/components/backup-stream/src/metadata/checkpoint_cache.rs b/components/backup-stream/src/metadata/checkpoint_cache.rs new file mode 100644 index 00000000000..50573d003d8 --- /dev/null +++ b/components/backup-stream/src/metadata/checkpoint_cache.rs @@ -0,0 +1,71 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::time::Duration; + +use tikv_util::time::Instant; +use txn_types::TimeStamp; + +/// The lease time of a checkpoint. +/// 12s is the default interval of the coornaditor tick. +const CACHE_LEASE_TIME: Duration = Duration::from_secs(12); + +pub struct CheckpointCache { + last_access: Instant, + checkpoint: TimeStamp, + + cache_lease_time: Duration, +} + +impl Default for CheckpointCache { + fn default() -> Self { + Self { + last_access: Instant::now_coarse(), + checkpoint: TimeStamp::zero(), + + cache_lease_time: CACHE_LEASE_TIME, + } + } +} + +impl CheckpointCache { + #[cfg(test)] + pub fn with_cache_lease(lease: Duration) -> Self { + Self { + cache_lease_time: lease, + ..Self::default() + } + } + + pub fn update(&mut self, checkpoint: impl Into) { + self.last_access = Instant::now_coarse(); + self.checkpoint = self.checkpoint.max(checkpoint.into()) + } + + pub fn get(&self) -> Option { + if self.checkpoint.is_zero() + || self.last_access.saturating_elapsed() > self.cache_lease_time + { + return None; + } + Some(self.checkpoint) + } +} + +#[cfg(test)] +mod test { + use std::time::Duration; + + use super::CheckpointCache; + + #[test] + fn test_basic() { + let mut c = CheckpointCache::with_cache_lease(Duration::from_millis(100)); + assert_eq!(c.get(), None); + c.update(42); + assert_eq!(c.get(), Some(42.into())); + c.update(41); + assert_eq!(c.get(), Some(42.into())); 
+ std::thread::sleep(Duration::from_millis(200)); + assert_eq!(c.get(), None); + } +} diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index 5f0e8b85bed..1fdc1b3b1e8 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -1,23 +1,34 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{collections::HashMap, fmt::Debug}; +use std::{cmp::Ordering, collections::HashMap, fmt::Debug, path::Path, sync::Arc}; -use kvproto::brpb::{StreamBackupError, StreamBackupTaskInfo}; +use dashmap::DashMap; +use kvproto::{ + brpb::{StreamBackupError, StreamBackupTaskInfo}, + metapb::Region, +}; use tikv_util::{defer, time::Instant, warn}; use tokio_stream::StreamExt; +use txn_types::TimeStamp; use super::{ + checkpoint_cache::CheckpointCache, keys::{self, KeyValue, MetaKey}, store::{ - GetExtra, Keys, KvEvent, KvEventType, MetaStore, Snapshot, Subscription, WithRevision, + CondTransaction, Condition, Keys, KvEvent, KvEventType, MetaStore, Snapshot, Subscription, + Transaction, WithRevision, }, }; -use crate::errors::{Error, Result}; +use crate::{ + debug, + errors::{ContextualResultExt, Error, Result}, +}; /// Some operations over stream backup metadata key space. #[derive(Clone)] pub struct MetadataClient { store_id: u64, + caches: Arc>, pub(crate) meta_store: Store, } @@ -37,6 +48,7 @@ impl Debug for StreamTask { .field("table_filter", &self.info.table_filter) .field("start_ts", &self.info.start_ts) .field("end_ts", &self.info.end_ts) + .field("is_paused", &self.is_paused) .finish() } } @@ -64,6 +76,115 @@ impl PartialEq for MetadataEvent { } } +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum CheckpointProvider { + Store(u64), + Region { id: u64, version: u64 }, + Task, + Global, +} + +/// The polymorphic checkpoint. +/// The global checkpoint should be the minimal checkpoint of all checkpoints. 
+#[derive(Debug, Clone, Copy, PartialEq)] +pub struct Checkpoint { + pub provider: CheckpointProvider, + pub ts: TimeStamp, +} + +impl Checkpoint { + pub fn from_kv(kv: &KeyValue) -> Result { + match std::str::from_utf8(kv.0.0.as_slice()) { + Ok(key) => Checkpoint::parse_from(Path::new(key), kv.1.as_slice()), + Err(_) => { + Ok(Checkpoint { + // The V1 checkpoint, maybe fill the store id? + provider: CheckpointProvider::Store(0), + ts: TimeStamp::new(parse_ts_from_bytes(kv.1.as_slice())?), + }) + } + } + } + + pub fn parse_from(path: &Path, checkpoint_ts: &[u8]) -> Result { + let segs = path.iter().map(|os| os.to_str()).collect::>(); + match segs.as_slice() { + [ + // We always use '/' as the path. + // NOTE: Maybe just `split` and don't use `path`? + Some("/"), + Some("tidb"), + Some("br-stream"), + Some("checkpoint"), + Some(_task_name), + Some("region"), + Some(id), + Some(epoch), + .., + ] => Self::from_region_parse_result(id, epoch, checkpoint_ts) + .context(format_args!("during parsing key {}", path.display())), + [ + // We always use '/' as the path. + // NOTE: Maybe just `split` and don't use `path`? + Some("/"), + Some("tidb"), + Some("br-stream"), + Some("checkpoint"), + Some(_task_name), + Some("store"), + Some(id), + .., + ] => Self::from_store_parse_result(id, checkpoint_ts) + .context(format_args!("during parsing key {}", path.display())), + [ + // We always use '/' as the path. + // NOTE: Maybe just `split` and don't use `path`? 
+ Some("/"), + Some("tidb"), + Some("br-stream"), + Some("checkpoint"), + Some(_task_name), + Some("central_global"), + ] => Ok(Self { + provider: CheckpointProvider::Global, + ts: TimeStamp::new(parse_ts_from_bytes(checkpoint_ts)?), + }), + _ => Err(Error::MalformedMetadata(format!( + "cannot parse path {}(segs = {:?}) as checkpoint", + path.display(), + segs + ))), + } + } + + fn from_store_parse_result(id: &str, checkpoint_ts: &[u8]) -> Result { + let provider_id = id + .parse::() + .map_err(|err| Error::MalformedMetadata(err.to_string()))?; + let provider = CheckpointProvider::Store(provider_id); + let checkpoint = TimeStamp::new(parse_ts_from_bytes(checkpoint_ts)?); + Ok(Self { + provider, + ts: checkpoint, + }) + } + + fn from_region_parse_result(id: &str, version: &str, checkpoint_ts: &[u8]) -> Result { + let id = id + .parse::() + .map_err(|err| Error::MalformedMetadata(err.to_string()))?; + let version = version + .parse::() + .map_err(|err| Error::MalformedMetadata(err.to_string()))?; + let checkpoint = TimeStamp::new(parse_ts_from_bytes(checkpoint_ts)?); + let provider = CheckpointProvider::Region { id, version }; + Ok(Self { + provider, + ts: checkpoint, + }) + } +} + impl MetadataEvent { fn from_watch_event(event: &KvEvent) -> Option { // Maybe report an error when the kv isn't present? @@ -122,10 +243,34 @@ impl MetadataClient { pub fn new(store: Store, store_id: u64) -> Self { Self { meta_store: store, + caches: Arc::default(), store_id, } } + /// Initialize a task: execute some general operations over the keys. + /// For now, it sets the checkpoint ts if there isn't one for the current + /// store. 
+ pub async fn init_task(&self, task: &StreamBackupTaskInfo) -> Result<()> { + let if_present = Condition::new( + MetaKey::next_backup_ts_of(&task.name, self.store_id), + Ordering::Greater, + vec![], + ); + let txn = CondTransaction::new( + if_present, + Transaction::default(), + Transaction::default().put(KeyValue( + MetaKey::next_backup_ts_of(&task.name, self.store_id), + task.get_start_ts().to_be_bytes().to_vec(), + )), + ); + self.meta_store.txn_cond(txn).await + } + + /// Upload the last error information to the etcd. + /// This won't pause the task. Even this method would usually be paired with + /// `pause`. pub async fn report_last_error(&self, name: &str, last_error: StreamBackupError) -> Result<()> { use protobuf::Message; let now = Instant::now(); @@ -148,8 +293,7 @@ impl MetadataClient { ) -> Result> { let key = MetaKey::last_error_of(name, store_id); - let s = self.meta_store.snapshot().await?; - let r = s.get(Keys::Key(key)).await?; + let r = self.meta_store.get_latest(Keys::Key(key)).await?.inner; if r.is_empty() { return Ok(None); } @@ -160,8 +304,11 @@ impl MetadataClient { /// check whether the task is paused. pub async fn check_task_paused(&self, name: &str) -> Result { - let snap = self.meta_store.snapshot().await?; - let kvs = snap.get(Keys::Key(MetaKey::pause_of(name))).await?; + let kvs = self + .meta_store + .get_latest(Keys::Key(MetaKey::pause_of(name))) + .await? + .inner; Ok(!kvs.is_empty()) } @@ -173,8 +320,11 @@ impl MetadataClient { } pub async fn get_tasks_pause_status(&self) -> Result, bool>> { - let snap = self.meta_store.snapshot().await?; - let kvs = snap.get(Keys::Prefix(MetaKey::pause_prefix())).await?; + let kvs = self + .meta_store + .get_latest(Keys::Prefix(MetaKey::pause_prefix())) + .await? + .inner; let mut pause_hash = HashMap::new(); let prefix_len = MetaKey::pause_prefix_len(); @@ -194,10 +344,9 @@ impl MetadataClient { } let items = self .meta_store - .snapshot() + .get_latest(Keys::Key(MetaKey::task_of(name))) .await? 
- .get(Keys::Key(MetaKey::task_of(name))) - .await?; + .inner; if items.is_empty() { return Ok(None); } @@ -213,11 +362,18 @@ impl MetadataClient { defer! { super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_fetch"]).observe(now.saturating_elapsed().as_secs_f64()) } - let snap = self.meta_store.snapshot().await?; - let kvs = snap.get(Keys::Prefix(MetaKey::tasks())).await?; + fail::fail_point!("failed_to_get_tasks", |_| { + Err(Error::MalformedMetadata( + "faild to connect etcd client".to_string(), + )) + }); + let kvs = self + .meta_store + .get_latest(Keys::Prefix(MetaKey::tasks())) + .await?; - let mut tasks = Vec::with_capacity(kvs.len()); - for kv in kvs { + let mut tasks = Vec::with_capacity(kvs.inner.len()); + for kv in kvs.inner { let t = protobuf::parse_from_bytes::(kv.value())?; let paused = self.check_task_paused(t.get_name()).await?; tasks.push(StreamTask { @@ -227,7 +383,7 @@ impl MetadataClient { } Ok(WithRevision { inner: tasks, - revision: snap.revision(), + revision: kvs.revision, }) } @@ -238,7 +394,8 @@ impl MetadataClient { } /// watch event stream from the revision(exclusive). - /// the revision would usually come from a WithRevision struct(which indices the revision of the inner item). + /// the revision would usually come from a WithRevision struct(which indices + /// the revision of the inner item). pub async fn events_from(&self, revision: i64) -> Result> { let watcher = self .meta_store @@ -269,7 +426,10 @@ impl MetadataClient { let stream = watcher .stream .filter_map(|item| match item { - Ok(kv_event) => MetadataEvent::from_watch_pause_event(&kv_event), + Ok(kv_event) => { + debug!("watch pause event"; "raw" => ?kv_event); + MetadataEvent::from_watch_pause_event(&kv_event) + } Err(err) => Some(MetadataEvent::Error { err }), }) .map(|event| { @@ -283,160 +443,147 @@ impl MetadataClient { }) } - /// forward the progress of some task. 
- pub async fn step_task(&self, task_name: &str, ts: u64) -> Result<()> { + /// Set the storage checkpoint to metadata. + pub async fn set_storage_checkpoint(&self, task_name: &str, ts: u64) -> Result<()> { let now = Instant::now(); defer! { - super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_step"]).observe(now.saturating_elapsed().as_secs_f64()) + super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["storage_checkpoint"]).observe(now.saturating_elapsed().as_secs_f64()) } self.meta_store .set(KeyValue( - MetaKey::next_backup_ts_of(task_name, self.store_id), + MetaKey::storage_checkpoint_of(task_name, self.store_id), ts.to_be_bytes().to_vec(), )) .await?; Ok(()) } - /// get all target ranges of some task. - pub async fn ranges_of_task( - &self, - task_name: &str, - ) -> Result, Vec)>>> { - let snap = self.meta_store.snapshot().await?; - let ranges = snap - .get(Keys::Prefix(MetaKey::ranges_of(task_name))) - .await?; + /// Get the storage checkpoint from metadata. This function is justly used + /// for test. + pub async fn get_storage_checkpoint(&self, task_name: &str) -> Result { + let now = Instant::now(); + defer! { + super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_step"]).observe(now.saturating_elapsed().as_secs_f64()) + } + let ts = self + .meta_store + .get_latest(Keys::Key(MetaKey::storage_checkpoint_of( + task_name, + self.store_id, + ))) + .await? + .inner; - Ok(WithRevision { - revision: snap.revision(), - inner: ranges - .into_iter() - .map(|mut kv: KeyValue| kv.take_range(task_name)) - .collect(), - }) + match ts.as_slice() { + [ts, ..] => Ok(TimeStamp::new(parse_ts_from_bytes(ts.value())?)), + [] => Ok(self.get_task_start_ts_checkpoint(task_name).await?.ts), + } } - - /// Perform a two-phase bisection search algorithm for the intersection of all ranges - /// and the specificated range (usually region range.) - /// TODO: explain the algorithm? 
- pub async fn range_overlap_of_task( - &self, - task_name: &str, - (start_key, end_key): (Vec, Vec), - ) -> Result, Vec)>>> { + /// forward the progress of some task. + pub async fn set_local_task_checkpoint(&self, task_name: &str, ts: u64) -> Result<()> { let now = Instant::now(); defer! { - super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_range_search"]).observe(now.saturating_elapsed().as_secs_f64()) + super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_step"]).observe(now.saturating_elapsed().as_secs_f64()) } - let snap = self.meta_store.snapshot().await?; - - let mut prev = snap - .get_extra( - Keys::Range( - MetaKey::ranges_of(task_name), - MetaKey::range_of(task_name, &start_key), - ), - GetExtra { - desc_order: true, - limit: 1, - ..Default::default() - }, - ) - .await?; - let all = snap - .get(Keys::Range( - MetaKey::range_of(task_name, &start_key), - MetaKey::range_of(task_name, &end_key), + self.meta_store + .set(KeyValue( + MetaKey::next_backup_ts_of(task_name, self.store_id), + ts.to_be_bytes().to_vec(), )) .await?; - - let mut result = Vec::with_capacity(all.len() as usize + 1); - if !prev.kvs.is_empty() { - let kv = &mut prev.kvs[0]; - if kv.value() > start_key.as_slice() { - result.push(kv.take_range(task_name)); - } - } - for mut kv in all { - result.push(kv.take_range(task_name)); - } - Ok(WithRevision { - revision: snap.revision(), - inner: result, - }) + Ok(()) } - /// access the next backup ts of some task and some region. - pub async fn progress_of_task(&self, task_name: &str) -> Result { + pub async fn get_local_task_checkpoint(&self, task_name: &str) -> Result { let now = Instant::now(); defer! 
{ - super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_progress_get"]).observe(now.saturating_elapsed().as_secs_f64()) - } - let task = self.get_task(task_name).await?; - if task.is_none() { - return Err(Error::NoSuchTask { - task_name: task_name.to_owned(), - }); + super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_step"]).observe(now.saturating_elapsed().as_secs_f64()) } - - let timestamp = self.meta_store.snapshot().await?; - let items = timestamp - .get(Keys::Key(MetaKey::next_backup_ts_of( + let ts = self + .meta_store + .get_latest(Keys::Key(MetaKey::next_backup_ts_of( task_name, self.store_id, ))) - .await?; - if items.is_empty() { - Ok(task.unwrap().info.start_ts) - } else { - assert_eq!(items.len(), 1); - Self::parse_ts_from_bytes(items[0].1.as_slice()) + .await? + .inner; + + match ts.as_slice() { + [ts, ..] => Ok(TimeStamp::new(parse_ts_from_bytes(ts.value())?)), + [] => Ok(self.get_task_start_ts_checkpoint(task_name).await?.ts), } } - /// get the global progress (the min next_backup_ts among all stores). - pub async fn global_progress_of_task(&self, task_name: &str) -> Result { + /// get all target ranges of some task. + pub async fn ranges_of_task( + &self, + task_name: &str, + ) -> Result, Vec)>>> { + let ranges = self + .meta_store + .get_latest(Keys::Prefix(MetaKey::ranges_of(task_name))) + .await?; + + Ok(ranges.map(|rs| { + rs.into_iter() + .map(|mut kv: KeyValue| kv.take_range(task_name)) + .collect() + })) + } + + pub async fn checkpoints_of(&self, task_name: &str) -> Result> { let now = Instant::now(); defer! 
{ - super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_progress_get_global"]).observe(now.saturating_elapsed().as_secs_f64()) + super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["checkpoints_of"]).observe(now.saturating_elapsed().as_secs_f64()) } - let task = self.get_task(task_name).await?; - if task.is_none() { - return Err(Error::NoSuchTask { - task_name: task_name.to_owned(), - }); - } - - let snap = self.meta_store.snapshot().await?; - let global_ts = snap.get(Keys::Prefix(MetaKey::next_backup_ts(task_name))) + let checkpoints = self.meta_store + .get_latest(Keys::Prefix(MetaKey::next_backup_ts(task_name))) .await? + .inner .iter() .filter_map(|kv| { - Self::parse_ts_from_bytes(kv.1.as_slice()) + Checkpoint::from_kv(kv) .map_err(|err| warn!("br-stream: failed to parse next_backup_ts."; "key" => ?kv.0, "err" => %err)) .ok() }) - .min() - .unwrap_or(task.unwrap().info.start_ts); - Ok(global_ts) + .collect(); + Ok(checkpoints) } - fn parse_ts_from_bytes(next_backup_ts: &[u8]) -> Result { - if next_backup_ts.len() != 8 { - return Err(Error::MalformedMetadata(format!( - "the length of next_backup_ts is {} bytes, require 8 bytes", - next_backup_ts.len() - ))); - } - let mut buf = [0u8; 8]; - buf.copy_from_slice(next_backup_ts); - Ok(u64::from_be_bytes(buf)) + async fn get_task_start_ts_checkpoint(&self, task_name: &str) -> Result { + let task = self + .get_task(task_name) + .await? + .ok_or_else(|| Error::NoSuchTask { + task_name: task_name.to_owned(), + })?; + Ok(Checkpoint { + ts: TimeStamp::new(task.info.start_ts), + provider: CheckpointProvider::Task, + }) + } + + /// Get the global checkpoint of a task. + /// It is the smallest checkpoint of all types of checkpoint. + pub async fn global_checkpoint_of_task(&self, task_name: &str) -> Result { + let cp = match self.global_checkpoint_of(task_name).await? 
{ + Some(cp) => cp, + None => self.get_task_start_ts_checkpoint(task_name).await?, + }; + Ok(cp) + } + + /// get the global progress (the min next_backup_ts among all stores). + pub async fn global_progress_of_task(&self, task_name: &str) -> Result { + let cp = self.global_checkpoint_of_task(task_name).await?; + debug!("getting global progress of task"; "checkpoint" => ?cp); + let ts = cp.ts.into_inner(); + Ok(ts) } /// insert a task with ranges into the metadata store. - /// the current abstraction of metadata store doesn't support transaction API. - /// Hence this function is non-transactional and only for testing. + /// the current abstraction of metadata store doesn't support transaction + /// API. Hence this function is non-transactional and only for testing. pub async fn insert_task_with_range( &self, task: &StreamTask, @@ -459,9 +606,153 @@ impl MetadataClient { /// remove some task, without the ranges. /// only for testing. + #[cfg(test)] pub async fn remove_task(&self, name: &str) -> Result<()> { self.meta_store .delete(Keys::Key(MetaKey::task_of(name))) .await } + + pub async fn global_checkpoint_of(&self, task: &str) -> Result> { + let cps = self.checkpoints_of(task).await?; + let mut min_checkpoint = None; + for cp in cps { + match cp.provider { + CheckpointProvider::Store(..) => { + if min_checkpoint + .as_ref() + .map(|c: &Checkpoint| c.ts > cp.ts) + .unwrap_or(true) + { + min_checkpoint = Some(cp); + } + } + // The global checkpoint has higher priority than store checkpoint. + CheckpointProvider::Task | CheckpointProvider::Global => return Ok(Some(cp)), + CheckpointProvider::Region { .. 
} => continue, + } + } + Ok(min_checkpoint) + } + + fn cached_checkpoint(&self, task: &str) -> Option { + self.caches + .get(task) + .and_then(|x| x.value().get()) + .map(|x| Checkpoint { + provider: CheckpointProvider::Global, + ts: x, + }) + } + + fn update_cache(&self, task: &str, checkpoint: TimeStamp) { + let mut c = self.caches.entry(task.to_owned()).or_default(); + c.value_mut().update(checkpoint); + } + + pub async fn get_region_checkpoint(&self, task: &str, region: &Region) -> Result { + if let Some(c) = self.cached_checkpoint(task) { + return Ok(c); + } + let key = MetaKey::next_bakcup_ts_of_region(task, region); + let r = self + .meta_store + .get_latest(Keys::Key(key.clone())) + .await? + .inner; + let cp = match r.len() { + 0 => { + let global_cp = self.global_checkpoint_of(task).await?; + let cp = match global_cp { + None => self.get_task_start_ts_checkpoint(task).await?, + Some(cp) => cp, + }; + cp + } + _ => Checkpoint::from_kv(&r[0])?, + }; + self.update_cache(task, cp.ts); + Ok(cp) + } +} + +fn parse_ts_from_bytes(next_backup_ts: &[u8]) -> Result { + if next_backup_ts.len() != 8 { + return Err(Error::MalformedMetadata(format!( + "the length of next_backup_ts is {} bytes, require 8 bytes", + next_backup_ts.len() + ))); + } + let mut buf = [0u8; 8]; + buf.copy_from_slice(next_backup_ts); + Ok(u64::from_be_bytes(buf)) +} + +#[cfg(test)] +mod test { + use kvproto::metapb::{Region as RegionInfo, RegionEpoch}; + use txn_types::TimeStamp; + + use super::Checkpoint; + use crate::metadata::{ + client::CheckpointProvider, + keys::{KeyValue, MetaKey}, + }; + + #[test] + fn test_parse() { + struct Case { + provider: CheckpointProvider, + checkpoint: u64, + } + + fn run_case(c: Case) { + let key = match c.provider { + CheckpointProvider::Region { id, version } => { + let mut r = RegionInfo::new(); + let mut v = RegionEpoch::new(); + v.set_version(version); + r.set_region_epoch(v); + r.set_id(id); + MetaKey::next_bakcup_ts_of_region("test", &r) + } + 
CheckpointProvider::Store(id) => MetaKey::next_backup_ts_of("test", id), + _ => unreachable!(), + }; + let checkpoint = c.checkpoint; + let cp_bytes = checkpoint.to_be_bytes(); + let kv = KeyValue(key, cp_bytes.to_vec()); + let parsed = Checkpoint::from_kv(&kv).unwrap(); + assert_eq!( + parsed, + Checkpoint { + provider: c.provider, + ts: TimeStamp::new(c.checkpoint), + } + ); + } + use CheckpointProvider::*; + + let cases = vec![ + Case { + checkpoint: TimeStamp::compose(TimeStamp::physical_now(), 10).into_inner(), + provider: Region { id: 42, version: 8 }, + }, + Case { + checkpoint: u64::from_be_bytes(*b"let i=0;"), + provider: Store(3), + }, + Case { + checkpoint: u64::from_be_bytes(*b"(callcc)"), + provider: Region { + id: 16961, + version: 16, + }, + }, + ]; + + for case in cases { + run_case(case) + } + } } diff --git a/components/backup-stream/src/metadata/keys.rs b/components/backup-stream/src/metadata/keys.rs index be92da123ae..87c0e036172 100644 --- a/components/backup-stream/src/metadata/keys.rs +++ b/components/backup-stream/src/metadata/keys.rs @@ -1,14 +1,16 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use bytes::BufMut; +use kvproto::metapb::Region; -const PREFIX: &str = "/tidb/br-stream"; +pub(super) const PREFIX: &str = "/tidb/br-stream"; const PATH_INFO: &str = "/info"; const PATH_NEXT_BACKUP_TS: &str = "/checkpoint"; +const PATH_STORAGE_CHECKPOINT: &str = "/storage-checkpoint"; const PATH_RANGES: &str = "/ranges"; const PATH_PAUSE: &str = "/pause"; const PATH_LAST_ERROR: &str = "/last-error"; -// Note: maybe use something like `const_fmt` for concatenating constant strings? +// Note: maybe use something like `const_fmt` for concatenating constant +// strings? const TASKS_PREFIX: &str = "/tidb/br-stream/info/"; /// A key that associates to some metadata. 
@@ -23,18 +25,29 @@ const TASKS_PREFIX: &str = "/tidb/br-stream/info/"; /// /checkpoint/// -> /// For the status of tasks: /// /pause/ -> "" +/// For the storage checkpoint ts of tasks: +/// /storage-checkpoint// -> /// ``` -#[derive(Clone)] +#[derive(Clone, Eq, PartialEq)] pub struct MetaKey(pub Vec); /// A simple key value pair of metadata. -#[derive(Clone, Debug)] +#[derive(Clone, Eq, PartialEq)] pub struct KeyValue(pub MetaKey, pub Vec); +impl std::fmt::Debug for KeyValue { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("KV") + .field(&self.0) + .field(&format_args!("{}", self.1.escape_ascii())) + .finish() + } +} + impl std::fmt::Debug for MetaKey { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_tuple("MetaKey") - .field(&self.0.escape_ascii()) + f.debug_tuple("K") + .field(&format_args!("{}", self.0.escape_ascii())) .finish() } } @@ -57,7 +70,8 @@ impl KeyValue { } /// Take the start-key and end-key from a metadata key-value pair. - /// example: `KeyValue(/ranges/, ) -> (, )` + /// example: `KeyValue(/ranges/, ) -> + /// (, )` pub fn take_range(&mut self, task_name: &str) -> (Vec, Vec) { let prefix_len = MetaKey::ranges_prefix_len(task_name); (self.take_key()[prefix_len..].to_vec(), self.take_value()) @@ -99,19 +113,47 @@ impl MetaKey { ranges } - /// The key of next backup ts of some region in some store. - pub fn next_backup_ts_of(name: &str, store_id: u64) -> Self { - let base = Self::next_backup_ts(name); - let mut buf = bytes::BytesMut::from(base.0.as_slice()); - buf.put_u64(store_id); - Self(buf.to_vec()) - } - // The prefix for next backup ts. pub fn next_backup_ts(name: &str) -> Self { Self(format!("{}{}/{}/", PREFIX, PATH_NEXT_BACKUP_TS, name).into_bytes()) } + /// The key of next backup ts of some region in some store. 
+ pub fn next_backup_ts_of(name: &str, store_id: u64) -> Self { + Self( + format!( + "{}{}/{}/store/{}", + PREFIX, PATH_NEXT_BACKUP_TS, name, store_id + ) + .into_bytes(), + ) + } + + pub fn next_bakcup_ts_of_region(name: &str, region: &Region) -> Self { + Self( + format!( + "{}{}/{}/region/{}/{}", + PREFIX, + PATH_NEXT_BACKUP_TS, + name, + region.id, + region.get_region_epoch().get_version() + ) + .into_bytes(), + ) + } + + /// defines the key of storage checkpoint-ts of task in a store. + pub fn storage_checkpoint_of(name: &str, store_id: u64) -> Self { + Self( + format!( + "{}{}/{}/{}", + PREFIX, PATH_STORAGE_CHECKPOINT, name, store_id + ) + .into_bytes(), + ) + } + pub fn pause_prefix_len() -> usize { Self::pause_prefix().0.len() } @@ -125,10 +167,18 @@ impl MetaKey { Self(format!("{}{}/{}", PREFIX, PATH_PAUSE, name).into_bytes()) } + pub fn last_errors_of(name: &str) -> Self { + Self(format!("{}{}/{}", PREFIX, PATH_LAST_ERROR, name).into_bytes()) + } + pub fn last_error_of(name: &str, store: u64) -> Self { Self(format!("{}{}/{}/{}", PREFIX, PATH_LAST_ERROR, name, store).into_bytes()) } + pub fn central_global_checkpoint_of(name: &str) -> Self { + Self(format!("{}/checkpoint/{}/central_global", PREFIX, name).into_bytes()) + } + /// return the key that keeps the range [self, self.next()) contains only /// `self`. pub fn next(&self) -> Self { @@ -140,16 +190,7 @@ impl MetaKey { /// return the key that keeps the range [self, self.next_prefix()) contains /// all keys with the prefix `self`. 
pub fn next_prefix(&self) -> Self { - let mut next_prefix = self.clone(); - for i in (0..next_prefix.0.len()).rev() { - if next_prefix.0[i] == u8::MAX { - next_prefix.0.pop(); - } else { - next_prefix.0[i] += 1; - break; - } - } - next_prefix + Self(tikv_util::codec::next_prefix_of(self.0.clone())) } } diff --git a/components/backup-stream/src/metadata/metrics.rs b/components/backup-stream/src/metadata/metrics.rs index f4ea1258ab7..1dea498834e 100644 --- a/components/backup-stream/src/metadata/metrics.rs +++ b/components/backup-stream/src/metadata/metrics.rs @@ -16,4 +16,10 @@ lazy_static! { "metadata event(task_add, task_removed, error) count.", &["type"], }.unwrap(); + + pub static ref METADATA_KEY_OPERATION: IntCounterVec = register_int_counter_vec! { + "tikv_log_backup_metadata_key_operation", + "the operation over keys", + &["type"], + }.unwrap(); } diff --git a/components/backup-stream/src/metadata/mod.rs b/components/backup-stream/src/metadata/mod.rs index a49eb305fa1..a96e2f9bcb6 100644 --- a/components/backup-stream/src/metadata/mod.rs +++ b/components/backup-stream/src/metadata/mod.rs @@ -1,10 +1,12 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +mod checkpoint_cache; mod client; pub mod keys; mod metrics; pub mod store; -mod test; +pub mod test; -pub use client::{MetadataClient, MetadataEvent, StreamTask}; +pub use client::{Checkpoint, CheckpointProvider, MetadataClient, MetadataEvent, StreamTask}; +#[cfg(feature = "metastore-etcd")] pub use store::lazy_etcd::{ConnectionConfig, LazyEtcdClient}; diff --git a/components/backup-stream/src/metadata/store/etcd.rs b/components/backup-stream/src/metadata/store/etcd.rs index 7da46ea5dbf..62a246a08ef 100644 --- a/components/backup-stream/src/metadata/store/etcd.rs +++ b/components/backup-stream/src/metadata/store/etcd.rs @@ -1,21 +1,33 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{pin::Pin, sync::Arc}; +use std::{ + cmp::Ordering, + collections::{HashMap, HashSet}, + pin::Pin, + sync::{Arc, Weak}, + time::Duration, +}; use async_trait::async_trait; use etcd_client::{ - DeleteOptions, EventType, GetOptions, SortOrder, SortTarget, Txn, TxnOp, WatchOptions, + Client, Compare, CompareOp, DeleteOptions, EventType, GetOptions, Member, PutOptions, + SortOrder, SortTarget, Txn, TxnOp, WatchOptions, }; use futures::StreamExt; -use tikv_util::warn; +use tikv_util::{info, warn}; use tokio::sync::Mutex; use tokio_stream::Stream; -use super::{GetExtra, GetResponse, Keys, KvChangeSubscription, KvEventType, MetaStore, Snapshot}; +use super::{ + GetExtra, GetResponse, Keys, KvChangeSubscription, KvEventType, MetaStore, Snapshot, + TransactionOp, +}; use crate::{ - errors::Result, + annotate, + errors::{Error, EtcdErrorExt, Result}, metadata::{ keys::{KeyValue, MetaKey}, + metrics::METADATA_KEY_OPERATION, store::{KvEvent, Subscription}, }, }; @@ -24,6 +36,187 @@ use crate::{ #[derive(Clone)] pub struct EtcdStore(Arc>); +#[derive(Default)] +pub(super) struct TopologyUpdater { + last_urls: HashSet, + client: Weak>, + + // back off configs + pub(super) loop_interval: Duration, + pub(super) loop_failure_back_off: Duration, +} + +impl std::fmt::Debug for TopologyUpdater { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("TopologyUpdater") + .field("last_urls", &self.last_urls) + .finish() + } +} + +#[async_trait] +pub(super) trait ClusterInfoProvider { + async fn get_members(&mut self) -> Result>; + async fn add_endpoint(&mut self, endpoint: &str) -> Result<()>; + async fn remove_endpoint(&mut self, endpoint: &str) -> Result<()>; +} + +#[async_trait] +impl ClusterInfoProvider for Client { + async fn get_members(&mut self) -> Result> { + let result = self.member_list().await?; + Ok(result.members().to_vec()) + } + + async fn add_endpoint(&mut self, endpoint: &str) -> Result<()> { + Client::add_endpoint(self, 
endpoint) + .await + .map_err(|err| annotate!(err, "during adding the endpoint {}", endpoint))?; + Ok(()) + } + + async fn remove_endpoint(&mut self, endpoint: &str) -> Result<()> { + Client::remove_endpoint(self, endpoint) + .await + .map_err(|err| annotate!(err, "during removing the endpoint {}", endpoint))?; + Ok(()) + } +} + +#[derive(Debug, Clone, Copy)] +enum DiffType { + Add, + Remove, +} + +#[derive(Clone)] +struct Diff { + diff_type: DiffType, + url: String, +} + +impl std::fmt::Debug for Diff { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let syn = match self.diff_type { + DiffType::Add => "+", + DiffType::Remove => "-", + }; + write!(f, "{}{}", syn, self.url) + } +} + +impl TopologyUpdater { + // Note: we may require the initial endpoints from the arguments directly. + // So the internal map won't get inconsistent when the cluster config changed + // during initializing. + // But that is impossible for now because we cannot query the node ID before + // connecting. 
+ pub fn new(cluster_ref: Weak>) -> Self { + Self { + last_urls: Default::default(), + client: cluster_ref, + + loop_interval: Duration::from_secs(60), + loop_failure_back_off: Duration::from_secs(10), + } + } + + pub fn init(&mut self, members: impl Iterator) { + for mem in members { + self.last_urls.insert(mem); + } + } + + fn diff(&self, incoming: &[Member]) -> Vec { + let newer = incoming + .iter() + .flat_map(|mem| mem.client_urls().iter()) + .collect::>(); + let mut result = vec![]; + for url in &newer { + if !self.last_urls.contains(*url) { + result.push(Diff { + diff_type: DiffType::Add, + url: String::clone(url), + }) + } + } + for url in &self.last_urls { + if !newer.contains(url) { + result.push(Diff { + diff_type: DiffType::Remove, + url: String::clone(url), + }) + } + } + result + } + + fn apply(&mut self, diff: &Diff) -> Option { + match diff.diff_type { + DiffType::Add => match self.last_urls.insert(diff.url.clone()) { + true => None, + false => Some(format!( + "the member to adding with url {} overrides existing urls.", + diff.url + )), + }, + DiffType::Remove => match self.last_urls.remove(&diff.url) { + true => None, + false => Some(format!( + "the member to remove with url {} hasn't been added.", + diff.url + )), + }, + } + } + + async fn update_topology_by(&mut self, cli: &mut C, diff: &Diff) -> Result<()> { + match diff.diff_type { + DiffType::Add => cli.add_endpoint(&diff.url).await?, + DiffType::Remove => cli.remove_endpoint(&diff.url).await?, + } + Ok(()) + } + + async fn do_update(&mut self, cli: &mut C) -> Result<()> { + let cluster = cli.get_members().await?; + let diffs = self.diff(cluster.as_slice()); + if !diffs.is_empty() { + info!("log backup updating store topology."; "diffs" => ?diffs, "current_state" => ?self); + } + for diff in diffs { + match self.apply(&diff) { + Some(warning) => { + warn!("log backup meet some wrong status when updating PD clients, skipping this update."; "warn" => %warning); + } + None => 
self.update_topology_by(cli, &diff).await?, + } + } + Result::Ok(()) + } + + pub(super) async fn update_topology_loop(&mut self) { + while let Some(cli) = self.client.upgrade() { + let mut lock = cli.lock().await; + let result = self.do_update(&mut lock).await; + drop(lock); + match result { + Ok(_) => tokio::time::sleep(self.loop_interval).await, + Err(err) => { + err.report("during updating etcd topology"); + tokio::time::sleep(self.loop_failure_back_off).await; + } + } + } + } + + pub async fn main_loop(mut self) { + info!("log backup topology updater finish initialization."; "current_state" => ?self); + self.update_topology_loop().await + } +} + impl EtcdStore { pub fn connect, S: AsRef<[E]>>(endpoints: S) -> Self { // TODO remove block_on @@ -31,6 +224,10 @@ impl EtcdStore { futures::executor::block_on(etcd_client::Client::connect(&endpoints, None)).unwrap(); Self(Arc::new(Mutex::new(cli))) } + + pub fn inner(&self) -> &Arc> { + &self.0 + } } impl From for EtcdStore { @@ -50,13 +247,14 @@ impl From for KvEventType { impl From for KeyValue { fn from(kv: etcd_client::KeyValue) -> Self { - // TODO: we can move out the vector in the KeyValue struct here. (instead of copying.) - // But that isn't possible for now because: + // TODO: we can move out the vector in the KeyValue struct here. (instead of + // copying.) But that isn't possible for now because: // - The raw KV pair(defined by the protocol buffer of etcd) is private. - // - That did could be exported by `pub-fields` feature of the client. - // However that feature isn't published in theirs Cargo.toml (Is that a mistake?). - // - Indeed, we can use `mem::transmute` here because `etcd_client::KeyValue` has `#[repr(transparent)]`. - // But before here become a known bottle neck, I'm not sure whether it's worthwhile for involving unsafe code. + // - That did could be exported by `pub-fields` feature of the client. However + // that feature isn't published in theirs Cargo.toml (Is that a mistake?). 
+ // - Indeed, we can use `mem::transmute` here because `etcd_client::KeyValue` + // has `#[repr(transparent)]`. But before here become a known bottle neck, I'm + // not sure whether it's worthwhile for involving unsafe code. KeyValue(MetaKey(kv.key().to_owned()), kv.value().to_owned()) } } @@ -64,7 +262,7 @@ impl From for KeyValue { /// Prepare the etcd options required by the keys. /// Return the start key for requesting. macro_rules! prepare_opt { - ($opt: ident, $keys: expr) => { + ($opt:ident, $keys:expr) => { match $keys { Keys::Prefix(key) => { $opt = $opt.with_prefix(); @@ -91,11 +289,6 @@ impl MetaStore for EtcdStore { }) } - async fn set(&self, pair: KeyValue) -> Result<()> { - self.0.lock().await.put(pair.0, pair.1, None).await?; - Ok(()) - } - async fn watch(&self, keys: Keys, start_rev: i64) -> Result { let mut opt = WatchOptions::new(); let key = prepare_opt!(opt, keys); @@ -106,17 +299,32 @@ impl MetaStore for EtcdStore { |events| -> Pin> + Send>> { match events { Err(err) => Box::pin(tokio_stream::once(Err(err.into()))), - Ok(events) => Box::pin(tokio_stream::iter( - // TODO: remove the copy here via access the protobuf field directly. - #[allow(clippy::unnecessary_to_owned)] - events.events().to_owned().into_iter().filter_map(|event| { - let kv = event.kv()?; - Some(Ok(KvEvent { - kind: event.event_type().into(), - pair: kv.clone().into(), - })) - }), - )), + Ok(events) => { + if events.compact_revision() > 0 && events.canceled() { + return Box::pin(tokio_stream::once(Err(Error::Etcd( + EtcdErrorExt::RevisionCompacted { + current: events.compact_revision(), + }, + )))); + } + if events.canceled() { + return Box::pin(tokio_stream::once(Err(Error::Etcd( + EtcdErrorExt::WatchCanceled, + )))); + } + Box::pin(tokio_stream::iter( + // TODO: remove the copy here via access the protobuf field + // directly. 
+ #[allow(clippy::unnecessary_to_owned)] + events.events().to_owned().into_iter().filter_map(|event| { + let kv = event.kv()?; + Some(Ok(KvEvent { + kind: event.event_type().into(), + pair: kv.clone().into(), + })) + }), + )) + } } }, )), @@ -128,6 +336,20 @@ impl MetaStore for EtcdStore { }) } + async fn txn(&self, t: super::Transaction) -> Result<()> { + let mut cli = self.0.lock().await; + let txns = Self::make_txn(&mut cli, t).await?; + for txn in txns { + cli.txn(txn).await?; + } + Ok(()) + } + + async fn set(&self, pair: KeyValue) -> Result<()> { + self.0.lock().await.put(pair.0, pair.1, None).await?; + Ok(()) + } + async fn delete(&self, keys: Keys) -> Result<()> { let mut opt = DeleteOptions::new(); let key = prepare_opt!(opt, keys); @@ -136,31 +358,115 @@ impl MetaStore for EtcdStore { Ok(()) } - async fn txn(&self, t: super::Transaction) -> Result<()> { - self.0.lock().await.txn(t.into()).await?; + async fn txn_cond(&self, txn: super::CondTransaction) -> Result<()> { + let mut cli = self.0.lock().await; + let txn = Self::make_conditional_txn(&mut cli, txn).await?; + cli.txn(txn).await?; Ok(()) } } -impl From for Txn { - fn from(etcd_txn: super::Transaction) -> Txn { - let txn = Txn::default(); - txn.and_then( - etcd_txn - .into_ops() - .into_iter() - .map(|op| match op { - super::TransactionOp::Put(mut pair) => { - TxnOp::put(pair.take_key(), pair.take_value(), None) - } - super::TransactionOp::Delete(rng) => { - let mut opt = DeleteOptions::new(); - let key = prepare_opt!(opt, rng); - TxnOp::delete(key, Some(opt)) - } - }) - .collect::>(), - ) +impl EtcdStore { + fn collect_leases_needed(txn: &super::Transaction) -> HashSet { + txn.ops + .iter() + .filter_map(|op| match op { + TransactionOp::Put(_, opt) if opt.ttl.as_secs() > 0 => Some(opt.ttl), + _ => None, + }) + .collect() + } + + async fn make_leases( + cli: &mut Client, + needed: HashSet, + ) -> Result> { + let mut map = HashMap::with_capacity(needed.len()); + for lease_time in needed { + let 
lease_id = cli.lease_grant(lease_time.as_secs() as _, None).await?.id(); + map.insert(lease_time, lease_id); + } + Ok(map) + } + + fn partition_txns(mut txn: super::Transaction, leases: HashMap) -> Vec { + txn.ops + .chunks_mut(128) + .map(|txn| Txn::default().and_then(Self::to_txn(txn, &leases))) + .collect() + } + + fn to_compare(cond: super::Condition) -> Compare { + let op = match cond.result { + Ordering::Less => CompareOp::Less, + Ordering::Equal => CompareOp::Equal, + Ordering::Greater => CompareOp::Greater, + }; + Compare::value(cond.over_key, op, cond.arg) + } + + /// Convert the transaction operations to etcd transaction ops. + fn to_txn(ops: &mut [super::TransactionOp], leases: &HashMap) -> Vec { + ops.iter_mut().map(|op| match op { + TransactionOp::Put(key, opt) => { + let opts = if opt.ttl.as_secs() > 0 { + let lease = leases.get(&opt.ttl); + match lease { + None => { + warn!("lease not found, the request key may not have a ttl"; "dur" => ?opt.ttl); + None + } + Some(lease_id) => { + Some(PutOptions::new().with_lease(*lease_id)) + } + } + } else { + None + }; + TxnOp::put(key.take_key(), key.take_value(), opts) + }, + TransactionOp::Delete(rng) => { + let rng = std::mem::replace(rng, Keys::Key(MetaKey(vec![]))); + let mut opt = DeleteOptions::new(); + let key = prepare_opt!(opt, rng); + TxnOp::delete(key, Some(opt)) + }, + }).collect::>() + } + + /// Make a conditional txn. + /// For now, this wouldn't split huge transaction into smaller ones, + /// so when playing with etcd in PD, conditional transaction should be + /// small. 
+ async fn make_conditional_txn( + cli: &mut Client, + mut txn: super::CondTransaction, + ) -> Result { + let cond = Self::to_compare(txn.cond); + + let mut leases_needed = Self::collect_leases_needed(&txn.success); + leases_needed.extend(Self::collect_leases_needed(&txn.failure).into_iter()); + let leases = Self::make_leases(cli, leases_needed).await?; + let success = Self::to_txn(&mut txn.success.ops, &leases); + let failure = Self::to_txn(&mut txn.failure.ops, &leases); + Ok(Txn::new().when([cond]).and_then(success).or_else(failure)) + } + + async fn make_txn(cli: &mut Client, etcd_txn: super::Transaction) -> Result> { + let (put_cnt, delete_cnt) = etcd_txn.ops.iter().fold((0, 0), |(p, d), item| match item { + TransactionOp::Put(..) => (p + 1, d), + TransactionOp::Delete(_) => (p, d + 1), + }); + METADATA_KEY_OPERATION + .with_label_values(&["put"]) + .inc_by(put_cnt); + METADATA_KEY_OPERATION + .with_label_values(&["del"]) + .inc_by(delete_cnt); + let needed_leases = Self::collect_leases_needed(&etcd_txn); + let leases = Self::make_leases(cli, needed_leases).await?; + let txns = Self::partition_txns(etcd_txn, leases); + Ok(txns) } } @@ -196,3 +502,126 @@ impl Snapshot for EtcdSnapshot { self.revision } } + +#[cfg(test)] +mod test { + use std::{ + collections::{HashMap, HashSet}, + fmt::Display, + sync::Arc, + time::Duration, + }; + + use async_trait::async_trait; + use etcd_client::{proto::PbMember, Member}; + use tokio::{sync::Mutex, time::timeout}; + + use super::{ClusterInfoProvider, TopologyUpdater}; + use crate::errors::Result; + + #[derive(Default, Debug)] + struct FakeCluster { + id_alloc: u64, + members: HashMap, + endpoints: HashSet, + } + + #[async_trait] + impl ClusterInfoProvider for FakeCluster { + async fn get_members(&mut self) -> Result> { + let members = self.members.values().cloned().collect(); + Ok(members) + } + + async fn add_endpoint(&mut self, endpoint: &str) -> Result<()> { + self.endpoints.insert(endpoint.to_owned()); + Ok(()) + } + + 
async fn remove_endpoint(&mut self, endpoint: &str) -> Result<()> { + self.endpoints.remove(endpoint); + Ok(()) + } + } + + impl FakeCluster { + fn new_id(&mut self) -> u64 { + let i = self.id_alloc; + self.id_alloc += 1; + i + } + + fn init_with_member(&mut self, n: usize) -> Vec { + let mut endpoints = Vec::with_capacity(n); + for _ in 0..n { + let mem = self.add_member(); + let url = format!("fakestore://{}", mem); + self.endpoints.insert(url.clone()); + endpoints.push(url); + } + endpoints + } + + fn add_member(&mut self) -> u64 { + let id = self.new_id(); + let mut mem = PbMember::default(); + mem.id = id; + mem.client_ur_ls = vec![format!("fakestore://{}", id)]; + // Safety: `Member` is #[repr(transparent)]. + self.members.insert(id, unsafe { std::mem::transmute(mem) }); + id + } + + fn remove_member(&mut self, id: u64) -> bool { + self.members.remove(&id).is_some() + } + + fn check_consistency(&self, message: impl Display) { + let urls = self + .members + .values() + .flat_map(|mem| mem.client_urls().iter().cloned()) + .collect::>(); + assert_eq!( + urls, self.endpoints, + "{}: consistency check not passed.", + message + ); + } + } + + #[test] + fn test_topology_updater() { + let mut c = FakeCluster::default(); + let eps = c.init_with_member(3); + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + let sc = Arc::new(Mutex::new(c)); + let mut tu = TopologyUpdater::new(Arc::downgrade(&sc)); + tu.loop_failure_back_off = Duration::ZERO; + tu.loop_interval = Duration::from_millis(100); + tu.init(eps.into_iter()); + + { + let mut sc = sc.blocking_lock(); + sc.check_consistency("after init"); + sc.add_member(); + rt.block_on(tu.do_update(&mut sc)).unwrap(); + sc.check_consistency("adding nodes"); + sc.add_member(); + sc.add_member(); + rt.block_on(tu.do_update(&mut sc)).unwrap(); + sc.check_consistency("adding more nodes"); + assert!(sc.remove_member(0), "{:?}", sc); + rt.block_on(tu.do_update(&mut sc)).unwrap(); + 
sc.check_consistency("removing nodes"); + } + + drop(sc); + rt.block_on(async { timeout(Duration::from_secs(1), tu.update_topology_loop()).await }) + .unwrap() + } +} diff --git a/components/backup-stream/src/metadata/store/lazy_etcd.rs b/components/backup-stream/src/metadata/store/lazy_etcd.rs index 61145455419..3b697dae9b9 100644 --- a/components/backup-stream/src/metadata/store/lazy_etcd.rs +++ b/components/backup-stream/src/metadata/store/lazy_etcd.rs @@ -1,60 +1,142 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{sync::Arc, time::Duration}; +use std::{ + sync::Arc, + time::{Duration, SystemTime}, +}; -use etcd_client::{ConnectOptions, Error as EtcdError, TlsOptions}; +use etcd_client::{ConnectOptions, Error as EtcdError, OpenSslClientConfig}; use futures::Future; -use tikv_util::stream::RetryError; -use tokio::sync::OnceCell; +use openssl::{ + pkey::PKey, + x509::{verify::X509VerifyFlags, X509}, +}; +use security::SecurityManager; +use tikv_util::{ + info, + stream::{RetryError, RetryExt}, + warn, +}; +use tokio::sync::Mutex as AsyncMutex; -use super::{etcd::EtcdSnapshot, EtcdStore, MetaStore}; +use super::{ + etcd::{EtcdSnapshot, TopologyUpdater}, + EtcdStore, MetaStore, +}; use crate::errors::{ContextualResultExt, Result}; +const RPC_TIMEOUT: Duration = Duration::from_secs(30); + #[derive(Clone)] -pub struct LazyEtcdClient(Arc); +pub struct LazyEtcdClient(Arc>); +#[derive(Clone)] pub struct ConnectionConfig { - pub tls: Option, + pub tls: Arc, pub keep_alive_interval: Duration, pub keep_alive_timeout: Duration, } +impl Default for ConnectionConfig { + fn default() -> Self { + Self { + tls: Default::default(), + keep_alive_interval: Duration::from_secs(10), + keep_alive_timeout: Duration::from_secs(3), + } + } +} + +impl std::fmt::Debug for ConnectionConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ConnectionConfig") + .field("keep_alive_interval", &self.keep_alive_interval) + 
.field("keep_alive_timeout", &self.keep_alive_timeout) + .finish() + } +} + impl ConnectionConfig { /// Convert the config to the connection option. fn to_connection_options(&self) -> ConnectOptions { let mut opts = ConnectOptions::new(); - if let Some(tls) = &self.tls { - opts = opts.with_tls(tls.clone()) + if let Some(tls) = &self + .tls + .client_suite() + .map_err(|err| warn!("failed to load client suite!"; "err" => %err)) + .ok() + { + opts = opts.with_openssl_tls( + OpenSslClientConfig::default() + .ca_cert_pem(&tls.ca) + // Some of users may prefer using multi-level self-signed certs. + // In this scenario, we must set this flag or openssl would probably complain it cannot found the root CA. + // (Because the flags we provide allows users providing exactly one CA cert.) + // We haven't make it configurable because it is enabled in gRPC by default too. + // TODO: Perhaps implement grpc-io based etcd client, fully remove the difference between gRPC TLS and our custom TLS? + .manually(|c| c.cert_store_mut().set_flags(X509VerifyFlags::PARTIAL_CHAIN)) + .manually(|c| { + let mut client_certs= X509::stack_from_pem(&tls.client_cert)?; + let client_key = PKey::private_key_from_pem(&tls.client_key.0)?; + if !client_certs.is_empty() { + c.set_certificate(&client_certs[0])?; + } + if client_certs.len() > 1 { + for i in client_certs.drain(1..) 
{ + c.add_extra_chain_cert(i)?; + } + } + c.set_private_key(&client_key)?; + Ok(()) + }), + ) } - opts = opts.with_keep_alive(self.keep_alive_interval, self.keep_alive_timeout); + opts = opts + .with_keep_alive(self.keep_alive_interval, self.keep_alive_timeout) + .with_keep_alive_while_idle(false) + .with_timeout(RPC_TIMEOUT); + opts } } impl LazyEtcdClient { pub fn new(endpoints: &[String], conf: ConnectionConfig) -> Self { - Self(Arc::new(LazyEtcdClientInner { - opt: conf.to_connection_options(), - endpoints: endpoints.iter().map(ToString::to_string).collect(), - cli: OnceCell::new(), - })) + let mut inner = LazyEtcdClientInner::new(endpoints, conf); + inner.normalize_urls(); + Self(Arc::new(AsyncMutex::new(inner))) } -} -impl std::ops::Deref for LazyEtcdClient { - type Target = LazyEtcdClientInner; + // For testing -- check whether the endpoints are properly normalized. + #[cfg(test)] + pub(super) fn endpoints(&self) -> Vec { + self.0.blocking_lock().endpoints.clone() + } - fn deref(&self) -> &Self::Target { - Arc::deref(&self.0) + async fn get_cli(&self) -> Result { + let mut l = self.0.lock().await; + l.get_cli().await.cloned() } } #[derive(Clone)] pub struct LazyEtcdClientInner { - opt: ConnectOptions, + conf: ConnectionConfig, endpoints: Vec, - cli: OnceCell, + last_modified: Option, + cli: Option, +} + +impl LazyEtcdClientInner { + fn new(endpoints: &[String], conf: ConnectionConfig) -> Self { + LazyEtcdClientInner { + conf, + endpoints: endpoints.iter().map(ToString::to_string).collect(), + last_modified: None, + cli: None, + } + } } fn etcd_error_is_retryable(etcd_err: &EtcdError) -> bool { @@ -62,7 +144,9 @@ fn etcd_error_is_retryable(etcd_err: &EtcdError) -> bool { EtcdError::InvalidArgs(_) | EtcdError::InvalidUri(_) | EtcdError::Utf8Error(_) - | EtcdError::InvalidHeaderValue(_) => false, + | EtcdError::InvalidHeaderValue(_) + | EtcdError::EndpointError(_) + | EtcdError::OpenSsl(_) => false, EtcdError::TransportError(_) | EtcdError::IoError(_) | 
EtcdError::WatchError(_) @@ -78,6 +162,7 @@ fn etcd_error_is_retryable(etcd_err: &EtcdError) -> bool { } } +#[derive(Debug)] struct RetryableEtcdError(EtcdError); impl RetryError for RetryableEtcdError { @@ -97,27 +182,61 @@ where F: Future>, { use futures::TryFutureExt; - let r = tikv_util::stream::retry(move || action().err_into::()).await; + let r = tikv_util::stream::retry_ext( + move || action().err_into::(), + RetryExt::default().with_fail_hook(|err| info!("retry it"; "err" => ?err)), + ) + .await; r.map_err(|err| err.0.into()) } impl LazyEtcdClientInner { - async fn connect(&self) -> Result { + fn normalize_urls(&mut self) { + let enabled_tls = self.conf.tls.client_suite().is_ok(); + for endpoint in self.endpoints.iter_mut() { + // Don't touch them when the schemes already provided. + // Given etcd is based on gRPC (which relies on HTTP/2), + // there shouldn't be other schemes available (Hopefully...) + if endpoint.starts_with("http://") || endpoint.starts_with("https://") { + continue; + } + let expected_scheme = if enabled_tls { "https" } else { "http" }; + *endpoint = format!("{}://{}", expected_scheme, endpoint) + } + info!("log backup normalized etcd endpoints"; "endpoints" => ?self.endpoints); + } + + async fn connect(&mut self) -> Result<&EtcdStore> { let store = retry(|| { // For now, the interface of the `etcd_client` doesn't us to control - // how to create channels when connecting, hence we cannot update the tls config at runtime. - // TODO: maybe add some method like `with_channel` for `etcd_client`, and adapt the `SecurityManager` API, - // instead of doing everything by own. - etcd_client::Client::connect(self.endpoints.clone(), Some(self.opt.clone())) + // how to create channels when connecting, hence we cannot update the tls config + // at runtime, now what we did is manually check that each time we are getting + // the clients. 
+ etcd_client::Client::connect( + self.endpoints.clone(), + Some(self.conf.to_connection_options()), + ) }) .await .context("during connecting to the etcd")?; - Ok(EtcdStore::from(store)) + let store = EtcdStore::from(store); + let mut updater = TopologyUpdater::new(Arc::downgrade(store.inner())); + self.cli = Some(store); + updater.init(self.endpoints.iter().cloned()); + tokio::task::spawn(updater.main_loop()); + Ok(self.cli.as_ref().unwrap()) } - pub async fn get_cli(&self) -> Result<&EtcdStore> { - let store = self.cli.get_or_try_init(|| self.connect()).await?; - Ok(store) + pub async fn get_cli(&mut self) -> Result<&EtcdStore> { + let modified = self.conf.tls.get_config().is_modified(&mut self.last_modified) + // Don't reload once we cannot check whether it is modified. + // Because when TLS disabled, this would always fail. + .unwrap_or(false); + if !modified && self.cli.is_some() { + return Ok(self.cli.as_ref().unwrap()); + } + info!("log backup reconnecting to the etcd service."; "tls_modified" => %modified, "connected_before" => %self.cli.is_some()); + self.connect().await } } @@ -126,7 +245,7 @@ impl MetaStore for LazyEtcdClient { type Snap = EtcdSnapshot; async fn snapshot(&self) -> Result { - self.0.get_cli().await?.snapshot().await + self.get_cli().await?.snapshot().await } async fn watch( @@ -134,10 +253,64 @@ impl MetaStore for LazyEtcdClient { keys: super::Keys, start_rev: i64, ) -> Result { - self.0.get_cli().await?.watch(keys, start_rev).await + self.get_cli().await?.watch(keys, start_rev).await } async fn txn(&self, txn: super::Transaction) -> Result<()> { - self.0.get_cli().await?.txn(txn).await + self.get_cli().await?.txn(txn).await + } + + async fn txn_cond(&self, txn: super::CondTransaction) -> Result<()> { + self.get_cli().await?.txn_cond(txn).await + } +} + +#[cfg(test)] +mod tests { + use std::{fs::File, io::Write, path::PathBuf, sync::Arc}; + + use security::{SecurityConfig, SecurityManager}; + use tempfile::TempDir; + + use 
super::LazyEtcdClient; + use crate::{errors::Result, metadata::ConnectionConfig}; + + #[test] + fn test_normalize_url() -> Result<()> { + let endpoints = ["http://pd-1".to_owned(), "pd-2".to_owned()]; + let le = LazyEtcdClient::new(&endpoints, Default::default()); + assert_eq!(le.endpoints(), &["http://pd-1", "http://pd-2"]); + + let tempdir = TempDir::new()?; + let write_all = |path: &PathBuf, content| { + let mut f = File::create(path)?; + f.write_all(content)?; + Result::Ok(()) + }; + let ca = tempdir.path().join("ca"); + let cert = tempdir.path().join("cert"); + let key = tempdir.path().join("key"); + write_all(&ca, b"CA :3")?; + write_all(&cert, b"Cert :D")?; + write_all(&key, b"Key X)")?; + + let cfg = SecurityConfig { + ca_path: ca.to_string_lossy().into_owned(), + cert_path: cert.to_string_lossy().into_owned(), + key_path: key.to_string_lossy().into_owned(), + + ..Default::default() + }; + let sm = SecurityManager::new(&cfg).unwrap(); + let endpoints = ["https://pd-1".to_owned(), "pd-2".to_owned()]; + let le = LazyEtcdClient::new( + &endpoints, + ConnectionConfig { + tls: Arc::new(sm), + ..Default::default() + }, + ); + assert_eq!(le.endpoints(), &["https://pd-1", "https://pd-2"]); + Result::Ok(()) } } diff --git a/components/backup-stream/src/metadata/store/mod.rs b/components/backup-stream/src/metadata/store/mod.rs index 58441d7ba72..7cecda9720e 100644 --- a/components/backup-stream/src/metadata/store/mod.rs +++ b/components/backup-stream/src/metadata/store/mod.rs @@ -1,14 +1,25 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +cfg_if::cfg_if! { + if #[cfg(feature = "metastore-etcd")] { + pub mod etcd; + pub mod lazy_etcd; + pub use etcd::EtcdStore; + } +} + +// Note: these mods also used for integration tests, +// so we cannot compile them only when `#[cfg(test)]`. +// (See https://github.com/rust-lang/rust/issues/84629) +// Maybe we'd better make a feature like `integration-test`? 
pub mod slash_etc; pub use slash_etc::SlashEtcStore; -pub mod etcd; -pub mod lazy_etcd; -use std::{future::Future, pin::Pin}; +pub mod pd; + +use std::{cmp::Ordering, future::Future, pin::Pin, time::Duration}; use async_trait::async_trait; -pub use etcd::EtcdStore; use tokio_stream::Stream; // ==== Generic interface definition ==== @@ -17,31 +28,81 @@ use crate::errors::Result; pub type BoxStream = Pin + Send>>; pub type BoxFuture = Pin + Send>>; +pub use pd::PdStore; #[derive(Debug, Default)] pub struct Transaction { ops: Vec, } +/// A condition for executing a transcation. +/// Compare value a key with arg. +#[derive(Debug)] +pub struct Condition { + over_key: Vec, + result: Ordering, + arg: Vec, +} + +impl Condition { + pub fn new(over_key: MetaKey, result: Ordering, arg: Vec) -> Self { + Self { + over_key: over_key.0, + result, + arg, + } + } +} + +/// A conditional transaction. +/// This would atomically evaluate the condition, and execute corresponding +/// transaction. +#[derive(Debug)] +pub struct CondTransaction { + cond: Condition, + success: Transaction, + failure: Transaction, +} + +impl CondTransaction { + pub fn new(cond: Condition, success: Transaction, failure: Transaction) -> Self { + Self { + cond, + success, + failure, + } + } +} + impl Transaction { fn into_ops(self) -> Vec { self.ops } - fn put(mut self, kv: KeyValue) -> Self { - self.ops.push(TransactionOp::Put(kv)); + pub fn put(mut self, kv: KeyValue) -> Self { + self.ops.push(TransactionOp::Put(kv, PutOption::default())); + self + } + + pub fn put_opt(mut self, kv: KeyValue, opt: PutOption) -> Self { + self.ops.push(TransactionOp::Put(kv, opt)); self } - fn delete(mut self, keys: Keys) -> Self { + pub fn delete(mut self, keys: Keys) -> Self { self.ops.push(TransactionOp::Delete(keys)); self } } +#[derive(Default, Debug)] +pub struct PutOption { + pub ttl: Duration, +} + #[derive(Debug)] pub enum TransactionOp { - Put(KeyValue), + Put(KeyValue, PutOption), Delete(Keys), } @@ -54,10 +115,19 
@@ pub struct WithRevision { pub inner: T, } +impl WithRevision { + pub fn map(self, f: impl FnOnce(T) -> R) -> WithRevision { + WithRevision { + revision: self.revision, + inner: f(self.inner), + } + } +} + /// The key set for getting. /// I guess there should be a `&[u8]` in meta key, /// but the etcd client requires Into> :( -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum Keys { Prefix(MetaKey), Range(MetaKey, MetaKey), @@ -106,7 +176,7 @@ pub trait Snapshot: Send + Sync + 'static { } } -#[derive(Debug)] +#[derive(Debug, Eq, PartialEq, Clone, Copy)] pub enum KvEventType { Put, Delete, @@ -140,8 +210,9 @@ pub trait MetaStore: Clone + Send + Sync { /// Can be canceled then by polling the `cancel` future in the Subscription. async fn watch(&self, keys: Keys, start_rev: i64) -> Result; /// Execute an atomic write (write batch) over the store. - /// Maybe support etcd-like compare operations? async fn txn(&self, txn: Transaction) -> Result<()>; + /// Execute an conditional transaction over the store. + async fn txn_cond(&self, txn: CondTransaction) -> Result<()>; /// Set a key in the store. /// Maybe rename it to `put` to keeping consistency with etcd? @@ -152,4 +223,13 @@ pub trait MetaStore: Clone + Send + Sync { async fn delete(&self, keys: Keys) -> Result<()> { self.txn(Transaction::default().delete(keys)).await } + /// Get the latest version of some keys. + async fn get_latest(&self, keys: Keys) -> Result>> { + let s = self.snapshot().await?; + let keys = s.get(keys).await?; + Ok(WithRevision { + revision: s.revision(), + inner: keys, + }) + } } diff --git a/components/backup-stream/src/metadata/store/pd.rs b/components/backup-stream/src/metadata/store/pd.rs new file mode 100644 index 00000000000..5b2e2b466e5 --- /dev/null +++ b/components/backup-stream/src/metadata/store/pd.rs @@ -0,0 +1,324 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{collections::VecDeque, fmt::Display, pin::Pin, task::ready}; + +use async_trait::async_trait; +use futures::{stream, Stream}; +use kvproto::meta_storagepb::{self as mpb, WatchResponse}; +use pd_client::meta_storage::{Get, MetaStorageClient, Put, Watch}; +use pin_project::pin_project; +use tikv_util::{box_err, info}; + +use super::{ + GetResponse, Keys, KvChangeSubscription, KvEvent, KvEventType, MetaStore, Snapshot, + WithRevision, +}; +use crate::{ + debug, + errors::{Error, Result}, + metadata::keys::{KeyValue, MetaKey, PREFIX}, +}; + +fn convert_kv(mut kv: mpb::KeyValue) -> KeyValue { + let k = kv.take_key(); + let v = kv.take_value(); + KeyValue(MetaKey(k), v) +} + +#[derive(Clone)] +pub struct PdStore { + client: M, +} + +impl PdStore { + pub fn new(s: M) -> Self { + Self { client: s } + } +} + +fn unimplemented(name: impl Display) -> Error { + Error::Io(std::io::Error::new( + std::io::ErrorKind::Unsupported, + format!("the behavior {} hasn't been implemented yet.", name), + )) +} + +#[pin_project] +struct PdWatchStream { + #[pin] + inner: S, + buf: VecDeque, +} + +impl PdWatchStream { + /// Create a new Watch Stream from PD, with a function to cancel the stream. 
+ fn new(inner: S) -> Self { + Self { + inner, + buf: Default::default(), + } + } +} + +impl>> Stream for PdWatchStream { + type Item = Result; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + loop { + let this = self.as_mut().project(); + let buf = this.buf; + if let Some(x) = buf.pop_front() { + return Some(Ok(x)).into(); + } + let resp = ready!(this.inner.poll_next(cx)); + match resp { + None => return None.into(), + Some(Err(err)) => return Some(Err(Error::Pd(err))).into(), + Some(Ok(mut x)) => { + if x.get_header().has_error() { + return Some(Err(Error::Other(box_err!( + "watch stream returns error: {:?}", + x.get_header().get_error() + )))) + .into(); + } + assert!(buf.is_empty()); + for mut e in x.take_events().into_iter() { + let ty = match e.get_type() { + kvproto::meta_storagepb::EventEventType::Put => KvEventType::Put, + kvproto::meta_storagepb::EventEventType::Delete => KvEventType::Delete, + }; + let kv = KvEvent { + kind: ty, + pair: convert_kv(e.take_kv()), + }; + buf.push_back(kv); + } + } + } + } + } +} + +#[async_trait] +impl Snapshot for RevOnly { + async fn get_extra(&self, _keys: Keys, _extra: super::GetExtra) -> Result { + Err(unimplemented("PdStore::snapshot::get")) + } + + fn revision(&self) -> i64 { + self.0 + } +} + +pub struct RevOnly(i64); + +#[async_trait] +impl< + St: Stream> + Send + 'static, + PD: MetaStorageClient + Clone, +> MetaStore for PdStore +{ + type Snap = RevOnly; + + async fn snapshot(&self) -> Result { + // hacking here: when we are doing point querying, the server won't return + // revision. So we are going to query a non-exist prefix here. + let rev = self + .client + .get(Get::of(PREFIX.as_bytes().to_vec()).prefixed().limit(0)) + .await? 
+ .get_header() + .get_revision(); + info!("pd meta client getting snapshot."; "rev" => %rev); + Ok(RevOnly(rev)) + } + + async fn watch( + &self, + keys: super::Keys, + start_rev: i64, + ) -> Result { + info!("pd meta client creating watch stream."; "keys" => ?keys, "rev" => %start_rev); + match keys { + Keys::Prefix(k) => { + use futures::stream::StreamExt; + let stream = self + .client + .watch(Watch::of(k).prefixed().from_rev(start_rev)); + let (stream, cancel) = stream::abortable(PdWatchStream::new(stream)); + Ok(KvChangeSubscription { + stream: stream.boxed(), + cancel: Box::pin(async move { cancel.abort() }), + }) + } + _ => Err(unimplemented("watch distinct keys or range of keys")), + } + } + + async fn txn(&self, _txn: super::Transaction) -> Result<()> { + Err(unimplemented("PdStore::txn")) + } + + async fn txn_cond(&self, _txn: super::CondTransaction) -> Result<()> { + Err(unimplemented("PdStore::txn_cond")) + } + + async fn set(&self, mut kv: KeyValue) -> Result<()> { + debug!("pd meta client setting."; "pair" => ?kv); + self.client + .put(Put::of(kv.take_key(), kv.take_value())) + .await?; + Ok(()) + } + + async fn get_latest(&self, keys: Keys) -> Result>> { + let spec = match keys.clone() { + Keys::Prefix(p) => Get::of(p).prefixed(), + Keys::Key(k) => Get::of(k), + Keys::Range(s, e) => Get::of(s).range_to(e), + }; + // Note: we skipped check `more` here, because we haven't make pager. 
+ let mut resp = self.client.get(spec).await?; + let inner = resp + .take_kvs() + .into_iter() + .map(convert_kv) + .collect::>(); + let revision = resp.get_header().get_revision(); + debug!("pd meta client getting."; "range" => ?keys, "rev" => %revision, "result" => ?inner); + Ok(WithRevision { inner, revision }) + } +} + +#[cfg(test)] +mod tests { + use std::{sync::Arc, time::Duration}; + + use futures::{Future, StreamExt}; + use pd_client::{ + meta_storage::{Checked, Source, Sourced}, + RpcClient, + }; + use test_pd::{mocker::MetaStorage, util::*, Server as PdServer}; + use tikv_util::config::ReadableDuration; + + use super::PdStore; + use crate::metadata::{ + keys::{KeyValue, MetaKey}, + store::{Keys, MetaStore}, + }; + + fn new_test_server_and_client( + factory: impl FnOnce(RpcClient) -> C, + ) -> (PdServer, PdStore) { + let server = PdServer::with_case(1, Arc::::default()); + let eps = server.bind_addrs(); + let client = + new_client_with_update_interval(eps, None, ReadableDuration(Duration::from_secs(99))); + (server, PdStore::new(factory(client))) + } + + fn w(f: impl Future) -> T { + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap() + .block_on(f) + } + + #[test] + fn test_query() { + let (_s, c) = new_test_server_and_client(|c| Sourced::new(Arc::new(c), Source::LogBackup)); + + let kv = |k, v: &str| KeyValue(MetaKey::task_of(k), v.as_bytes().to_vec()); + let insert = |k, v| w(c.set(kv(k, v))).unwrap(); + insert("a", "the signpost of flowers"); + insert("b", "the milky hills"); + insert("c", "the rusty sky"); + + let k = w(c.get_latest(Keys::Key(MetaKey::task_of("a")))).unwrap(); + assert_eq!( + k.inner.as_slice(), + [kv("a", "the signpost of flowers")].as_slice() + ); + let k = w(c.get_latest(Keys::Key(MetaKey::task_of("d")))).unwrap(); + assert_eq!(k.inner.as_slice(), [].as_slice()); + + let k = w(c.get_latest(Keys::Prefix(MetaKey::tasks()))).unwrap(); + assert_eq!( + k.inner.as_slice(), + [ + kv("a", "the signpost of 
flowers"), + kv("b", "the milky hills"), + kv("c", "the rusty sky"), + ] + .as_slice() + ) + } + + #[test] + fn test_watch() { + let (_s, c) = new_test_server_and_client(|c| Sourced::new(Arc::new(c), Source::LogBackup)); + let kv = |k, v: &str| KeyValue(MetaKey::task_of(k), v.as_bytes().to_vec()); + let insert = |k, v| w(c.set(kv(k, v))).unwrap(); + + insert("a", "the guest in vermilion"); + let res = w(c.get_latest(Keys::Prefix(MetaKey::tasks()))).unwrap(); + assert_eq!(res.inner.as_slice(), &[kv("a", "the guest in vermilion")]); + let mut ws = w(c.watch(Keys::Prefix(MetaKey::tasks()), res.revision + 1)).unwrap(); + let mut items = vec![]; + insert("a", "looking up at the ocean"); + items.push(w(ws.stream.next()).unwrap().unwrap()); + insert("b", "a folktale in the polar day"); + items.push(w(ws.stream.next()).unwrap().unwrap()); + w(ws.cancel); + assert!(w(ws.stream.next()).is_none()); + + assert_eq!(items[0].pair, kv("a", "looking up at the ocean")); + assert_eq!(items[1].pair, kv("b", "a folktale in the polar day")); + } + + #[test] + fn test_check_error() { + // Without AutoHeader, it will fail due to the source is empty. + let (_s, c) = new_test_server_and_client(|c| Checked::new(Arc::new(c))); + let kv = |k, v: &str| KeyValue(MetaKey::task_of(k), v.as_bytes().to_vec()); + let insert = |k, v| w(c.set(kv(k, v))); + + insert("c", "the rainbow-like summer").unwrap_err(); + w(c.get_latest(Keys::Key(MetaKey(vec![42u8])))).unwrap_err(); + assert!(w(c.watch(Keys::Key(MetaKey(vec![42u8])), 42)).is_err()); + } + + #[test] + fn test_retry() { + use tikv_util::defer; + + defer! 
{{ + fail::remove("meta_storage_get"); + }}; + let (_s, c) = new_test_server_and_client(|c| Sourced::new(Arc::new(c), Source::LogBackup)); + + let kv = |k, v: &str| KeyValue(MetaKey::task_of(k), v.as_bytes().to_vec()); + let insert = |k, v| w(c.set(kv(k, v))).unwrap(); + insert("rejectme", "this key would be rejected by the failpoint."); + + fail::cfg("meta_storage_get", "4*return").unwrap(); + let res = w(c.get_latest(Keys::Key(MetaKey::task_of("rejectme")))) + .expect("should success when temporary failing"); + assert_eq!(res.inner.len(), 1); + assert_eq!( + res.inner[0], + kv("rejectme", "this key would be rejected by the failpoint.") + ); + + // FIXME: this would take about 10s to run and influences unit tests run... + fail::cfg("meta_storage_get", "return").unwrap(); + w(c.get_latest(Keys::Key(MetaKey::task_of("rejectme")))) + .expect_err("should fail when ever failing"); + } +} diff --git a/components/backup-stream/src/metadata/store/slash_etc.rs b/components/backup-stream/src/metadata/store/slash_etc.rs index 48df7dbaaca..a564d069d14 100644 --- a/components/backup-stream/src/metadata/store/slash_etc.rs +++ b/components/backup-stream/src/metadata/store/slash_etc.rs @@ -8,14 +8,13 @@ use std::{ }; use async_trait::async_trait; -use slog_global::error; -use tikv_util::warn; use tokio::sync::{ mpsc::{self, Sender}, Mutex, }; use tokio_stream::StreamExt; +use super::{Condition, Keys}; use crate::{ errors::Result, metadata::{ @@ -33,11 +32,30 @@ struct Subscriber { tx: Sender, } +/// A key with revision. +#[derive(Default, Eq, PartialEq, Ord, PartialOrd, Clone)] +struct Key(Vec, i64); + +impl std::fmt::Debug for Key { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("Key") + .field(&format_args!("{}@{}", self.0.escape_ascii(), self.1)) + .finish() + } +} + +/// A value (maybe tombstone.) +#[derive(Debug, PartialEq, Clone)] +enum Value { + Val(Vec), + Del, +} + /// An in-memory, single versioned storage. 
/// Emulating some interfaces of etcd for testing. #[derive(Default)] pub struct SlashEtc { - items: BTreeMap, Vec>, + items: BTreeMap, // Maybe a range tree here if the test gets too slow. subs: HashMap, revision: i64, @@ -54,26 +72,15 @@ impl Snapshot for WithRevision { extra: crate::metadata::store::GetExtra, ) -> Result { let data = self.inner.lock().await; - if data.revision != self.revision { - warn!( - "snapshot expired (multi version isn't supported yet, you may read steal data): {} vs {}", - data.revision, self.revision - ); - } - let (start_key, end_key) = keys.into_bound(); - let mut kvs = data - .items - .range::<[u8], _>(( - Bound::Included(start_key.as_slice()), - Bound::Excluded(end_key.as_slice()), - )) - .map(|(k, v)| KeyValue(MetaKey(k.clone()), v.clone())) - .collect::>(); - // use iterator operations (instead of collect all kv pairs in the range) - // if the test case get too slow. (How can we figure out whether there are more?) + let mut kvs = data.get_key(keys); + if extra.desc_order { kvs.reverse(); } + + // use iterator operations (instead of collect all kv pairs in the range) + // if the test case get too slow. (How can we figure out whether there are + // more?) 
let more = if extra.limit > 0 { let more = kvs.len() > extra.limit; kvs.truncate(extra.limit); @@ -90,9 +97,37 @@ impl Snapshot for WithRevision { } impl SlashEtc { + fn alloc_rev(&mut self) -> i64 { + self.revision += 1; + self.revision + } + + fn get_key(&self, keys: super::Keys) -> Vec { + let (start_key, end_key) = keys.into_bound(); + let mvccs = self + .items + .range(( + Bound::Included(&Key(start_key, 0)), + Bound::Excluded(&Key(end_key, 0)), + )) + .collect::>(); + let kvs = mvccs + .as_slice() + .group_by(|k1, k2| k1.0.0 == k2.0.0) + .filter_map(|k| { + let (k, v) = k.last()?; + match v { + Value::Val(val) => Some(KeyValue(MetaKey(k.0.clone()), val.clone())), + Value::Del => None, + } + }) + .collect::>(); + kvs + } + async fn set(&mut self, mut pair: crate::metadata::keys::KeyValue) -> Result<()> { let data = self; - data.revision += 1; + let rev = data.alloc_rev(); for sub in data.subs.values() { if pair.key() < sub.end_key.as_slice() && pair.key() >= sub.start_key.as_slice() { sub.tx @@ -104,33 +139,37 @@ impl SlashEtc { .unwrap(); } } - data.items.insert(pair.take_key(), pair.take_value()); + data.items + .insert(Key(pair.take_key(), rev), Value::Val(pair.take_value())); Ok(()) } async fn delete(&mut self, keys: crate::metadata::store::Keys) -> Result<()> { - let mut data = self; + let data = self; let (start_key, end_key) = keys.into_bound(); - data.revision += 1; - for mut victim in data + let rev = data.alloc_rev(); + let mut v = data .items - .range::<[u8], _>(( - Bound::Included(start_key.as_slice()), - Bound::Excluded(end_key.as_slice()), + .range(( + Bound::Included(Key(start_key, 0)), + Bound::Excluded(Key(end_key, data.revision)), )) - .map(|(k, _)| k.clone()) - .collect::>() - { - data.items.remove(&victim); + .map(|(k, _)| Key::clone(k)) + .collect::>(); + v.dedup_by(|k1, k2| k1.0 == k2.0); + + for mut victim in v { + let k = Key(victim.0.clone(), rev); + data.items.insert(k, Value::Del); for sub in data.subs.values() { - if 
victim.as_slice() < sub.end_key.as_slice() - && victim.as_slice() >= sub.start_key.as_slice() + if victim.0.as_slice() < sub.end_key.as_slice() + && victim.0.as_slice() >= sub.start_key.as_slice() { sub.tx .send(KvEvent { kind: KvEventType::Delete, - pair: KeyValue(MetaKey(std::mem::take(&mut victim)), vec![]), + pair: KeyValue(MetaKey(std::mem::take(&mut victim.0)), vec![]), }) .await .unwrap(); @@ -139,6 +178,16 @@ impl SlashEtc { } Ok(()) } + + /// A tool for dumpling the whole storage when test failed. + /// Add this to test code temporarily for debugging. + #[allow(dead_code)] + pub fn dump(&self) { + println!(">>>>>>> /etc (revision = {}) <<<<<<<", self.revision); + for (k, v) in self.items.iter() { + println!("{:?} => {:?}", k, v); + } + } } #[async_trait] @@ -158,17 +207,34 @@ impl MetaStore for SlashEtcStore { start_rev: i64, ) -> Result { let mut data = self.lock().await; - if start_rev != data.revision + 1 { - error!( - "start from arbitrary revision is not supported yet; only watch (current_rev + 1) supported. (self.revision = {}; start_rev = {})", - data.revision, start_rev - ); - } let id = data.sub_id_alloc.get(); data.sub_id_alloc.set(id + 1); let this = self.clone(); - let (tx, rx) = mpsc::channel(64); + let (tx, rx) = mpsc::channel(1024); let (start_key, end_key) = keys.into_bound(); + + // Sending events from [start_rev, now) to the client. + let mut pending = data + .items + .iter() + .filter(|(k, _)| k.1 >= start_rev) + .collect::>(); + pending.sort_by_key(|(k, _)| k.1); + for (k, v) in pending { + let event = match v { + Value::Val(val) => KvEvent { + kind: KvEventType::Put, + pair: KeyValue(MetaKey(k.0.clone()), val.clone()), + }, + Value::Del => KvEvent { + kind: KvEventType::Delete, + pair: KeyValue(MetaKey(k.0.clone()), vec![]), + }, + }; + // Note: may panic if too many pending here? 
+ tx.send(event).await.expect("too many pending events"); + } + data.subs.insert( id, Subscriber { @@ -190,10 +256,27 @@ impl MetaStore for SlashEtcStore { let mut data = self.lock().await; for op in txn.into_ops() { match op { - super::TransactionOp::Put(kv) => data.set(kv).await?, + super::TransactionOp::Put(kv, _) => data.set(kv).await?, super::TransactionOp::Delete(range) => data.delete(range).await?, } } Ok(()) } + + async fn txn_cond(&self, txn: super::CondTransaction) -> Result<()> { + let l = self.lock().await; + let Condition { + over_key, + result, + arg, + } = txn.cond; + let success = l + .get_key(Keys::Key(MetaKey(over_key))) + .last() + .map(|k| k.0.0.cmp(&arg) == result) + .unwrap_or(false); + drop(l); + let do_txn = if success { txn.success } else { txn.failure }; + self.txn(do_txn).await + } } diff --git a/components/backup-stream/src/metadata/test.rs b/components/backup-stream/src/metadata/test.rs index bb5addd24a8..bb2b7fe1577 100644 --- a/components/backup-stream/src/metadata/test.rs +++ b/components/backup-stream/src/metadata/test.rs @@ -10,17 +10,17 @@ use std::{ use kvproto::brpb::{Noop, StorageBackend}; use tokio_stream::StreamExt; -use super::{MetadataClient, StreamTask}; +use super::{keys::MetaKey, MetadataClient, StreamTask}; use crate::{ errors::Result, metadata::{store::SlashEtcStore, MetadataEvent}, }; -fn test_meta_cli() -> MetadataClient { +pub fn test_meta_cli() -> MetadataClient { MetadataClient::new(SlashEtcStore::default(), 42) } -fn simple_task(name: &str) -> StreamTask { +pub fn simple_task(name: &str) -> StreamTask { let mut task = StreamTask::default(); task.info.set_name(name.to_owned()); task.info.set_start_ts(1); @@ -54,21 +54,7 @@ async fn test_basic() -> Result<()> { cli.insert_task_with_range(&task, ranges).await?; let remote_ranges = cli.ranges_of_task(name).await?.inner; assert_range_matches(remote_ranges, ranges); - let overlap_ranges = cli - .range_overlap_of_task(name, (b"7".to_vec(), b"9".to_vec())) - .await? 
- .inner; - assert_range_matches(overlap_ranges, &[(b"6", b"8"), (b"8", b"9")]); - let overlap_ranges = cli - .range_overlap_of_task(name, (b"1".to_vec(), b"5".to_vec())) - .await? - .inner; - assert_range_matches(overlap_ranges, &[(b"1", b"2"), (b"4", b"5")]); - let overlap_ranges = cli - .range_overlap_of_task(name, (b"1".to_vec(), b"4".to_vec())) - .await? - .inner; - assert_range_matches(overlap_ranges, &[(b"1", b"2")]); + Ok(()) } @@ -98,7 +84,7 @@ async fn test_watch() -> Result<()> { cli.insert_task_with_range(&task, &[]).await?; let initial_task_set = cli.get_tasks().await?; task_matches(initial_task_set.inner.as_slice(), &[task]); - let watcher = cli.events_from(initial_task_set.revision).await?; + let watcher = cli.events_from(initial_task_set.revision + 1).await?; let task2 = simple_task("simple_2"); cli.insert_task_with_range(&task2, &[]).await?; cli.remove_task("simple_1").await?; @@ -121,17 +107,65 @@ async fn test_progress() -> Result<()> { let cli = test_meta_cli(); let task = simple_task("simple_1"); cli.insert_task_with_range(&task, &[]).await?; - let progress = cli.progress_of_task(&task.info.name).await?; + let progress = cli.global_progress_of_task(&task.info.name).await?; assert_eq!(progress, task.info.start_ts); - cli.step_task(&task.info.name, 42).await?; - let progress = cli.progress_of_task(&task.info.name).await?; + cli.set_local_task_checkpoint(&task.info.name, 42).await?; + let progress = cli.global_progress_of_task(&task.info.name).await?; assert_eq!(progress, 42); - cli.step_task(&task.info.name, 43).await?; - let progress = cli.progress_of_task(&task.info.name).await?; + cli.set_local_task_checkpoint(&task.info.name, 43).await?; + let progress = cli.global_progress_of_task(&task.info.name).await?; assert_eq!(progress, 43); let other_store = MetadataClient::new(cli.meta_store.clone(), 43); - let progress = other_store.progress_of_task(&task.info.name).await?; - assert_eq!(progress, task.info.start_ts); + let progress = other_store + 
.get_local_task_checkpoint(&task.info.name) + .await?; + assert_eq!(progress.into_inner(), task.info.start_ts); + + Ok(()) +} + +#[test] +fn test_storage_checkpoint_of() { + let task_name = "simple_task"; + let store_id: u64 = 5; + let key = MetaKey::storage_checkpoint_of(task_name, store_id); + assert_eq!( + &key.0, + "/tidb/br-stream/storage-checkpoint/simple_task/5".as_bytes() + ); +} + +#[tokio::test] +async fn test_set_storage_checkpoint() -> Result<()> { + let cli = test_meta_cli(); + let task = simple_task("simple_3"); + let storage_checkpoint_ts: u64 = 12345; + + // set storage checkpoint to metadata + cli.set_storage_checkpoint(task.info.get_name(), storage_checkpoint_ts) + .await?; + // get storage checkpoint from metadata + let ts = cli.get_storage_checkpoint(task.info.get_name()).await?; + assert_eq!(ts.into_inner(), storage_checkpoint_ts); + Ok(()) +} + +#[tokio::test] +async fn test_init() -> Result<()> { + let cli = test_meta_cli(); + let mut task = simple_task("simple_2"); + cli.insert_task_with_range(&task, &[]).await?; + task.info.set_start_ts(42); + // Init task should set the checkpoint. + cli.init_task(&task.info).await?; + let progress = cli.global_progress_of_task(&task.info.name).await?; + assert_eq!(progress, 42); + cli.set_local_task_checkpoint(&task.info.name, 43).await?; + + // Init task again shouldn't roll back checkpoint. + cli.init_task(&task.info).await?; + let progress = cli.global_progress_of_task(&task.info.name).await?; + assert_eq!(progress, 43); Ok(()) } diff --git a/components/backup-stream/src/metrics.rs b/components/backup-stream/src/metrics.rs index 8ac5b30b000..225d583ca5c 100644 --- a/components/backup-stream/src/metrics.rs +++ b/components/backup-stream/src/metrics.rs @@ -2,11 +2,12 @@ use lazy_static::lazy_static; use prometheus::*; +use prometheus_static_metric::*; /// The status of a task. /// The ordering of this imples the priority for presenting to the user. 
/// max(TASK_STATUS) of all stores would be probably the state of the task. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq)] pub enum TaskStatus { Running = 0, Paused, @@ -25,6 +26,10 @@ pub fn update_task_status(status: TaskStatus, task: &str) { } } +pub fn remove_task_status_metric(task: &str) -> Result<()> { + TASK_STATUS.remove_label_values(&[task]) +} + lazy_static! { pub static ref INTERNAL_ACTOR_MESSAGE_HANDLE_DURATION: HistogramVec = register_histogram_vec!( "tikv_log_backup_interal_actor_acting_duration_sec", @@ -40,99 +45,103 @@ lazy_static! { ) .unwrap(); pub static ref HANDLE_EVENT_DURATION_HISTOGRAM: HistogramVec = register_histogram_vec!( - "tikv_stream_event_handle_duration_sec", + "tikv_log_backup_event_handle_duration_sec", "The duration of handling an cmd batch.", &["stage"], exponential_buckets(0.001, 2.0, 16).unwrap() ) .unwrap(); pub static ref HANDLE_KV_HISTOGRAM: Histogram = register_histogram!( - "tikv_stream_handle_kv_batch", + "tikv_log_backup_handle_kv_batch", "The total kv pair change handle by the stream backup", exponential_buckets(1.0, 2.0, 16).unwrap() ) .unwrap(); + pub static ref INCREMENTAL_SCAN_DISK_READ: Counter = register_counter!( + "tikv_log_backup_initial_scan_disk_read", + "The total count of disk read bytes." 
+ ) + .unwrap(); pub static ref INCREMENTAL_SCAN_SIZE: Histogram = register_histogram!( - "tikv_stream_incremental_scan_bytes", + "tikv_log_backup_incremental_scan_bytes", "The size of scanning.", exponential_buckets(64.0, 2.0, 16).unwrap() ) .unwrap(); pub static ref SKIP_KV_COUNTER: Counter = register_counter!( - "tikv_stream_skip_kv_count", + "tikv_log_backup_skip_kv_count", "The total kv size skipped by the streaming", ) .unwrap(); - pub static ref STREAM_ERROR: CounterVec = register_counter_vec!( - "tikv_stream_errors", + pub static ref STREAM_ERROR: IntCounterVec = register_int_counter_vec!( + "tikv_log_backup_errors", "The errors during stream backup.", &["type"] ) .unwrap(); - pub static ref STREAM_FATAL_ERROR: CounterVec = register_counter_vec!( + pub static ref STREAM_FATAL_ERROR: IntCounterVec = register_int_counter_vec!( "tikv_log_backup_fatal_errors", "The errors during stream backup.", &["type"] ) .unwrap(); pub static ref HEAP_MEMORY: IntGauge = register_int_gauge!( - "tikv_stream_heap_memory", + "tikv_log_backup_heap_memory", "The heap memory allocating by stream backup." 
) .unwrap(); pub static ref ON_EVENT_COST_HISTOGRAM: HistogramVec = register_histogram_vec!( - "tikv_stream_on_event_duration_seconds", + "tikv_log_backup_on_event_duration_seconds", "The time cost of handling events.", &["stage"], exponential_buckets(0.001, 2.0, 16).unwrap() ) .unwrap(); pub static ref STORE_CHECKPOINT_TS: IntGaugeVec = register_int_gauge_vec!( - "tikv_stream_store_checkpoint_ts", + "tikv_log_backup_store_checkpoint_ts", "The checkpoint ts (next backup ts) of task", &["task"], ) .unwrap(); pub static ref FLUSH_DURATION: HistogramVec = register_histogram_vec!( - "tikv_stream_flush_duration_sec", + "tikv_log_backup_flush_duration_sec", "The time cost of flushing a task.", &["stage"], exponential_buckets(1.0, 2.0, 16).unwrap() ) .unwrap(); pub static ref FLUSH_FILE_SIZE: Histogram = register_histogram!( - "tikv_stream_flush_file_size", + "tikv_log_backup_flush_file_size", "Some statistics of flushing of this run.", exponential_buckets(1024.0, 2.0, 16).unwrap() ) .unwrap(); pub static ref INITIAL_SCAN_DURATION: Histogram = register_histogram!( - "tikv_stream_initial_scan_duration_sec", + "tikv_log_backup_initial_scan_duration_sec", "The duration of initial scanning.", exponential_buckets(0.001, 2.0, 16).unwrap() ) .unwrap(); pub static ref SKIP_RETRY: IntCounterVec = register_int_counter_vec!( - "tikv_stream_skip_retry_observe", + "tikv_log_backup_skip_retry_observe", "The reason of giving up observing region when meeting error.", &["reason"], ) .unwrap(); pub static ref INITIAL_SCAN_STAT: IntCounterVec = register_int_counter_vec!( - "tikv_stream_initial_scan_operations", + "tikv_log_backup_initial_scan_operations", "The operations over rocksdb during initial scanning.", &["cf", "op"], ) .unwrap(); pub static ref STREAM_ENABLED: IntCounter = register_int_counter!( - "tikv_stream_enabled", + "tikv_log_backup_enabled", "When gt 0, this node enabled streaming." 
) .unwrap(); - pub static ref TRACK_REGION: IntCounterVec = register_int_counter_vec!( - "tikv_stream_observed_region", + pub static ref TRACK_REGION: IntGauge = register_int_gauge!( + "tikv_log_backup_observed_region", "the region being observed by the current store.", - &["type"], ) .unwrap(); static ref TASK_STATUS: IntGaugeVec = register_int_gauge_vec!( @@ -141,4 +150,37 @@ lazy_static! { &["task"] ) .unwrap(); + pub static ref PENDING_INITIAL_SCAN_LEN: IntGaugeVec = register_int_gauge_vec!( + "tikv_log_backup_pending_initial_scan", + "The pending initial scan", + &["stage"] + ) + .unwrap(); + pub static ref MISC_EVENTS: MiscEvents = register_static_int_counter_vec!( + MiscEvents, + "tikv_log_backup_misc_events", + "Events counter, including 'plain' events(i.e. events without extra information).", + &["name"] + ) + .unwrap(); + pub static ref MIN_TS_RESOLVE_DURATION: Histogram = register_histogram!( + "tikv_log_backup_resolve_duration_sec", + "The duration of resolving.", + exponential_buckets(0.001, 2.0, 16).unwrap() + ) + .unwrap(); +} + +make_static_metric! { + pub label_enum MiscEventsName { + skip_resolve_non_leader, + skip_resolve_no_subscription, + } + + pub struct MiscEvents: IntCounter { + "name" => { + skip_resolve_non_leader, + skip_resolve_no_subscription, + } + } } diff --git a/components/backup-stream/src/observer.rs b/components/backup-stream/src/observer.rs index 02c63f62a60..169c3b72268 100644 --- a/components/backup-stream/src/observer.rs +++ b/components/backup-stream/src/observer.rs @@ -1,9 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, RwLock, -}; +use std::sync::{Arc, RwLock}; use engine_traits::KvEngine; use kvproto::metapb::Region; @@ -18,18 +15,6 @@ use crate::{ utils::SegmentSet, }; -/// The inflight `StartObserve` message count. 
-/// Currently, we handle the `StartObserve` message in the main loop(endpoint thread), which may -/// take longer time than expected. So when we are starting to observe many region (e.g. failover), -/// there may be many pending messages, those messages won't block the advancing of checkpoint ts. -/// So the checkpoint ts may be too late and losing some data. -/// -/// This is a temporary solution for this problem: If this greater than (1), then it implies that there are some -/// inflight wait-for-initialized regions, we should block the resolved ts from advancing in that condition. -/// -/// FIXME: Move handler of `ModifyObserve` to another thread, and remove this :( -pub static IN_FLIGHT_START_OBSERVE_MESSAGE: AtomicUsize = AtomicUsize::new(0); - /// An Observer for Backup Stream. /// /// It observes raftstore internal events, such as: @@ -71,7 +56,6 @@ impl BackupStreamObserver { .scheduler .schedule(Task::ModifyObserve(ObserveOp::Start { region: region.clone(), - needs_initial_scanning: true, })) { use crate::errors::Error; @@ -95,13 +79,20 @@ impl BackupStreamObserver { .rl() .is_overlapping((region.get_start_key(), end_key)) } + + /// Check whether there are any task range registered to the observer. + /// when there isn't any task, we can ignore the events, so we don't need to + /// handle useless events. (Also won't yield verbose logs.) + pub fn is_hibernating(&self) -> bool { + self.ranges.rl().is_empty() + } } impl Coprocessor for BackupStreamObserver {} impl CmdObserver for BackupStreamObserver { - // `BackupStreamObserver::on_flush_applied_cmd_batch` should only invoke if `cmd_batches` is not empty - // and only leader will trigger this. + // `BackupStreamObserver::on_flush_applied_cmd_batch` should only invoke if + // `cmd_batches` is not empty and only leader will trigger this. 
fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, @@ -133,23 +124,19 @@ impl CmdObserver for BackupStreamObserver { fn on_applied_current_term(&self, role: StateRole, region: &Region) { if role == StateRole::Leader && self.should_register_region(region) { - let success = try_send!( + try_send!( self.scheduler, Task::ModifyObserve(ObserveOp::Start { region: region.clone(), - needs_initial_scanning: true, }) ); - if success { - IN_FLIGHT_START_OBSERVE_MESSAGE.fetch_add(1, Ordering::SeqCst); - } } } } impl RoleObserver for BackupStreamObserver { fn on_role_change(&self, ctx: &mut ObserverContext<'_>, r: &RoleChange) { - if r.state != StateRole::Leader { + if r.state != StateRole::Leader && !self.is_hibernating() { try_send!( self.scheduler, Task::ModifyObserve(ObserveOp::Stop { @@ -167,14 +154,14 @@ impl RegionChangeObserver for BackupStreamObserver { event: RegionChangeEvent, role: StateRole, ) { - if role != StateRole::Leader { + if role != StateRole::Leader || self.is_hibernating() { return; } match event { RegionChangeEvent::Destroy => { try_send!( self.scheduler, - Task::ModifyObserve(ObserveOp::CheckEpochAndStop { + Task::ModifyObserve(ObserveOp::Destroy { region: ctx.region().clone(), }) ); @@ -207,7 +194,7 @@ mod tests { use raft::StateRole; use raftstore::coprocessor::{ Cmd, CmdBatch, CmdObserveInfo, CmdObserver, ObserveHandle, ObserveLevel, ObserverContext, - RegionChangeEvent, RegionChangeObserver, RoleChange, RoleObserver, + RegionChangeEvent, RegionChangeObserver, RegionChangeReason, RoleChange, RoleObserver, }; use tikv_util::{worker::dummy_scheduler, HandyRwLock}; @@ -321,4 +308,23 @@ mod tests { Ok(Some(Task::ModifyObserve(ObserveOp::Stop { region, .. }))) if region.id == 42 ); } + + #[test] + fn test_hibernate() { + let (sched, mut rx) = dummy_scheduler(); + + // Prepare: assuming a task wants the range of [0001, 0010]. 
+ let o = BackupStreamObserver::new(sched); + let r = fake_region(43, b"0010", b"0042"); + let mut ctx = ObserverContext::new(&r); + o.on_region_changed(&mut ctx, RegionChangeEvent::Create, StateRole::Leader); + o.on_region_changed( + &mut ctx, + RegionChangeEvent::Update(RegionChangeReason::Split), + StateRole::Leader, + ); + o.on_role_change(&mut ctx, &RoleChange::new(StateRole::Leader)); + let task = rx.recv_timeout(Duration::from_millis(20)); + assert!(task.is_err(), "it is {:?}", task); + } } diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index 294ec2c0c98..4b1022e7b39 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -1,5 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +use core::pin::Pin; use std::{ borrow::Borrow, collections::HashMap, @@ -8,7 +9,7 @@ use std::{ path::{Path, PathBuf}, result, sync::{ - atomic::{AtomicBool, AtomicPtr, AtomicUsize, Ordering}, + atomic::{AtomicBool, AtomicPtr, AtomicU64, AtomicUsize, Ordering}, Arc, RwLock as SyncRwLock, }, time::Duration, @@ -19,7 +20,10 @@ use external_storage::{BackendConfig, UnpinReader}; use external_storage_export::{create_storage, ExternalStorage}; use futures::io::Cursor; use kvproto::{ - brpb::{DataFileInfo, FileType, Metadata, StreamBackupTaskInfo}, + brpb::{ + CompressionType, DataFileGroup, DataFileInfo, FileType, MetaVersion, Metadata, + StreamBackupTaskInfo, + }, raft_cmdpb::CmdType, }; use openssl::hash::{Hasher, MessageDigest}; @@ -27,6 +31,7 @@ use protobuf::Message; use raftstore::coprocessor::CmdBatch; use slog_global::debug; use tidb_query_datatype::codec::table::decode_table_id; +use tikv::config::BackupStreamConfig; use tikv_util::{ box_err, codec::stream_event::EventEncoder, @@ -38,11 +43,11 @@ use tikv_util::{ }; use tokio::{ fs::{remove_file, File}, - io::{AsyncWriteExt, BufWriter}, + io::AsyncWriteExt, sync::{Mutex, RwLock}, }; use 
tokio_util::compat::TokioAsyncReadCompatExt; -use txn_types::{Key, Lock, TimeStamp}; +use txn_types::{Key, Lock, TimeStamp, WriteRef}; use super::errors::Result; use crate::{ @@ -53,11 +58,54 @@ use crate::{ metrics::{HANDLE_KV_HISTOGRAM, SKIP_KV_COUNTER}, subscription_track::TwoPhaseResolver, try_send, - utils::{self, SegmentMap, Slot, SlotMap, StopWatch}, + utils::{self, CompressionWriter, FilesReader, SegmentMap, SlotMap, StopWatch}, }; -pub const FLUSH_STORAGE_INTERVAL: u64 = 300; -pub const FLUSH_FAILURE_BECOME_FATAL_THRESHOLD: usize = 16; +const FLUSH_FAILURE_BECOME_FATAL_THRESHOLD: usize = 30; + +#[derive(Clone, Debug)] +pub enum TaskSelector { + ByName(String), + ByKey(Vec), + ByRange(Vec, Vec), + All, +} + +impl TaskSelector { + pub fn reference(&self) -> TaskSelectorRef<'_> { + match self { + TaskSelector::ByName(s) => TaskSelectorRef::ByName(s), + TaskSelector::ByKey(k) => TaskSelectorRef::ByKey(k), + TaskSelector::ByRange(s, e) => TaskSelectorRef::ByRange(s, e), + TaskSelector::All => TaskSelectorRef::All, + } + } +} + +#[derive(Clone, Copy, Debug)] +pub enum TaskSelectorRef<'a> { + ByName(&'a str), + ByKey(&'a [u8]), + ByRange(&'a [u8], &'a [u8]), + All, +} + +impl<'a> TaskSelectorRef<'a> { + fn matches<'c, 'd>( + self, + task_name: &str, + mut task_range: impl Iterator, + ) -> bool { + match self { + TaskSelectorRef::ByName(name) => task_name == name, + TaskSelectorRef::ByKey(k) => task_range.any(|(s, e)| utils::is_in_range(k, (s, e))), + TaskSelectorRef::ByRange(x1, y1) => { + task_range.any(|(x2, y2)| utils::is_overlapping((x1, y1), (x2, y2))) + } + TaskSelectorRef::All => true, + } + } +} #[derive(Debug)] pub struct ApplyEvent { @@ -76,10 +124,11 @@ pub struct ApplyEvents { } impl ApplyEvents { - /// Convert a [CmdBatch] to a vector of events. Ignoring admin / error commands. - /// At the same time, advancing status of the `Resolver` by those keys. 
- /// Note: the resolved ts cannot be advanced if there is no command, - /// maybe we also need to update resolved_ts when flushing? + /// Convert a [CmdBatch] to a vector of events. Ignoring admin / error + /// commands. At the same time, advancing status of the `Resolver` by + /// those keys. + /// Note: the resolved ts cannot be advanced if there is no command, maybe + /// we also need to update resolved_ts when flushing? pub fn from_cmd_batch(cmd: CmdBatch, resolver: &mut TwoPhaseResolver) -> Self { let region_id = cmd.region_id; let mut result = vec![]; @@ -193,7 +242,8 @@ impl ApplyEvents { >::borrow(&item).clone(), ApplyEvents { events: { - // assuming the keys in the same region would probably be in one group. + // assuming the keys in the same region would probably be in one + // group. let mut v = Vec::with_capacity(event_len); v.push(event); v @@ -288,12 +338,13 @@ pub struct RouterInner { /// The temporary directory for all tasks. prefix: PathBuf, - /// The handle to Endpoint, we should send `Flush` to endpoint if there are too many temporary files. + /// The handle to Endpoint, we should send `Flush` to endpoint if there are + /// too many temporary files. scheduler: Scheduler, /// The size limit of temporary file per task. - temp_file_size_limit: u64, + temp_file_size_limit: AtomicU64, /// The max duration the local data can be pending. - max_flush_interval: Duration, + max_flush_interval: SyncRwLock, } impl std::fmt::Debug for RouterInner { @@ -318,13 +369,20 @@ impl RouterInner { tasks: Mutex::new(HashMap::default()), prefix, scheduler, - temp_file_size_limit, - max_flush_interval, + temp_file_size_limit: AtomicU64::new(temp_file_size_limit), + max_flush_interval: SyncRwLock::new(max_flush_interval), } } - /// Find the task for a region. If `end_key` is empty, search from start_key to +inf. - /// It simply search for a random possible overlapping range and get its task. 
+ pub fn udpate_config(&self, config: &BackupStreamConfig) { + *self.max_flush_interval.write().unwrap() = config.max_flush_interval.0; + self.temp_file_size_limit + .store(config.file_size_limit.0, Ordering::SeqCst); + } + + /// Find the task for a region. If `end_key` is empty, search from start_key + /// to +inf. It simply search for a random possible overlapping range and + /// get its task. /// FIXME: If a region crosses many tasks, this can only find one of them. pub fn find_task_by_range(&self, start_key: &[u8], mut end_key: &[u8]) -> Option { let r = self.ranges.rl(); @@ -336,11 +394,13 @@ impl RouterInner { } /// Register some ranges associated to some task. - /// Because the observer interface yields encoded data key, the key should be ENCODED DATA KEY too. - /// (i.e. encoded by `Key::from_raw(key).into_encoded()`, [`utils::wrap_key`] could be a shortcut.). - /// We keep ranges in memory to filter kv events not in these ranges. + /// Because the observer interface yields encoded data key, the key should + /// be ENCODED DATA KEY too. (i.e. encoded by + /// `Key::from_raw(key).into_encoded()`, [`utils::wrap_key`] could be + /// a shortcut.). We keep ranges in memory to filter kv events not in + /// these ranges. fn register_ranges(&self, task_name: &str, ranges: Vec<(Vec, Vec)>) { - // TODO reigister ranges to filter kv event + // TODO register ranges to filter kv event // register ranges has two main purpose. // 1. filter kv event that no need to backup // 2. route kv event to the corresponding file. 
@@ -367,12 +427,21 @@ impl RouterInner { &self, mut task: StreamTask, ranges: Vec<(Vec, Vec)>, + merged_file_size_limit: u64, ) -> Result<()> { + let compression_type = task.info.get_compression_type(); let task_name = task.info.take_name(); // register task info let prefix_path = self.prefix.join(&task_name); - let stream_task = StreamTaskInfo::new(prefix_path, task, self.max_flush_interval).await?; + let stream_task = StreamTaskInfo::new( + prefix_path, + task, + ranges.clone(), + merged_file_size_limit, + compression_type, + ) + .await?; self.tasks .lock() .await @@ -401,6 +470,21 @@ impl RouterInner { r.get_value_by_point(key).cloned() } + pub async fn select_task(&self, selector: TaskSelectorRef<'_>) -> Vec { + let s = self.tasks.lock().await; + s.iter() + .filter(|(name, info)| { + selector.matches( + name.as_str(), + info.ranges + .iter() + .map(|(s, e)| (s.as_slice(), e.as_slice())), + ) + }) + .map(|(name, _)| name.to_owned()) + .collect() + } + #[cfg(test)] pub(crate) async fn must_mut_task_info(&self, task_name: &str, mutator: F) where @@ -429,18 +513,19 @@ impl RouterInner { async fn on_event(&self, task: String, events: ApplyEvents) -> Result<()> { let task_info = self.get_task_info(&task).await?; task_info.on_events(events).await?; + let file_size_limit = self.temp_file_size_limit.load(Ordering::SeqCst); - // When this event make the size of temporary files exceeds the size limit, make a flush. - // Note that we only flush if the size is less than the limit before the event, - // or we may send multiplied flush requests. + // When this event make the size of temporary files exceeds the size limit, make + // a flush. Note that we only flush if the size is less than the limit before + // the event, or we may send multiplied flush requests. 
debug!( "backup stream statics size"; "task" => ?task, "next_size" => task_info.total_size(), - "size_limit" => self.temp_file_size_limit, + "size_limit" => file_size_limit, ); let cur_size = task_info.total_size(); - if cur_size > self.temp_file_size_limit && !task_info.is_flushing() { + if cur_size > file_size_limit && !task_info.is_flushing() { info!("try flushing task"; "task" => %task, "size" => %cur_size); if task_info.set_flushing_status_cas(false, true).is_ok() { if let Err(e) = self.scheduler.schedule(Task::Flush(task)) { @@ -462,8 +547,8 @@ impl RouterInner { futures::future::join_all(tasks).await } - /// flush the specified task, once once success, return the min resolved ts of this flush. - /// returns `None` if failed. + /// flush the specified task, once once success, return the min resolved ts + /// of this flush. returns `None` if failed. pub async fn do_flush( &self, task_name: &str, @@ -476,7 +561,6 @@ impl RouterInner { let result = task_info.do_flush(store_id, resolve_to).await; // set false to flushing whether success or fail task_info.set_flushing_status(false); - task_info.update_flush_time(); if let Err(e) = result { e.report("failed to flush task."); @@ -485,22 +569,51 @@ impl RouterInner { // NOTE: Maybe we'd better record all errors and send them to the client? try_send!( self.scheduler, - Task::FatalError(task_name.to_owned(), Box::new(e)) + Task::FatalError( + TaskSelector::ByName(task_name.to_owned()), + Box::new(e) + ) ); } return None; } + // if succeed in flushing, update flush_time. Or retry do_flush immediately. + task_info.update_flush_time(); result.ok().flatten() } _ => None, } } + pub async fn update_global_checkpoint( + &self, + task_name: &str, + global_checkpoint: u64, + store_id: u64, + ) -> Result { + self.get_task_info(task_name) + .await? + .update_global_checkpoint(global_checkpoint, store_id) + .await + } + /// tick aims to flush log/meta to extern storage periodically. 
pub async fn tick(&self) { + let max_flush_interval = self.max_flush_interval.rl().to_owned(); + for (name, task_info) in self.tasks.lock().await.iter() { - // if stream task need flush this time, schedule Task::Flush, or update time justly. - if task_info.should_flush() && task_info.set_flushing_status_cas(false, true).is_ok() { + if let Err(e) = self + .scheduler + .schedule(Task::UpdateGlobalCheckpoint(name.to_string())) + { + error!("backup stream schedule task failed"; "error" => ?e); + } + + // if stream task need flush this time, schedule Task::Flush, or update time + // justly. + if task_info.should_flush(&max_flush_interval) + && task_info.set_flushing_status_cas(false, true).is_ok() + { info!( "backup stream trigger flush task by tick"; "task" => ?task_info, @@ -525,15 +638,22 @@ struct TempFileKey { is_meta: bool, } +pub enum FormatType { + Date, + Hour, +} + impl TempFileKey { - /// Create the key for an event. The key can be used to find which temporary file the event should be stored. + /// Create the key for an event. The key can be used to find which temporary + /// file the event should be stored. fn of(kv: &ApplyEvent, region_id: u64) -> Self { let table_id = if kv.is_meta() { // Force table id of meta key be zero. 0 } else { - // When we cannot extract the table key, use 0 for the table key(perhaps we insert meta key here.). - // Can we elide the copy here(or at least, take a slice of key instead of decoding the whole key)? + // When we cannot extract the table key, use 0 for the table key(perhaps we + // insert meta key here.). Can we elide the copy here(or at least, + // take a slice of key instead of decoding the whole key)? 
Key::from_encoded_slice(&kv.key) .into_raw() .ok() @@ -550,15 +670,14 @@ impl TempFileKey { } fn get_file_type(&self) -> FileType { - let file_type = match self.cmd_type { + match self.cmd_type { CmdType::Put => FileType::Put, CmdType::Delete => FileType::Delete, _ => { warn!("error cmdtype"; "cmdtype" => ?self.cmd_type); panic!("error CmdType"); } - }; - file_type + } } /// The full name of the file owns the key. @@ -583,49 +702,57 @@ impl TempFileKey { } } - fn format_date_time(ts: u64) -> impl Display { + fn format_date_time(ts: u64, t: FormatType) -> impl Display { use chrono::prelude::*; let millis = TimeStamp::physical(ts.into()); let dt = Utc.timestamp_millis(millis as _); - - #[cfg(feature = "failpoints")] - { - fail::fail_point!("stream_format_date_time", |s| { - return dt - .format(&s.unwrap_or_else(|| "%Y%m".to_owned())) - .to_string(); - }); - return dt.format("%Y%m%d").to_string(); + match t { + FormatType::Date => dt.format("%Y%m%d"), + FormatType::Hour => dt.format("%H"), } - #[cfg(not(feature = "failpoints"))] - return dt.format("%Y%m%d"); } - fn path_to_log_file(&self, min_ts: u64, max_ts: u64) -> String { + /// path_to_log_file specifies the path of record log for v2. + /// ```text + /// V1: v1/${date}/${hour}/${store_id}/t00000071/434098800931373064-f0251bd5-1441-499a-8f53-adc0d1057a73.log + /// V2: v1/${date}/${hour}/${store_id}/434098800931373064-f0251bd5-1441-499a-8f53-adc0d1057a73.log + /// ``` + /// For v2, we merged the small files (partition by table_id) into one file. + fn path_to_log_file(store_id: u64, min_ts: u64, max_ts: u64) -> String { format!( - "v1/t{:08}/{}-{:012}-{}.log", - self.table_id, - // We may delete a range of files, so using the max_ts for preventing remove some records wrong. - Self::format_date_time(max_ts), + "v1/{}/{}/{}/{}-{}.log", + // We may delete a range of files, so using the max_ts for preventing remove some + // records wrong. 
+ Self::format_date_time(max_ts, FormatType::Date), + Self::format_date_time(max_ts, FormatType::Hour), + store_id, min_ts, uuid::Uuid::new_v4() ) } - fn path_to_schema_file(min_ts: u64, max_ts: u64) -> String { + /// path_to_schema_file specifies the path of schema log for v2. + /// ```text + /// V1: v1/${date}/${hour}/${store_id}/schema-meta/434055683656384515-cc3cb7a3-e03b-4434-ab6c-907656fddf67.log + /// V2: v1/${date}/${hour}/${store_id}/schema-meta/434055683656384515-cc3cb7a3-e03b-4434-ab6c-907656fddf67.log + /// ``` + /// For v2, we merged the small files (partition by table_id) into one file. + fn path_to_schema_file(store_id: u64, min_ts: u64, max_ts: u64) -> String { format!( - "v1/schema-meta/{}-{:012}-{}.log", - Self::format_date_time(max_ts), + "v1/{}/{}/{}/schema-meta/{}-{}.log", + Self::format_date_time(max_ts, FormatType::Date), + Self::format_date_time(max_ts, FormatType::Hour), + store_id, min_ts, uuid::Uuid::new_v4(), ) } - fn file_name(&self, min_ts: TimeStamp, max_ts: TimeStamp) -> String { - if self.is_meta { - Self::path_to_schema_file(min_ts.into_inner(), max_ts.into_inner()) + fn file_name(store_id: u64, min_ts: u64, max_ts: u64, is_meta: bool) -> String { + if is_meta { + Self::path_to_schema_file(store_id, min_ts, max_ts) } else { - self.path_to_log_file(min_ts.into_inner(), max_ts.into_inner()) + Self::path_to_log_file(store_id, min_ts, max_ts) } } } @@ -634,28 +761,60 @@ pub struct StreamTaskInfo { pub(crate) task: StreamTask, /// support external storage. eg local/s3. pub(crate) storage: Arc, + /// The listening range of the task. + ranges: Vec<(Vec, Vec)>, /// The parent directory of temporary files. temp_dir: PathBuf, - /// The temporary file index. Both meta (m prefixed keys) and data (t prefixed keys). + /// The temporary file index. Both meta (m prefixed keys) and data (t + /// prefixed keys). files: SlotMap, /// flushing_files contains files pending flush. 
- flushing_files: RwLock)>>, + flushing_files: RwLock>, + /// flushing_meta_files contains meta files pending flush. + flushing_meta_files: RwLock>, /// last_flush_ts represents last time this task flushed to storage. last_flush_time: AtomicPtr, - /// flush_interval represents the tick interval of flush, setting by users. - flush_interval: Duration, /// The min resolved TS of all regions involved. min_resolved_ts: TimeStamp, /// Total size of all temporary files in byte. total_size: AtomicUsize, - /// This should only be set to `true` by `compare_and_set(current=false, value=ture)`. - /// The thread who setting it to `true` takes the responsibility of sending the request to the - /// scheduler for flushing the files then. + /// This should only be set to `true` by `compare_and_set(current=false, + /// value=true)`. The thread who setting it to `true` takes the + /// responsibility of sending the request to the scheduler for flushing + /// the files then. /// /// If the request failed, that thread can set it to `false` back then. flushing: AtomicBool, /// This counts how many times this task has failed to flush. flush_fail_count: AtomicUsize, + /// global checkpoint ts for this task. + global_checkpoint_ts: AtomicU64, + /// The size limit of the merged file for this task. + merged_file_size_limit: u64, + /// The compression type for this task. + compression_type: CompressionType, +} + +impl Drop for StreamTaskInfo { + fn drop(&mut self) { + let (success, failed): (Vec<_>, Vec<_>) = self + .flushing_files + .get_mut() + .drain(..) 
+ .chain(self.flushing_meta_files.get_mut().drain(..)) + .map(|(_, f, _)| f.local_path) + .map(std::fs::remove_file) + .partition(|r| r.is_ok()); + info!("stream task info dropped[1/2], removing flushing_temp files"; "success" => %success.len(), "failure" => %failed.len()); + let (success, failed): (Vec<_>, Vec<_>) = self + .files + .get_mut() + .drain() + .map(|(_, f)| f.into_inner().local_path) + .map(std::fs::remove_file) + .partition(|r| r.is_ok()); + info!("stream task info dropped[2/2], removing temp files"; "success" => %success.len(), "failure" => %failed.len()); + } } impl std::fmt::Debug for StreamTaskInfo { @@ -675,25 +834,32 @@ impl StreamTaskInfo { pub async fn new( temp_dir: PathBuf, task: StreamTask, - flush_interval: Duration, + ranges: Vec<(Vec, Vec)>, + merged_file_size_limit: u64, + compression_type: CompressionType, ) -> Result { tokio::fs::create_dir_all(&temp_dir).await?; let storage = Arc::from(create_storage( task.info.get_storage(), BackendConfig::default(), )?); + let start_ts = task.info.get_start_ts(); Ok(Self { task, storage, temp_dir, + ranges, min_resolved_ts: TimeStamp::max(), files: SlotMap::default(), flushing_files: RwLock::default(), + flushing_meta_files: RwLock::default(), last_flush_time: AtomicPtr::new(Box::into_raw(Box::new(Instant::now()))), - flush_interval, total_size: AtomicUsize::new(0), flushing: AtomicBool::new(false), flush_fail_count: AtomicUsize::new(0), + global_checkpoint_ts: AtomicU64::new(start_ts), + merged_file_size_limit, + compression_type, }) } @@ -708,11 +874,12 @@ impl StreamTaskInfo { let mut w = self.files.write().await; // double check before insert. there may be someone already insert that // when we are waiting for the write lock. - // slience the lint advising us to use the `Entry` API which may introduce copying. + // silence the lint advising us to use the `Entry` API which may introduce + // copying. 
#[allow(clippy::map_entry)] if !w.contains_key(&key) { let path = self.temp_dir.join(key.temp_file_name()); - let val = Mutex::new(DataFile::new(path).await?); + let val = Mutex::new(DataFile::new(path, self.compression_type).await?); w.insert(key, val); } @@ -750,26 +917,22 @@ impl StreamTaskInfo { /// Flush all template files and generate corresponding metadata. pub async fn generate_metadata(&self, store_id: u64) -> Result { - let w = self.flushing_files.read().await; + let mut w = self.flushing_files.write().await; + let mut wm = self.flushing_meta_files.write().await; // Let's flush all files first... - futures::future::join_all(w.iter().map(|(_, f)| async move { - let file = &mut f.lock().await.inner; - file.flush().await?; - file.get_ref().sync_all().await?; - Result::Ok(()) - })) + futures::future::join_all( + w.iter_mut() + .chain(wm.iter_mut()) + .map(|(_, f, _)| async move { f.inner.as_mut().done().await }), + ) .await .into_iter() .map(|r| r.map_err(Error::from)) .fold(Ok(()), Result::and)?; - let mut metadata = MetadataInfo::with_capacity(w.len()); + let mut metadata = MetadataInfo::with_capacity(w.len() + wm.len()); metadata.set_store_id(store_id); - for (file_key, data_file) in w.iter() { - let mut data_file = data_file.lock().await; - let file_meta = data_file.generate_metadata(file_key)?; - metadata.push(file_meta) - } + // delay push files until log files are flushed Ok(metadata) } @@ -790,11 +953,11 @@ impl StreamTaskInfo { unsafe { Box::from_raw(ptr) }; } - pub fn should_flush(&self) -> bool { - // When it doesn't flush since 0.8x of auto-flush interval, we get ready to start flushing. - // So that we will get a buffer for the cost of actual flushing. - self.get_last_flush_time().saturating_elapsed_secs() - >= self.flush_interval.as_secs_f64() * 0.8 + pub fn should_flush(&self, flush_interval: &Duration) -> bool { + // When it doesn't flush since 0.8x of auto-flush interval, we get ready to + // start flushing. 
So that we will get a buffer for the cost of actual + // flushing. + self.get_last_flush_time().saturating_elapsed_secs() >= flush_interval.as_secs_f64() * 0.8 } pub fn is_flushing(&self) -> bool { @@ -802,18 +965,34 @@ impl StreamTaskInfo { } /// move need-flushing files to flushing_files. - pub async fn move_to_flushing_files(&self) -> &Self { + pub async fn move_to_flushing_files(&self) -> Result<&Self> { + // if flushing_files is not empty, which represents this flush is a retry + // operation. + if !self.flushing_files.read().await.is_empty() { + return Ok(self); + } + let mut w = self.files.write().await; let mut fw = self.flushing_files.write().await; + let mut fw_meta = self.flushing_meta_files.write().await; for (k, v) in w.drain() { - fw.push((k, v)); + // we should generate file metadata(calculate sha256) when moving file. + // because sha256 calculation is a unsafe move operation. + // we cannot re-calculate it in retry. + // TODO refactor move_to_flushing_files and generate_metadata + let mut v = v.into_inner(); + let file_meta = v.generate_metadata(&k)?; + if file_meta.is_meta { + fw_meta.push((k, v, file_meta)); + } else { + fw.push((k, v, file_meta)); + } } - self + Ok(self) } pub async fn clear_flushing_files(&self) { - for (_, v) in self.flushing_files.write().await.drain(..) { - let data_file = v.lock().await; + for (_, data_file, _) in self.flushing_files.write().await.drain(..) { debug!("removing data file"; "size" => %data_file.file_size, "name" => %data_file.local_path.display()); self.total_size .fetch_sub(data_file.file_size, Ordering::SeqCst); @@ -822,66 +1001,161 @@ impl StreamTaskInfo { info!("remove template file"; "err" => ?e); } } + for (_, data_file, _) in self.flushing_meta_files.write().await.drain(..) 
{ + debug!("removing meta data file"; "size" => %data_file.file_size, "name" => %data_file.local_path.display()); + self.total_size + .fetch_sub(data_file.file_size, Ordering::SeqCst); + if let Err(e) = data_file.remove_temp_file().await { + // if remove template failed, just skip it. + info!("remove template file"; "err" => ?e); + } + } } - async fn flush_log_file_to( + async fn merge_and_flush_log_files_to( storage: Arc, - file: &Mutex, + files: &[(TempFileKey, DataFile, DataFileInfo)], + metadata: &mut MetadataInfo, + is_meta: bool, ) -> Result<()> { - let data_file = file.lock().await; + let mut data_files_open = Vec::new(); + let mut data_file_infos = Vec::new(); + let mut merged_file_info = DataFileGroup::new(); + let mut stat_length = 0; + let mut max_ts: Option = None; + let mut min_ts: Option = None; + let mut min_resolved_ts: Option = None; + for (_, data_file, file_info) in files { + let mut file_info_clone = file_info.to_owned(); + // Update offset of file_info(DataFileInfo) + // and push it into merged_file_info(DataFileGroup). 
+ file_info_clone.set_range_offset(stat_length); + data_files_open.push({ + let file = File::open(data_file.local_path.clone()).await?; + let compress_length = file.metadata().await?.len(); + stat_length += compress_length; + file_info_clone.set_range_length(compress_length); + file + }); + data_file_infos.push(file_info_clone); + + let rts = file_info.resolved_ts; + min_resolved_ts = min_resolved_ts.map_or(Some(rts), |r| Some(r.min(rts))); + min_ts = min_ts.map_or(Some(file_info.min_ts), |ts| Some(ts.min(file_info.min_ts))); + max_ts = max_ts.map_or(Some(file_info.max_ts), |ts| Some(ts.max(file_info.max_ts))); + } + let min_ts = min_ts.unwrap_or_default(); + let max_ts = max_ts.unwrap_or_default(); + merged_file_info.set_path(TempFileKey::file_name( + metadata.store_id, + min_ts, + max_ts, + is_meta, + )); + merged_file_info.set_data_files_info(data_file_infos.into()); + merged_file_info.set_length(stat_length); + merged_file_info.set_max_ts(max_ts); + merged_file_info.set_min_ts(min_ts); + merged_file_info.set_min_resolved_ts(min_resolved_ts.unwrap_or_default()); + // to do: limiter to storage let limiter = Limiter::builder(std::f64::INFINITY).build(); - let reader = File::open(data_file.local_path.clone()).await?; - let stat = reader.metadata().await?; - let reader = UnpinReader(Box::new(limiter.limit(reader.compat()))); - let filepath = &data_file.storage_path; - // Once we cannot get the stat of the file, use 4K I/O. 
- let est_len = stat.len().max(4096); - - let ret = storage.write(filepath, reader, est_len).await; + + let files_reader = FilesReader::new(data_files_open); + + let reader = UnpinReader(Box::new(limiter.limit(files_reader.compat()))); + let filepath = &merged_file_info.path; + + let ret = storage.write(filepath, reader, stat_length).await; + match ret { Ok(_) => { debug!( "backup stream flush success"; - "tmp file" => ?data_file.local_path, "storage file" => ?filepath, + "est_len" => ?stat_length, ); } Err(e) => { warn!("backup stream flush failed"; - "file" => ?data_file.local_path, - "est_len" => ?est_len, + "est_len" => ?stat_length, "err" => ?e, ); return Err(Error::Io(e)); } } + + // push merged file into metadata + metadata.push(merged_file_info); Ok(()) } - pub async fn flush_log(&self) -> Result<()> { - // if failed to write storage, we should retry write flushing_files. + pub async fn flush_log(&self, metadata: &mut MetadataInfo) -> Result<()> { let storage = self.storage.clone(); - let files = self.flushing_files.write().await; - let futs = files - .iter() - .map(|(_, v)| Self::flush_log_file_to(storage.clone(), v)); - futures::future::try_join_all(futs).await?; + self.merge_log(metadata, storage.clone(), &self.flushing_files, false) + .await?; + self.merge_log(metadata, storage.clone(), &self.flushing_meta_files, true) + .await?; + Ok(()) } - pub async fn flush_meta(&self, metadata_info: MetadataInfo) -> Result<()> { - let meta_path = metadata_info.path_to_meta(); - let meta_buff = metadata_info.marshal_to()?; - let buflen = meta_buff.len(); + async fn merge_log( + &self, + metadata: &mut MetadataInfo, + storage: Arc, + files_lock: &RwLock>, + is_meta: bool, + ) -> Result<()> { + let files = files_lock.write().await; + let mut batch_size = 0; + // file[batch_begin_index, i) is a batch + let mut batch_begin_index = 0; + // TODO: upload the merged file concurrently, + // then collect merged_file_infos and push them into `metadata`. 
+ for (i, (_, _, info)) in files.iter().enumerate() { + if batch_size >= self.merged_file_size_limit { + Self::merge_and_flush_log_files_to( + storage.clone(), + &files[batch_begin_index..i], + metadata, + is_meta, + ) + .await?; - self.storage - .write( - &meta_path, - UnpinReader(Box::new(Cursor::new(meta_buff))), - buflen as _, + batch_begin_index = i; + batch_size = 0; + } + + batch_size += info.length; + } + if batch_begin_index < files.len() { + Self::merge_and_flush_log_files_to( + storage.clone(), + &files[batch_begin_index..], + metadata, + is_meta, ) .await?; + } + + Ok(()) + } + + pub async fn flush_meta(&self, metadata_info: MetadataInfo) -> Result<()> { + if !metadata_info.file_groups.is_empty() { + let meta_path = metadata_info.path_to_meta(); + let meta_buff = metadata_info.marshal_to()?; + let buflen = meta_buff.len(); + + self.storage + .write( + &meta_path, + UnpinReader(Box::new(Cursor::new(meta_buff))), + buflen as _, + ) + .await?; + } Ok(()) } @@ -892,8 +1166,9 @@ impl StreamTaskInfo { /// execute the flush: copy local files to external storage. /// if success, return the last resolved ts of this flush. - /// The caller can try to advance the resolved ts and provide it to the function, - /// and we would use max(resolved_ts_provided, resolved_ts_from_file). + /// The caller can try to advance the resolved ts and provide it to the + /// function, and we would use `max(resolved_ts_provided, + /// resolved_ts_from_file)`. pub async fn do_flush( &self, store_id: u64, @@ -905,29 +1180,33 @@ impl StreamTaskInfo { return Ok(None); } let begin = Instant::now_coarse(); - let mut sw = StopWatch::new(); + let mut sw = StopWatch::by_now(); // generate meta data and prepare to flush to storage let mut metadata_info = self .move_to_flushing_files() - .await + .await? 
.generate_metadata(store_id) .await?; - metadata_info.min_resolved_ts = metadata_info - .min_resolved_ts - .max(Some(resolved_ts_provided.into_inner())); - let rts = metadata_info.min_resolved_ts; crate::metrics::FLUSH_DURATION .with_label_values(&["generate_metadata"]) .observe(sw.lap().as_secs_f64()); // flush log file to storage. - self.flush_log().await?; + self.flush_log(&mut metadata_info).await?; + + // the field `min_resolved_ts` of metadata will be updated + // only after flush is done. + metadata_info.min_resolved_ts = metadata_info + .min_resolved_ts + .max(Some(resolved_ts_provided.into_inner())); + let rts = metadata_info.min_resolved_ts; + // compress length let file_size_vec = metadata_info - .files + .file_groups .iter() - .map(|d| d.length) + .map(|d| (d.length, d.data_files_info.len())) .collect::>(); // flush meta file to storage. self.flush_meta(metadata_info).await?; @@ -942,10 +1221,11 @@ impl StreamTaskInfo { .observe(sw.lap().as_secs_f64()); file_size_vec .iter() - .for_each(|size| crate::metrics::FLUSH_FILE_SIZE.observe(*size as _)); + .for_each(|(size, _)| crate::metrics::FLUSH_FILE_SIZE.observe(*size as _)); info!("log backup flush done"; - "files" => %file_size_vec.len(), - "total_size" => %file_size_vec.iter().sum::(), + "merged_files" => %file_size_vec.len(), // the number of the merged files + "files" => %file_size_vec.iter().map(|(_, v)| v).sum::(), + "total_size" => %file_size_vec.iter().map(|(v, _)| v).sum::(), // the size of the merged files after compressed "take" => ?begin.saturating_elapsed(), ); Ok(rts) @@ -960,6 +1240,43 @@ impl StreamTaskInfo { result } + + pub async fn flush_global_checkpoint(&self, store_id: u64) -> Result<()> { + let filename = format!("v1/global_checkpoint/{}.ts", store_id); + let buff = self + .global_checkpoint_ts + .load(Ordering::SeqCst) + .to_le_bytes(); + self.storage + .write( + &filename, + UnpinReader(Box::new(Cursor::new(buff))), + buff.len() as _, + ) + .await?; + Ok(()) + } + + pub async fn 
update_global_checkpoint( + &self, + global_checkpoint: u64, + store_id: u64, + ) -> Result { + let last_global_checkpoint = self.global_checkpoint_ts.load(Ordering::SeqCst); + if last_global_checkpoint < global_checkpoint { + let r = self.global_checkpoint_ts.compare_exchange( + last_global_checkpoint, + global_checkpoint, + Ordering::SeqCst, + Ordering::SeqCst, + ); + if r.is_ok() { + self.flush_global_checkpoint(store_id).await?; + return Ok(true); + } + } + Ok(false) + } } /// A opened log file with some metadata. @@ -967,28 +1284,36 @@ struct DataFile { min_ts: TimeStamp, max_ts: TimeStamp, resolved_ts: TimeStamp, + min_begin_ts: Option, sha256: Hasher, - inner: BufWriter, + // TODO: use lz4 with async feature + inner: Pin>, + compression_type: CompressionType, start_key: Vec, end_key: Vec, number_of_entries: usize, file_size: usize, local_path: PathBuf, - storage_path: String, } #[derive(Debug)] pub struct MetadataInfo { - pub files: Vec, + // the field files is deprecated in v6.3.0 + // pub files: Vec, + pub file_groups: Vec, pub min_resolved_ts: Option, + pub min_ts: Option, + pub max_ts: Option, pub store_id: u64, } impl MetadataInfo { fn with_capacity(cap: usize) -> Self { Self { - files: Vec::with_capacity(cap), + file_groups: Vec::with_capacity(cap), min_resolved_ts: None, + min_ts: None, + max_ts: None, store_id: 0, } } @@ -997,17 +1322,26 @@ impl MetadataInfo { self.store_id = store_id; } - fn push(&mut self, file: DataFileInfo) { - let rts = file.resolved_ts; + fn push(&mut self, file: DataFileGroup) { + let rts = file.min_resolved_ts; self.min_resolved_ts = self.min_resolved_ts.map_or(Some(rts), |r| Some(r.min(rts))); - self.files.push(file); + self.min_ts = self + .min_ts + .map_or(Some(file.min_ts), |ts| Some(ts.min(file.min_ts))); + self.max_ts = self + .max_ts + .map_or(Some(file.max_ts), |ts| Some(ts.max(file.max_ts))); + self.file_groups.push(file); } fn marshal_to(self) -> Result> { let mut metadata = Metadata::new(); - 
metadata.set_files(self.files.into()); + metadata.set_file_groups(self.file_groups.into()); metadata.set_store_id(self.store_id as _); - metadata.set_resolved_ts(self.min_resolved_ts.unwrap_or_default() as _); + metadata.set_resolved_ts(self.min_resolved_ts.unwrap_or_default()); + metadata.set_min_ts(self.min_ts.unwrap_or(0)); + metadata.set_max_ts(self.max_ts.unwrap_or(0)); + metadata.set_meta_version(MetaVersion::V2); metadata .write_to_bytes() @@ -1016,7 +1350,7 @@ impl MetadataInfo { fn path_to_meta(&self) -> String { format!( - "v1/backupmeta/{:012}-{}.meta", + "v1/backupmeta/{}-{}.meta", self.min_resolved_ts.unwrap_or_default(), uuid::Uuid::new_v4() ) @@ -1026,21 +1360,24 @@ impl MetadataInfo { impl DataFile { /// create and open a logfile at the path. /// Note: if a file with same name exists, would truncate it. - async fn new(local_path: impl AsRef) -> Result { + async fn new(local_path: impl AsRef, compression_type: CompressionType) -> Result { let sha256 = Hasher::new(MessageDigest::sha256()) .map_err(|err| Error::Other(box_err!("openssl hasher failed to init: {}", err)))?; + let inner = + utils::compression_writer_dispatcher(local_path.as_ref(), compression_type).await?; Ok(Self { min_ts: TimeStamp::max(), max_ts: TimeStamp::zero(), resolved_ts: TimeStamp::zero(), - inner: BufWriter::with_capacity(128 * 1024, File::create(local_path.as_ref()).await?), + min_begin_ts: None, + inner, + compression_type, sha256, number_of_entries: 0, file_size: 0, start_key: vec![], end_key: vec![], local_path: local_path.as_ref().to_owned(), - storage_path: String::default(), }) } @@ -1048,10 +1385,23 @@ impl DataFile { remove_file(&self.local_path).await } + fn decode_begin_ts(value: Vec) -> Result { + WriteRef::parse(&value).map_or_else( + |e| { + Err(Error::Other(box_err!( + "failed to parse write cf value: {}", + e + ))) + }, + |w| Ok(w.start_ts), + ) + } + /// Add a new KV pair to the file, returning its size. 
async fn on_events(&mut self, events: ApplyEvents) -> Result { let now = Instant::now_coarse(); let mut total_size = 0; + for mut event in events.events { let encoded = EventEncoder::encode_event(&event.key, &event.value); let mut size = 0; @@ -1069,6 +1419,13 @@ impl DataFile { self.min_ts = self.min_ts.min(ts); self.max_ts = self.max_ts.max(ts); self.resolved_ts = self.resolved_ts.max(events.region_resolved_ts.into()); + + // decode_begin_ts is used to maintain the txn when restore log. + // if value is empty, no need to decode begin_ts. + if event.cf == CF_WRITE && !event.value.is_empty() { + let begin_ts = Self::decode_begin_ts(event.value)?; + self.min_begin_ts = Some(self.min_begin_ts.map_or(begin_ts, |ts| ts.min(begin_ts))); + } self.number_of_entries += 1; self.file_size += size; self.update_key_bound(key.into_encoded()); @@ -1096,15 +1453,11 @@ impl DataFile { } } - /// generage path for log file before flushing to Storage - fn set_storage_path(&mut self, path: String) { - self.storage_path = path; - } - - /// generate the metadata in protocol buffer of the file. + /// generate the metadata v2 where each file becomes a part of the merged + /// file. fn generate_metadata(&mut self, file_key: &TempFileKey) -> Result { - self.set_storage_path(file_key.file_name(self.min_ts, self.max_ts)); - + // Note: the field `storage_path` is empty!!! It will be stored in the upper + // layer `DataFileGroup`. 
let mut meta = DataFileInfo::new(); meta.set_sha256( self.sha256 @@ -1112,11 +1465,14 @@ impl DataFile { .map(|bytes| bytes.to_vec()) .map_err(|err| Error::Other(box_err!("openssl hasher failed to init: {}", err)))?, ); - meta.set_path(self.storage_path.clone()); meta.set_number_of_entries(self.number_of_entries as _); meta.set_max_ts(self.max_ts.into_inner() as _); meta.set_min_ts(self.min_ts.into_inner() as _); meta.set_resolved_ts(self.resolved_ts.into_inner() as _); + meta.set_min_begin_ts_in_default_cf( + self.min_begin_ts + .map_or(self.min_ts.into_inner(), |ts| ts.into_inner()), + ); meta.set_start_key(std::mem::take(&mut self.start_key)); meta.set_end_key(std::mem::take(&mut self.end_key)); meta.set_length(self.file_size as _); @@ -1127,6 +1483,8 @@ impl DataFile { meta.set_region_id(file_key.region_id as i64); meta.set_type(file_key.get_file_type()); + meta.set_compression_type(self.compression_type); + Ok(meta) } } @@ -1156,14 +1514,20 @@ struct TaskRange { mod tests { use std::{ffi::OsStr, time::Duration}; + use external_storage::{ExternalData, NoopStorage}; + use futures::AsyncReadExt; use kvproto::brpb::{Local, Noop, StorageBackend, StreamBackupTaskInfo}; + use online_config::{ConfigManager, OnlineConfig}; use tikv_util::{ codec::number::NumberEncoder, + config::ReadableDuration, worker::{dummy_scheduler, ReceiverWrapper}, }; + use tokio::fs::File; + use txn_types::{Write, WriteType}; use super::*; - use crate::utils; + use crate::{config::BackupStreamConfigManager, utils}; #[derive(Debug)] struct KvEventsBuilder { @@ -1181,6 +1545,12 @@ mod tests { table_key } + fn make_value(t: WriteType, value: &[u8], start_ts: u64) -> Vec { + let start_ts = TimeStamp::new(start_ts); + let w = Write::new(t, start_ts, Some(value.to_vec())); + w.as_ref().to_bytes() + } + impl KvEventsBuilder { fn new(region_id: u64, region_resolved_ts: u64) -> Self { Self { @@ -1219,9 +1589,14 @@ mod tests { }) } - fn put_table(&mut self, cf: &'static str, table: i64, key: &[u8], 
value: &[u8]) { + fn put_table(&mut self, cf: CfName, table: i64, key: &[u8], value: &[u8]) { let table_key = make_table_key(table, key); - self.put_event(cf, table_key, value.to_vec()); + let value = if cf == CF_WRITE { + make_value(WriteType::Put, value, 12345) + } else { + value.to_vec() + }; + self.put_event(cf, table_key, value); } fn delete_table(&mut self, cf: &'static str, table: i64, key: &[u8]) { @@ -1229,7 +1604,7 @@ mod tests { self.delete_event(cf, table_key); } - fn flush_events(&mut self) -> ApplyEvents { + fn finish(&mut self) -> ApplyEvents { let region_id = self.events.region_id; let region_resolved_ts = self.events.region_resolved_ts; std::mem::replace( @@ -1313,6 +1688,7 @@ mod tests { utils::wrap_key(make_table_key(table_id, b"")), utils::wrap_key(make_table_key(table_id + 1, b"")), )], + 0x100000, ) .await .expect("failed to register task") @@ -1321,21 +1697,12 @@ mod tests { fn check_on_events_result(item: &Vec<(String, Result<()>)>) { for (task, r) in item { if let Err(err) = r { - panic!("task {} failed: {}", task, err); + warn!("task {} failed: {}", task, err); } } } - #[tokio::test] - async fn test_basic_file() -> Result<()> { - test_util::init_log_for_test(); - let tmp = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); - tokio::fs::create_dir_all(&tmp).await?; - let (tx, rx) = dummy_scheduler(); - let router = RouterInner::new(tmp.clone(), tx, 32, Duration::from_secs(300)); - let (stream_task, storage_path) = task("dummy".to_owned()).await?; - must_register_table(&router, stream_task, 1).await; - + async fn write_simple_data(router: &RouterInner) -> u64 { let now = TimeStamp::physical_now(); let mut region1 = KvEventsBuilder::new(1, now); let start_ts = TimeStamp::physical_now(); @@ -1346,30 +1713,75 @@ mod tests { region1.put_table(CF_WRITE, 2, b"hello", b"this isn't a write record :3"); region1.put_table(CF_WRITE, 1, b"hello", b"still isn't a write record :3"); region1.delete_table(CF_DEFAULT, 1, b"hello"); - let events 
= region1.flush_events(); + let events = region1.finish(); check_on_events_result(&router.on_events(events).await); + start_ts + } + + #[tokio::test] + async fn test_basic_file() -> Result<()> { + let tmp = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); + tokio::fs::create_dir_all(&tmp).await?; + let (tx, rx) = dummy_scheduler(); + let router = RouterInner::new(tmp.clone(), tx, 32, Duration::from_secs(300)); + let (stream_task, storage_path) = task("dummy".to_owned()).await?; + must_register_table(&router, stream_task, 1).await; + + let start_ts = write_simple_data(&router).await; tokio::time::sleep(Duration::from_millis(200)).await; let end_ts = TimeStamp::physical_now(); let files = router.tasks.lock().await.get("dummy").unwrap().clone(); - let meta = files + let mut meta = files .move_to_flushing_files() - .await + .await? .generate_metadata(1) .await?; - assert_eq!(meta.files.len(), 3, "test file len = {}", meta.files.len()); + assert!( - meta.files.iter().all(|item| { - TimeStamp::new(item.min_ts as _).physical() >= start_ts - && TimeStamp::new(item.max_ts as _).physical() <= end_ts - && item.min_ts <= item.max_ts - }), + meta.file_groups + .iter() + .all(|group| group.data_files_info.iter().all(|item| { + TimeStamp::new(item.min_ts as _).physical() >= start_ts + && TimeStamp::new(item.max_ts as _).physical() <= end_ts + && item.min_ts <= item.max_ts + })), "meta = {:#?}; start ts = {}, end ts = {}", - meta.files, + meta.file_groups, start_ts, end_ts ); - files.flush_log().await?; + + // in some case when flush failed to write files to storage. + // we may run `generate_metadata` again with same files. + let mut another_meta = files + .move_to_flushing_files() + .await? 
+ .generate_metadata(1) + .await?; + + files.flush_log(&mut meta).await?; + files.flush_log(&mut another_meta).await?; + // meta updated + let files_num = meta + .file_groups + .iter() + .map(|v| v.data_files_info.len()) + .sum::(); + assert_eq!(files_num, 3, "test file len = {}", files_num); + for i in 0..meta.file_groups.len() { + let file_groups1 = meta.file_groups.get(i).unwrap(); + let file_groups2 = another_meta.file_groups.get(i).unwrap(); + // we have to make sure two times sha256 of file must be the same. + for j in 0..file_groups1.data_files_info.len() { + let file1 = file_groups1.data_files_info.get(j).unwrap(); + let file2 = file_groups2.data_files_info.get(j).unwrap(); + assert_eq!(file1.sha256, file2.sha256); + assert_eq!(file1.start_key, file2.start_key); + assert_eq!(file1.end_key, file2.end_key); + } + } + files.flush_meta(meta).await?; files.clear_flushing_files().await; @@ -1378,7 +1790,7 @@ mod tests { assert_eq!(cmds.len(), 1, "test cmds len = {}", cmds.len()); match &cmds[0] { Task::Flush(task) => assert_eq!(task, "dummy", "task = {}", task), - _ => panic!("the cmd isn't flush!"), + _ => warn!("the cmd isn't flush!"), } let mut meta_count = 0; @@ -1402,10 +1814,70 @@ mod tests { } assert_eq!(meta_count, 1); - assert_eq!(log_count, 3); + assert_eq!(log_count, 2); // flush twice Ok(()) } + fn mock_build_large_kv_events(table_id: i64, region_id: u64, resolved_ts: u64) -> ApplyEvents { + let mut events_builder = KvEventsBuilder::new(region_id, resolved_ts); + events_builder.put_table( + "default", + table_id, + b"hello", + "world".repeat(1024).as_bytes(), + ); + events_builder.finish() + } + + #[tokio::test] + async fn test_do_flush() { + let tmp_dir = tempfile::tempdir().unwrap(); + let backend = external_storage_export::make_local_backend(tmp_dir.path()); + let mut task_info = StreamBackupTaskInfo::default(); + task_info.set_storage(backend); + let stream_task = StreamTask { + info: task_info, + is_paused: false, + }; + let 
merged_file_size_limit = 0x10000; + let task = StreamTaskInfo::new( + tmp_dir.path().to_path_buf(), + stream_task, + vec![(vec![], vec![])], + merged_file_size_limit, + CompressionType::Zstd, + ) + .await + .unwrap(); + + // on_event + let region_count = merged_file_size_limit / (4 * 1024); // 2 merged log files + for i in 1..=region_count { + let kv_events = mock_build_large_kv_events(i as _, i as _, i as _); + task.on_events(kv_events).await.unwrap(); + } + // do_flush + task.set_flushing_status(true); + task.do_flush(1, TimeStamp::new(1)).await.unwrap(); + assert_eq!(task.flush_failure_count(), 0); + assert_eq!(task.files.read().await.is_empty(), true); + assert_eq!(task.flushing_files.read().await.is_empty(), true); + + // assert backup log files + let mut meta_count = 0; + let mut log_count = 0; + for entry in walkdir::WalkDir::new(tmp_dir.path()) { + let entry = entry.unwrap(); + if entry.path().extension() == Some(OsStr::new("meta")) { + meta_count += 1; + } else if entry.path().extension() == Some(OsStr::new("log")) { + log_count += 1; + } + } + assert_eq!(meta_count, 1); + assert_eq!(log_count, 2); + } + struct ErrorStorage { inner: Inner, error_on_write: Box io::Result<()> + Send + Sync>, @@ -1459,15 +1931,17 @@ mod tests { reader: UnpinReader, content_length: u64, ) -> io::Result<()> { - if let Err(e) = (self.error_on_write)() { - return Err(e); - } + (self.error_on_write)()?; self.inner.write(name, reader, content_length).await } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> ExternalData<'_> { self.inner.read(name) } + + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { + self.inner.read_part(name, off, len) + } } fn build_kv_event(base: i32, count: i32) -> ApplyEvents { @@ -1485,7 +1959,6 @@ mod tests { #[tokio::test] async fn test_flush_with_error() -> Result<()> { - test_util::init_log_for_test(); let (tx, _rx) = dummy_scheduler(); let tmp = std::env::temp_dir().join(format!("{}", 
uuid::Uuid::new_v4())); let router = Arc::new(RouterInner::new( @@ -1509,15 +1982,18 @@ mod tests { .is_none() ); check_on_events_result(&router.on_events(build_kv_event(10, 10)).await); - let _ = router.do_flush("error_prone", 42, TimeStamp::max()).await; let t = router.get_task_info("error_prone").await.unwrap(); + let _ = router.do_flush("error_prone", 42, TimeStamp::max()).await; + assert_eq!(t.total_size() > 0, true); + + t.set_flushing_status(true); + let _ = router.do_flush("error_prone", 42, TimeStamp::max()).await; assert_eq!(t.total_size(), 0); Ok(()) } #[tokio::test] async fn test_empty_resolved_ts() { - test_util::init_log_for_test(); let (tx, _rx) = dummy_scheduler(); let tmp = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); let router = RouterInner::new(tmp.clone(), tx, 32, Duration::from_secs(300)); @@ -1532,6 +2008,7 @@ mod tests { is_paused: false, }, vec![], + 0x100000, ) .await .unwrap(); @@ -1542,9 +2019,49 @@ mod tests { assert_eq!(ts.into_inner(), rts); } + #[tokio::test] + async fn test_cleanup_when_stop() -> Result<()> { + let (tx, _rx) = dummy_scheduler(); + let tmp = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); + let router = Arc::new(RouterInner::new( + tmp.clone(), + tx, + 1, + Duration::from_secs(300), + )); + let (task, _path) = task("cleanup_test".to_owned()).await?; + must_register_table(&router, task, 1).await; + write_simple_data(&router).await; + router + .get_task_info("cleanup_test") + .await? 
+ .move_to_flushing_files() + .await?; + write_simple_data(&router).await; + let mut w = walkdir::WalkDir::new(&tmp).into_iter(); + assert!(w.next().is_some(), "the temp files doesn't created"); + drop(router); + let w = walkdir::WalkDir::new(&tmp) + .into_iter() + .filter_map(|entry| { + let e = entry.unwrap(); + e.path() + .extension() + .filter(|x| x.to_string_lossy() == "log") + .map(|_| e.clone()) + }) + .collect::>(); + + assert!( + w.is_empty(), + "the temp files should be removed, but it is {:?}", + w + ); + Ok(()) + } + #[tokio::test] async fn test_flush_with_pausing_self() -> Result<()> { - test_util::init_log_for_test(); let (tx, rx) = dummy_scheduler(); let tmp = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); let router = Arc::new(RouterInner::new( @@ -1560,8 +2077,8 @@ mod tests { i.storage = Arc::new(ErrorStorage::with_always_error(i.storage.clone())) }) .await; - for i in 0..=16 { - check_on_events_result(&router.on_events(build_kv_event(i * 10, 10)).await); + for i in 0..=FLUSH_FAILURE_BECOME_FATAL_THRESHOLD { + check_on_events_result(&router.on_events(build_kv_event((i * 10) as _, 10)).await); assert_eq!( router .do_flush("flush_failure", 42, TimeStamp::zero()) @@ -1573,7 +2090,7 @@ mod tests { assert!( messages.iter().any(|task| { if let Task::FatalError(name, _err) = task { - return name == "flush_failure"; + return matches!(name.reference(), TaskSelectorRef::ByName("flush_failure")); } false }), @@ -1585,9 +2102,268 @@ mod tests { #[test] fn test_format_datetime() { - test_util::init_log_for_test(); - let s = TempFileKey::format_date_time(431656320867237891); + let s = TempFileKey::format_date_time(431656320867237891, FormatType::Date); let s = s.to_string(); assert_eq!(s, "20220307"); + + let s = TempFileKey::format_date_time(431656320867237891, FormatType::Hour); + assert_eq!(s.to_string(), "07"); + } + + #[test] + fn test_decode_begin_ts() { + let start_ts = TimeStamp::new(12345678); + let w = Write::new(WriteType::Put, 
start_ts, Some(b"short_value".to_vec())); + let value = w.as_ref().to_bytes(); + + let begin_ts = DataFile::decode_begin_ts(value).unwrap(); + assert_eq!(begin_ts, start_ts); + } + + #[test] + fn test_selector() { + type DummyTask<'a> = (&'a str, &'a [(&'a [u8], &'a [u8])]); + + #[derive(Debug, Clone, Copy)] + struct Case<'a /* 'static */> { + tasks: &'a [DummyTask<'a>], + selector: TaskSelectorRef<'a>, + selected: &'a [&'a str], + } + + let cases = [ + Case { + tasks: &[("Zhao", &[(b"", b"")]), ("Qian", &[(b"", b"")])], + selector: TaskSelectorRef::ByName("Zhao"), + selected: &["Zhao"], + }, + Case { + tasks: &[ + ("Zhao", &[(b"0001", b"1000"), (b"2000", b"")]), + ("Qian", &[(b"0002", b"1000")]), + ], + selector: TaskSelectorRef::ByKey(b"0001"), + selected: &["Zhao"], + }, + Case { + tasks: &[ + ("Zhao", &[(b"0001", b"1000"), (b"2000", b"")]), + ("Qian", &[(b"0002", b"1000")]), + ("Sun", &[(b"0004", b"1024")]), + ("Li", &[(b"1001", b"2048")]), + ], + selector: TaskSelectorRef::ByRange(b"1001", b"2000"), + selected: &["Sun", "Li"], + }, + Case { + tasks: &[ + ("Zhao", &[(b"0001", b"1000"), (b"2000", b"")]), + ("Qian", &[(b"0002", b"1000")]), + ("Sun", &[(b"0004", b"1024")]), + ("Li", &[(b"1001", b"2048")]), + ], + selector: TaskSelectorRef::All, + selected: &["Zhao", "Qian", "Sun", "Li"], + }, + ]; + + fn run(c: Case<'static>) { + assert!( + c.tasks + .iter() + .filter(|(name, range)| c.selector.matches(name, range.iter().copied())) + .map(|(name, _)| name) + .collect::>() + == c.selected.iter().collect::>(), + "case = {:?}", + c + ) + } + + for case in cases { + run(case) + } + } + + #[tokio::test] + async fn test_update_global_checkpoint() -> Result<()> { + // create local storage + let tmp_dir = tempfile::tempdir().unwrap(); + let backend = external_storage_export::make_local_backend(tmp_dir.path()); + + // build a StreamTaskInfo + let mut task_info = StreamBackupTaskInfo::default(); + task_info.set_storage(backend); + let stream_task = StreamTask { + info: 
task_info, + is_paused: false, + }; + let task = StreamTaskInfo::new( + tmp_dir.path().to_path_buf(), + stream_task, + vec![(vec![], vec![])], + 0x100000, + CompressionType::Zstd, + ) + .await + .unwrap(); + task.global_checkpoint_ts.store(10001, Ordering::SeqCst); + + // test no need to update global checkpoint + let store_id = 3; + let mut global_checkpoint = 10000; + let is_updated = task + .update_global_checkpoint(global_checkpoint, store_id) + .await?; + assert_eq!(is_updated, false); + assert_eq!(task.global_checkpoint_ts.load(Ordering::SeqCst), 10001); + + // test update global checkpoint + global_checkpoint = 10002; + let is_updated = task + .update_global_checkpoint(global_checkpoint, store_id) + .await?; + assert_eq!(is_updated, true); + assert_eq!( + task.global_checkpoint_ts.load(Ordering::SeqCst), + global_checkpoint + ); + + let filename = format!("v1/global_checkpoint/{}.ts", store_id); + let filepath = tmp_dir.as_ref().join(filename); + let exist = file_system::file_exists(filepath.clone()); + assert_eq!(exist, true); + + let buff = file_system::read(filepath).unwrap(); + assert_eq!(buff.len(), 8); + let mut ts = [b'0'; 8]; + ts.copy_from_slice(&buff); + let ts = u64::from_le_bytes(ts); + assert_eq!(ts, global_checkpoint); + Ok(()) + } + + struct MockCheckContentStorage { + s: NoopStorage, + } + + #[async_trait::async_trait] + impl ExternalStorage for MockCheckContentStorage { + fn name(&self) -> &'static str { + self.s.name() + } + + fn url(&self) -> io::Result { + self.s.url() + } + + async fn write( + &self, + _name: &str, + mut reader: UnpinReader, + content_length: u64, + ) -> io::Result<()> { + let mut data = Vec::new(); + reader.0.read_to_end(&mut data).await?; + let data_len: u64 = data.len() as _; + + if data_len == content_length { + Ok(()) + } else { + Err(io::Error::new( + io::ErrorKind::Other, + "the length of content in reader is not equal with content_length", + )) + } + } + + fn read(&self, name: &str) -> 
external_storage::ExternalData<'_> { + self.s.read(name) + } + + fn read_part(&self, name: &str, off: u64, len: u64) -> external_storage::ExternalData<'_> { + self.s.read_part(name, off, len) + } + } + + #[tokio::test] + async fn test_est_len_in_flush() -> Result<()> { + use tokio::io::AsyncWriteExt; + let noop_s = NoopStorage::default(); + let ms = MockCheckContentStorage { s: noop_s }; + let file_path = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); + let mut f = File::create(file_path.clone()).await?; + f.write_all("test-data".as_bytes()).await?; + + let data_file = DataFile::new(file_path, CompressionType::Zstd) + .await + .unwrap(); + let info = DataFileInfo::new(); + + let mut meta = MetadataInfo::with_capacity(1); + let kv_event = build_kv_event(1, 1); + let tmp_key = TempFileKey::of(&kv_event.events[0], 1); + let files = vec![(tmp_key, data_file, info)]; + let result = StreamTaskInfo::merge_and_flush_log_files_to( + Arc::new(ms), + &files[0..], + &mut meta, + false, + ) + .await; + assert_eq!(result.is_ok(), true); + Ok(()) + } + + #[test] + fn test_update_config() { + let (sched, rx) = dummy_scheduler(); + let cfg = BackupStreamConfig::default(); + let router = Arc::new(RouterInner::new( + PathBuf::new(), + sched.clone(), + 1, + cfg.max_flush_interval.0, + )); + + let mut cfg_manager = BackupStreamConfigManager::new(sched, cfg.clone()); + + let _new_cfg = BackupStreamConfig { + max_flush_interval: ReadableDuration::minutes(2), + ..Default::default() + }; + + let changed = cfg.diff(&_new_cfg); + cfg_manager.dispatch(changed).unwrap(); + + let cmds = collect_recv(rx); + assert_eq!(cmds.len(), 1); + match &cmds[0] { + Task::ChangeConfig(cfg) => { + assert!(matches!(cfg, _new_cfg)); + router.udpate_config(cfg); + assert_eq!( + router.max_flush_interval.rl().to_owned(), + _new_cfg.max_flush_interval.0 + ); + } + _ => panic!("unexpected cmd!"), + } + } + + #[test] + fn test_udpate_invalid_config() { + let cfg = BackupStreamConfig::default(); + 
let (sched, _) = dummy_scheduler(); + let mut cfg_manager = BackupStreamConfigManager::new(sched, cfg.clone()); + + let new_cfg = BackupStreamConfig { + max_flush_interval: ReadableDuration::secs(0), + ..Default::default() + }; + + let changed = cfg.diff(&new_cfg); + let r = cfg_manager.dispatch(changed); + assert!(r.is_err()); } } diff --git a/components/backup-stream/src/service.rs b/components/backup-stream/src/service.rs new file mode 100644 index 00000000000..43d4ede2f27 --- /dev/null +++ b/components/backup-stream/src/service.rs @@ -0,0 +1,109 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::collections::HashSet; + +use grpcio::RpcContext; +use kvproto::{logbackuppb::*, metapb::Region}; +use tikv_util::{warn, worker::Scheduler}; + +use crate::{ + checkpoint_manager::{GetCheckpointResult, RegionIdWithVersion}, + endpoint::{RegionCheckpointOperation, RegionSet}, + try_send, Task, +}; + +#[derive(Clone)] +pub struct Service { + endpoint: Scheduler, +} + +impl Service { + pub fn new(endpoint: Scheduler) -> Self { + Self { endpoint } + } +} + +fn id_of(region: &Region) -> RegionIdentity { + let mut id = RegionIdentity::new(); + id.set_id(region.get_id()); + id.set_epoch_version(region.get_region_epoch().get_version()); + id +} + +impl From for RegionIdentity { + fn from(val: RegionIdWithVersion) -> Self { + let mut id = RegionIdentity::new(); + id.set_id(val.region_id); + id.set_epoch_version(val.region_epoch_version); + id + } +} + +impl LogBackup for Service { + fn get_last_flush_ts_of_region( + &mut self, + _ctx: RpcContext<'_>, + mut req: GetLastFlushTsOfRegionRequest, + sink: grpcio::UnarySink, + ) { + let regions = req + .take_regions() + .into_iter() + .map(|id| (id.id, id.epoch_version)) + .collect::>(); + let t = Task::RegionCheckpointsOp(RegionCheckpointOperation::Get( + RegionSet::Regions(regions), + Box::new(move |rs| { + let mut resp = GetLastFlushTsOfRegionResponse::new(); + resp.set_checkpoints( + rs.into_iter() + 
.map(|r| match r { + GetCheckpointResult::Ok { region, checkpoint } => { + let mut r = RegionCheckpoint::new(); + let id = id_of(®ion); + r.set_region(id); + r.set_checkpoint(checkpoint.into_inner()); + r + } + GetCheckpointResult::NotFound { id, err } => { + let mut r = RegionCheckpoint::new(); + r.set_region(id.into()); + r.set_err(err); + r + } + GetCheckpointResult::EpochNotMatch { region, err } => { + let mut r = RegionCheckpoint::new(); + r.set_region(id_of(®ion)); + r.set_err(err); + r + } + }) + .collect(), + ); + tokio::spawn(async { + if let Err(e) = sink.success(resp).await { + warn!("failed to reply grpc resonse."; "err" => %e) + } + }); + }), + )); + try_send!(self.endpoint, t); + } + + fn subscribe_flush_event( + &mut self, + _ctx: grpcio::RpcContext<'_>, + _req: kvproto::logbackuppb::SubscribeFlushEventRequest, + #[allow(unused_variables)] sink: grpcio::ServerStreamingSink< + kvproto::logbackuppb::SubscribeFlushEventResponse, + >, + ) { + #[cfg(test)] + panic!("Service should not be used in an unit test"); + #[cfg(not(test))] + try_send!( + self.endpoint, + Task::RegionCheckpointsOp(RegionCheckpointOperation::Subscribe(sink)) + ); + } +} diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs new file mode 100644 index 00000000000..bf1a5552f71 --- /dev/null +++ b/components/backup-stream/src/subscription_manager.rs @@ -0,0 +1,835 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::Duration, +}; + +use crossbeam::channel::{Receiver as SyncReceiver, Sender as SyncSender}; +use crossbeam_channel::SendError; +use engine_traits::KvEngine; +use error_code::ErrorCodeExt; +use futures::FutureExt; +use kvproto::metapb::Region; +use pd_client::PdClient; +use raft::StateRole; +use raftstore::{ + coprocessor::{ObserveHandle, RegionInfoProvider}, + router::CdcHandle, + store::fsm::ChangeObserver, +}; +use tikv::storage::Statistics; +use tikv_util::{box_err, debug, info, time::Instant, warn, worker::Scheduler}; +use tokio::sync::mpsc::{channel, Receiver, Sender}; +use txn_types::TimeStamp; +use yatp::task::callback::Handle as YatpHandle; + +use crate::{ + annotate, + endpoint::{BackupStreamResolver, ObserveOp}, + errors::{Error, Result}, + event_loader::InitialDataLoader, + future, + metadata::{store::MetaStore, CheckpointProvider, MetadataClient}, + metrics, + observer::BackupStreamObserver, + router::{Router, TaskSelector}, + subscription_track::{CheckpointType, ResolveResult, SubscriptionTracer}, + try_send, + utils::{self, CallbackWaitGroup, Work}, + Task, +}; + +type ScanPool = yatp::ThreadPool; + +const INITIAL_SCAN_FAILURE_MAX_RETRY_TIME: usize = 10; + +// The retry parameters for failed to get last checkpoint ts. +// When PD is temporarily disconnected, we may need this retry. +// The total duration of retrying is about 345s ( 20 * 16 + 15 ), +// which is longer than the RPO promise. +const TRY_START_OBSERVE_MAX_RETRY_TIME: u8 = 24; +const RETRY_AWAIT_BASIC_DURATION: Duration = Duration::from_secs(1); +const RETRY_AWAIT_MAX_DURATION: Duration = Duration::from_secs(16); + +fn backoff_for_start_observe(failed_for: u8) -> Duration { + Ord::min( + RETRY_AWAIT_BASIC_DURATION * (1 << failed_for), + RETRY_AWAIT_MAX_DURATION, + ) +} + +/// a request for doing initial scanning. 
+struct ScanCmd { + region: Region, + handle: ObserveHandle, + last_checkpoint: TimeStamp, + _work: Work, +} + +/// The response of requesting resolve the new checkpoint of regions. +pub struct ResolvedRegions { + items: Vec, + checkpoint: TimeStamp, +} + +impl ResolvedRegions { + /// Compose the calculated global checkpoint and region checkpoints. + /// Note: Maybe we can compute the global checkpoint internal and getting + /// the interface clear. However we must take the `min_ts` or we cannot + /// provide valid global checkpoint if there isn't any region checkpoint. + pub fn new(checkpoint: TimeStamp, checkpoints: Vec) -> Self { + Self { + items: checkpoints, + checkpoint, + } + } + + /// take the region checkpoints from the structure. + #[deprecated = "please use `take_resolve_result` instead."] + pub fn take_region_checkpoints(&mut self) -> Vec<(Region, TimeStamp)> { + std::mem::take(&mut self.items) + .into_iter() + .map(|x| (x.region, x.checkpoint)) + .collect() + } + + /// take the resolve result from this struct. + pub fn take_resolve_result(&mut self) -> Vec { + std::mem::take(&mut self.items) + } + + /// get the global checkpoint. + pub fn global_checkpoint(&self) -> TimeStamp { + self.checkpoint + } +} + +/// returns whether the error should be retried. +/// for some errors, like `epoch not match` or `not leader`, +/// implies that the region is drifting, and no more need to be observed by us. +fn should_retry(err: &Error) -> bool { + match err.without_context() { + Error::RaftRequest(pbe) => { + !(pbe.has_epoch_not_match() + || pbe.has_not_leader() + || pbe.get_message().contains("stale observe id") + || pbe.has_region_not_found()) + } + Error::RaftStore(raftstore::Error::RegionNotFound(_)) + | Error::RaftStore(raftstore::Error::NotLeader(..)) + | Error::ObserveCanceled(..) + | Error::RaftStore(raftstore::Error::EpochNotMatch(..)) => false, + _ => true, + } +} + +/// the abstraction over a "DB" which provides the initial scanning. 
+trait InitialScan: Clone { + fn do_initial_scan( + &self, + region: &Region, + start_ts: TimeStamp, + handle: ObserveHandle, + ) -> Result; + + fn handle_fatal_error(&self, region: &Region, err: Error); +} + +impl InitialScan for InitialDataLoader +where + E: KvEngine, + R: RegionInfoProvider + Clone + 'static, + RT: CdcHandle, +{ + fn do_initial_scan( + &self, + region: &Region, + start_ts: TimeStamp, + handle: ObserveHandle, + ) -> Result { + let region_id = region.get_id(); + let h = handle.clone(); + // Note: we have external retry at `ScanCmd::exec_by_with_retry`, should we keep + // retrying here? + let snap = self.observe_over_with_retry(region, move || { + ChangeObserver::from_pitr(region_id, handle.clone()) + })?; + #[cfg(feature = "failpoints")] + fail::fail_point!("scan_after_get_snapshot"); + let stat = self.do_initial_scan(region, h, start_ts, snap)?; + Ok(stat) + } + + fn handle_fatal_error(&self, region: &Region, err: Error) { + try_send!( + self.scheduler, + Task::FatalError( + TaskSelector::ByRange( + region.get_start_key().to_owned(), + region.get_end_key().to_owned() + ), + Box::new(err), + ) + ); + } +} + +impl ScanCmd { + /// execute the initial scanning via the specificated [`InitialDataLoader`]. + fn exec_by(&self, initial_scan: impl InitialScan) -> Result<()> { + let Self { + region, + handle, + last_checkpoint, + .. + } = self; + let begin = Instant::now_coarse(); + let stat = initial_scan.do_initial_scan(region, *last_checkpoint, handle.clone())?; + info!("initial scanning finished!"; "takes" => ?begin.saturating_elapsed(), "from_ts" => %last_checkpoint, utils::slog_region(region)); + utils::record_cf_stat("lock", &stat.lock); + utils::record_cf_stat("write", &stat.write); + utils::record_cf_stat("default", &stat.data); + Ok(()) + } + + /// execute the command, when meeting error, retrying. 
+ fn exec_by_with_retry(self, init: impl InitialScan, cancel: &AtomicBool) { + let mut retry_time = INITIAL_SCAN_FAILURE_MAX_RETRY_TIME; + loop { + if cancel.load(Ordering::SeqCst) { + return; + } + match self.exec_by(init.clone()) { + Err(err) if should_retry(&err) && retry_time > 0 => { + // NOTE: blocking this thread may stick the process. + // Maybe spawn a task to tokio and reschedule the task then? + std::thread::sleep(Duration::from_millis(500)); + warn!("meet retryable error"; "err" => %err, "retry_time" => retry_time); + retry_time -= 1; + continue; + } + Err(err) if retry_time == 0 => { + init.handle_fatal_error(&self.region, err.context("retry time exceeds")); + break; + } + // Errors which `should_retry` returns false means they can be ignored. + Err(_) | Ok(_) => break, + } + } + } +} + +fn scan_executor_loop( + init: impl InitialScan, + cmds: SyncReceiver, + canceled: Arc, +) { + while let Ok(cmd) = cmds.recv() { + fail::fail_point!("execute_scan_command"); + debug!("handling initial scan request"; "region_id" => %cmd.region.get_id()); + metrics::PENDING_INITIAL_SCAN_LEN + .with_label_values(&["queuing"]) + .dec(); + if canceled.load(Ordering::Acquire) { + return; + } + + metrics::PENDING_INITIAL_SCAN_LEN + .with_label_values(&["executing"]) + .inc(); + cmd.exec_by_with_retry(init.clone(), &canceled); + metrics::PENDING_INITIAL_SCAN_LEN + .with_label_values(&["executing"]) + .dec(); + } +} + +/// spawn the executors in the scan pool. +/// we make workers thread instead of spawn scan task directly into the pool +/// because the [`InitialDataLoader`] isn't `Sync` hence we must use it very +/// carefully or rustc (along with tokio) would complain that we made a `!Send` +/// future. so we have moved the data loader to the synchronous context so its +/// reference won't be shared between threads any more. 
+fn spawn_executors(init: impl InitialScan + Send + 'static, number: usize) -> ScanPoolHandle { + let (tx, rx) = crossbeam::channel::bounded(MESSAGE_BUFFER_SIZE); + let pool = create_scan_pool(number); + let stopped = Arc::new(AtomicBool::new(false)); + for _ in 0..number { + let init = init.clone(); + let rx = rx.clone(); + let stopped = stopped.clone(); + pool.spawn(move |_: &mut YatpHandle<'_>| { + tikv_alloc::add_thread_memory_accessor(); + let _io_guard = file_system::WithIoType::new(file_system::IoType::Replication); + scan_executor_loop(init, rx, stopped); + tikv_alloc::remove_thread_memory_accessor(); + }) + } + ScanPoolHandle { + tx, + _pool: pool, + stopped, + } +} + +struct ScanPoolHandle { + tx: SyncSender, + stopped: Arc, + + // in fact, we won't use the pool any more. + // but we should hold the reference to the pool so it won't try to join the threads running. + _pool: ScanPool, +} + +impl Drop for ScanPoolHandle { + fn drop(&mut self) { + self.stopped.store(true, Ordering::Release); + } +} + +impl ScanPoolHandle { + fn request(&self, cmd: ScanCmd) -> std::result::Result<(), SendError> { + if self.stopped.load(Ordering::Acquire) { + warn!("scan pool is stopped, ignore the scan command"; "region" => %cmd.region.get_id()); + return Ok(()); + } + metrics::PENDING_INITIAL_SCAN_LEN + .with_label_values(&["queuing"]) + .inc(); + self.tx.send(cmd) + } +} + +/// The default channel size. +const MESSAGE_BUFFER_SIZE: usize = 32768; + +/// The operator for region subscription. +/// It make a queue for operations over the `SubscriptionTracer`, generally, +/// we should only modify the `SubscriptionTracer` itself (i.e. insert records, +/// remove records) at here. So the order subscription / desubscription won't be +/// broken. +pub struct RegionSubscriptionManager { + // Note: these fields appear everywhere, maybe make them a `context` type? 
+ regions: R, + meta_cli: MetadataClient, + pd_client: Arc, + range_router: Router, + scheduler: Scheduler, + observer: BackupStreamObserver, + subs: SubscriptionTracer, + + messenger: Sender, + scan_pool_handle: Arc, + scans: Arc, +} + +impl Clone for RegionSubscriptionManager +where + S: MetaStore + 'static, + R: RegionInfoProvider + Clone + 'static, + PDC: PdClient + 'static, +{ + fn clone(&self) -> Self { + Self { + regions: self.regions.clone(), + meta_cli: self.meta_cli.clone(), + // We should manually call Arc::clone here or rustc complains that `PDC` isn't `Clone`. + pd_client: Arc::clone(&self.pd_client), + range_router: self.range_router.clone(), + scheduler: self.scheduler.clone(), + observer: self.observer.clone(), + subs: self.subs.clone(), + messenger: self.messenger.clone(), + scan_pool_handle: self.scan_pool_handle.clone(), + scans: CallbackWaitGroup::new(), + } + } +} + +/// Create a yatp pool for doing initial scanning. +fn create_scan_pool(num_threads: usize) -> ScanPool { + yatp::Builder::new("log-backup-scan") + .max_thread_count(num_threads) + .build_callback_pool() +} + +impl RegionSubscriptionManager +where + S: MetaStore + 'static, + R: RegionInfoProvider + Clone + 'static, + PDC: PdClient + 'static, +{ + /// create a [`RegionSubscriptionManager`]. + /// + /// # returns + /// + /// a two-tuple, the first is the handle to the manager, the second is the + /// operator loop future. 
+ pub fn start( + initial_loader: InitialDataLoader, + observer: BackupStreamObserver, + meta_cli: MetadataClient, + pd_client: Arc, + scan_pool_size: usize, + resolver: BackupStreamResolver, + ) -> (Self, future![()]) + where + E: KvEngine, + RT: CdcHandle + 'static, + { + let (tx, rx) = channel(MESSAGE_BUFFER_SIZE); + let scan_pool_handle = spawn_executors(initial_loader.clone(), scan_pool_size); + let op = Self { + regions: initial_loader.regions.clone(), + meta_cli, + pd_client, + range_router: initial_loader.sink.clone(), + scheduler: initial_loader.scheduler.clone(), + observer, + subs: initial_loader.tracing, + messenger: tx, + scan_pool_handle: Arc::new(scan_pool_handle), + scans: CallbackWaitGroup::new(), + }; + let fut = op.clone().region_operator_loop(rx, resolver); + (op, fut) + } + + /// send an operation request to the manager. + /// the returned future would be resolved after send is success. + /// the opeartion would be executed asynchronously. + pub async fn request(&self, op: ObserveOp) { + if let Err(err) = self.messenger.send(op).await { + annotate!(err, "BUG: region operator channel closed.") + .report("when executing region op"); + } + } + + /// wait initial scanning get finished. + pub fn wait(&self, timeout: Duration) -> future![bool] { + tokio::time::timeout(timeout, self.scans.wait()).map(|result| result.is_err()) + } + + /// the handler loop. + async fn region_operator_loop( + self, + mut message_box: Receiver, + mut resolver: BackupStreamResolver, + ) where + E: KvEngine, + RT: CdcHandle + 'static, + { + while let Some(op) = message_box.recv().await { + // Skip some trivial resolve commands. + if !matches!(op, ObserveOp::ResolveRegions { .. 
}) { + info!("backup stream: on_modify_observe"; "op" => ?op); + } + match op { + ObserveOp::Start { region } => { + fail::fail_point!("delay_on_start_observe"); + self.start_observe(region).await; + metrics::INITIAL_SCAN_REASON + .with_label_values(&["leader-changed"]) + .inc(); + } + ObserveOp::Stop { ref region } => { + self.subs.deregister_region_if(region, |_, _| true); + } + ObserveOp::Destroy { ref region } => { + self.subs.deregister_region_if(region, |old, new| { + raftstore::store::util::compare_region_epoch( + old.meta.get_region_epoch(), + new, + true, + true, + false, + ) + .map_err(|err| warn!("check epoch and stop failed."; utils::slog_region(region), "err" => %err)) + .is_ok() + }); + } + ObserveOp::RefreshResolver { ref region } => self.refresh_resolver(region).await, + ObserveOp::NotifyFailToStartObserve { + region, + handle, + err, + has_failed_for, + } => { + info!("retry observe region"; "region" => %region.get_id(), "err" => %err); + // No need for retrying observe canceled. 
+ if err.error_code() == error_code::backup_stream::OBSERVE_CANCELED { + return; + } + let (start, end) = ( + region.get_start_key().to_owned(), + region.get_end_key().to_owned(), + ); + match self.retry_observe(region, handle, has_failed_for).await { + Ok(()) => {} + Err(e) => { + let msg = Task::FatalError( + TaskSelector::ByRange(start, end), + Box::new(Error::Contextual { + context: format!("retry meet error, origin error is {}", err), + inner_error: Box::new(e), + }), + ); + try_send!(self.scheduler, msg); + } + } + } + ObserveOp::ResolveRegions { callback, min_ts } => { + let now = Instant::now(); + let timedout = self.wait(Duration::from_secs(5)).await; + if timedout { + warn!("waiting for initial scanning done timed out, forcing progress!"; + "take" => ?now.saturating_elapsed(), "timedout" => %timedout); + } + let regions = resolver.resolve(self.subs.current_regions(), min_ts).await; + let cps = self.subs.resolve_with(min_ts, regions); + let min_region = cps.iter().min_by_key(|rs| rs.checkpoint); + // If there isn't any region observed, the `min_ts` can be used as resolved ts + // safely. 
+ let rts = min_region.map(|rs| rs.checkpoint).unwrap_or(min_ts); + if min_region + .map(|mr| mr.checkpoint_type != CheckpointType::MinTs) + .unwrap_or(false) + { + info!("getting non-trivial checkpoint"; "defined_by_region" => ?min_region); + } + callback(ResolvedRegions::new(rts, cps)); + } + } + } + } + + async fn refresh_resolver(&self, region: &Region) { + let need_refresh_all = !self.subs.try_update_region(region); + + if need_refresh_all { + let canceled = self.subs.deregister_region_if(region, |_, _| true); + let handle = ObserveHandle::new(); + if canceled { + if let Some(for_task) = self.find_task_by_region(region) { + metrics::INITIAL_SCAN_REASON + .with_label_values(&["region-changed"]) + .inc(); + let r = async { + self.subs.add_pending_region(region); + self.observe_over_with_initial_data_from_checkpoint( + region, + self.get_last_checkpoint_of(&for_task, region).await?, + handle.clone(), + ); + Result::Ok(()) + } + .await; + if let Err(e) = r { + try_send!( + self.scheduler, + Task::ModifyObserve(ObserveOp::NotifyFailToStartObserve { + region: region.clone(), + handle, + err: Box::new(e), + has_failed_for: 0, + }) + ); + } + } else { + warn!( + "BUG: the region {:?} is register to no task but being observed", + utils::debug_region(region) + ); + } + } + } + } + + async fn try_start_observe(&self, region: &Region, handle: ObserveHandle) -> Result<()> { + match self.find_task_by_region(region) { + None => { + warn!( + "the region {:?} is register to no task but being observed (start_key = {}; end_key = {}; task_stat = {:?}): maybe stale, aborting", + region, + utils::redact(®ion.get_start_key()), + utils::redact(®ion.get_end_key()), + self.range_router + ); + } + + Some(for_task) => { + // the extra failpoint is used to pause the thread. + // once it triggered "pause" it cannot trigger early return then. 
+ fail::fail_point!("try_start_observe0"); + fail::fail_point!("try_start_observe", |_| { + Err(Error::Other(box_err!("Nature is boring"))) + }); + let tso = self.get_last_checkpoint_of(&for_task, region).await?; + self.observe_over_with_initial_data_from_checkpoint(region, tso, handle.clone()); + } + } + Ok(()) + } + + async fn start_observe(&self, region: Region) { + self.start_observe_with_failure_count(region, 0).await + } + + async fn start_observe_with_failure_count(&self, region: Region, has_failed_for: u8) { + let handle = ObserveHandle::new(); + let schd = self.scheduler.clone(); + self.subs.add_pending_region(®ion); + if let Err(err) = self.try_start_observe(®ion, handle.clone()).await { + warn!("failed to start observe, would retry"; "err" => %err, utils::slog_region(®ion)); + tokio::spawn(async move { + #[cfg(not(feature = "failpoints"))] + let delay = backoff_for_start_observe(has_failed_for); + #[cfg(feature = "failpoints")] + let delay = (|| { + fail::fail_point!("subscribe_mgr_retry_start_observe_delay", |v| { + let dur = v + .expect("should provide delay time (in ms)") + .parse::() + .expect("should be number (in ms)"); + Duration::from_millis(dur) + }); + backoff_for_start_observe(has_failed_for) + })(); + tokio::time::sleep(delay).await; + try_send!( + schd, + Task::ModifyObserve(ObserveOp::NotifyFailToStartObserve { + region, + handle, + err: Box::new(err), + has_failed_for: has_failed_for + 1 + }) + ) + }); + } + } + + async fn retry_observe( + &self, + region: Region, + handle: ObserveHandle, + failure_count: u8, + ) -> Result<()> { + if failure_count > TRY_START_OBSERVE_MAX_RETRY_TIME { + return Err(Error::Other( + format!( + "retry time exceeds for region {:?}", + utils::debug_region(®ion) + ) + .into(), + )); + } + + let (tx, rx) = crossbeam::channel::bounded(1); + self.regions + .find_region_by_id( + region.get_id(), + Box::new(move |item| { + tx.send(item) + .expect("BUG: failed to send to newly created channel."); + }), + ) + 
.map_err(|err| { + annotate!( + err, + "failed to send request to region info accessor, server maybe too too too busy. (region id = {})", + region.get_id() + ) + })?; + let new_region_info = rx + .recv() + .map_err(|err| annotate!(err, "BUG?: unexpected channel message dropped."))?; + if new_region_info.is_none() { + metrics::SKIP_RETRY + .with_label_values(&["region-absent"]) + .inc(); + return Ok(()); + } + let new_region_info = new_region_info.unwrap(); + if new_region_info.role != StateRole::Leader { + metrics::SKIP_RETRY.with_label_values(&["not-leader"]).inc(); + return Ok(()); + } + // Note: we may fail before we insert the region info to the subscription map. + // At that time, the command isn't steal and we should retry it. + let mut exists = false; + let removed = self.subs.deregister_region_if(®ion, |old, _| { + exists = true; + let should_remove = old.handle().id == handle.id; + if !should_remove { + warn!("stale retry command"; utils::slog_region(®ion), "handle" => ?handle, "old_handle" => ?old.handle()); + } + should_remove + }); + if !removed && exists { + metrics::SKIP_RETRY + .with_label_values(&["stale-command"]) + .inc(); + return Ok(()); + } + metrics::INITIAL_SCAN_REASON + .with_label_values(&["retry"]) + .inc(); + self.start_observe_with_failure_count(region, failure_count) + .await; + Ok(()) + } + + async fn get_last_checkpoint_of(&self, task: &str, region: &Region) -> Result { + fail::fail_point!("get_last_checkpoint_of", |hint| Err(Error::Other( + box_err!( + "get_last_checkpoint_of({}, {:?}) failed because {:?}", + task, + region, + hint + ) + ))); + let meta_cli = self.meta_cli.clone(); + let cp = meta_cli.get_region_checkpoint(task, region).await?; + debug!("got region checkpoint"; "region_id" => %region.get_id(), "checkpoint" => ?cp); + if matches!(cp.provider, CheckpointProvider::Global) { + metrics::STORE_CHECKPOINT_TS + .with_label_values(&[task]) + .set(cp.ts.into_inner() as _); + } + Ok(cp.ts) + } + + fn spawn_scan(&self, cmd: 
ScanCmd) { + // we should not spawn initial scanning tasks to the tokio blocking pool + // because it is also used for converting sync File I/O to async. (for now!) + // In that condition, if we blocking for some resources(for example, the + // `MemoryQuota`) at the block threads, we may meet some ghosty + // deadlock. + let s = self.scan_pool_handle.request(cmd); + if let Err(err) = s { + let region_id = err.0.region.get_id(); + annotate!(err, "BUG: scan_pool closed") + .report(format!("during initial scanning for region {}", region_id)); + } + } + + fn observe_over_with_initial_data_from_checkpoint( + &self, + region: &Region, + last_checkpoint: TimeStamp, + handle: ObserveHandle, + ) { + self.subs + .register_region(region, handle.clone(), Some(last_checkpoint)); + self.spawn_scan(ScanCmd { + region: region.clone(), + handle, + last_checkpoint, + _work: self.scans.clone().work(), + }) + } + + fn find_task_by_region(&self, r: &Region) -> Option { + self.range_router + .find_task_by_range(&r.start_key, &r.end_key) + } +} + +#[cfg(test)] +mod test { + use kvproto::metapb::Region; + use tikv::storage::Statistics; + + use super::InitialScan; + + #[derive(Clone, Copy)] + struct NoopInitialScan; + + impl InitialScan for NoopInitialScan { + fn do_initial_scan( + &self, + _region: &Region, + _start_ts: txn_types::TimeStamp, + _handle: raftstore::coprocessor::ObserveHandle, + ) -> crate::errors::Result { + Ok(Statistics::default()) + } + + fn handle_fatal_error(&self, region: &Region, err: crate::errors::Error) { + panic!("fatal {:?} {}", region, err) + } + } + + #[test] + #[cfg(feature = "failpoints")] + fn test_message_delay_and_exit() { + use std::time::Duration; + + use super::ScanCmd; + use crate::{subscription_manager::spawn_executors, utils::CallbackWaitGroup}; + + fn should_finish_in(f: impl FnOnce() + Send + 'static, d: std::time::Duration) { + let (tx, rx) = futures::channel::oneshot::channel(); + std::thread::spawn(move || { + f(); + tx.send(()).unwrap(); + }); 
+ let pool = tokio::runtime::Builder::new_current_thread() + .enable_time() + .build() + .unwrap(); + let _e = pool.handle().enter(); + pool.block_on(tokio::time::timeout(d, rx)).unwrap().unwrap(); + } + + let pool = spawn_executors(NoopInitialScan, 1); + let wg = CallbackWaitGroup::new(); + fail::cfg("execute_scan_command", "sleep(100)").unwrap(); + for _ in 0..100 { + let wg = wg.clone(); + pool.request(ScanCmd { + region: Default::default(), + handle: Default::default(), + last_checkpoint: Default::default(), + // Note: Maybe make here a Box or some other trait? + _work: wg.work(), + }) + .unwrap() + } + + should_finish_in(move || drop(pool), Duration::from_secs(5)); + } + + #[test] + fn test_backoff_for_start_observe() { + assert_eq!( + super::backoff_for_start_observe(0), + super::RETRY_AWAIT_BASIC_DURATION + ); + assert_eq!( + super::backoff_for_start_observe(1), + super::RETRY_AWAIT_BASIC_DURATION * 2 + ); + assert_eq!( + super::backoff_for_start_observe(2), + super::RETRY_AWAIT_BASIC_DURATION * 4 + ); + assert_eq!( + super::backoff_for_start_observe(3), + super::RETRY_AWAIT_BASIC_DURATION * 8 + ); + assert_eq!( + super::backoff_for_start_observe(4), + super::RETRY_AWAIT_MAX_DURATION + ); + assert_eq!( + super::backoff_for_start_observe(5), + super::RETRY_AWAIT_MAX_DURATION + ); + } +} diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index f3852fe9782..7fee1b1b438 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -1,8 +1,11 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{sync::Arc, time::Duration}; +use std::{collections::HashSet, sync::Arc}; -use dashmap::{mapref::one::RefMut, DashMap}; +use dashmap::{ + mapref::{entry::Entry, one::RefMut as DashRefMut}, + DashMap, +}; use kvproto::metapb::Region; use raftstore::coprocessor::*; use resolved_ts::Resolver; @@ -13,15 +16,50 @@ use crate::{debug, metrics::TRACK_REGION, utils}; /// A utility to tracing the regions being subscripted. #[derive(Clone, Default, Debug)] -pub struct SubscriptionTracer(Arc>); +pub struct SubscriptionTracer(Arc>); + +/// The state of the subscription state machine: +/// Initial state is `ABSENT`, the subscription isn't in the tracer. +/// Once it becomes the leader, it would be in `PENDING` state, where we would +/// prepare the information needed for doing initial scanning. +/// When we are able to start execute initial scanning, it would be in `RUNNING` +/// state, where it starts to handle events. +/// You may notice there are also some state transforms in the +/// [`TwoPhaseResolver`] struct, states there are sub-states of the `RUNNING` +/// stage here. +enum SubscribeState { + // NOTE: shall we add `SubscriptionHandle` here? + // (So we can check this when calling `remove_if`.) + Pending(Region), + Running(ActiveSubscription), +} + +impl SubscribeState { + /// check whether the current state is pending. 
+ fn is_pending(&self) -> bool { + matches!(self, SubscribeState::Pending(_)) + } +} -pub struct RegionSubscription { +impl std::fmt::Debug for SubscribeState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Pending(arg0) => f + .debug_tuple("Pending") + .field(&utils::debug_region(arg0)) + .finish(), + Self::Running(arg0) => f.debug_tuple("Running").field(arg0).finish(), + } + } +} + +pub struct ActiveSubscription { pub meta: Region, pub(crate) handle: ObserveHandle, - resolver: TwoPhaseResolver, + pub(crate) resolver: TwoPhaseResolver, } -impl std::fmt::Debug for RegionSubscription { +impl std::fmt::Debug for ActiveSubscription { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_tuple("RegionSubscription") .field(&self.meta.get_id()) @@ -30,7 +68,7 @@ impl std::fmt::Debug for RegionSubscription { } } -impl RegionSubscription { +impl ActiveSubscription { pub fn new(region: Region, handle: ObserveHandle, start_ts: Option) -> Self { let resolver = TwoPhaseResolver::new(region.get_id(), start_ts); Self { @@ -40,8 +78,8 @@ impl RegionSubscription { } } - pub fn stop_observing(&self) { - self.handle.stop_observing() + pub fn stop(&mut self) { + self.handle.stop_observing(); } pub fn is_observing(&self) -> bool { @@ -57,101 +95,202 @@ impl RegionSubscription { } } -impl SubscriptionTracer { - /// get the current safe point: data before this ts have already be flushed and be able to be GCed. - pub fn safepoint(&self) -> TimeStamp { - // use the current resolved_ts is safe because it is only advanced when flushing. - self.0 - .iter() - .map(|r| r.resolver.resolved_ts()) - .min() - // NOTE: Maybe use the current timestamp? 
- .unwrap_or(TimeStamp::zero()) +#[derive(PartialEq, Eq)] +pub enum CheckpointType { + MinTs, + StartTsOfInitialScan, + StartTsOfTxn(Option>), +} + +impl std::fmt::Debug for CheckpointType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::MinTs => write!(f, "MinTs"), + Self::StartTsOfInitialScan => write!(f, "StartTsOfInitialScan"), + Self::StartTsOfTxn(arg0) => f + .debug_tuple("StartTsOfTxn") + .field(&format_args!( + "{}", + utils::redact(&arg0.as_ref().map(|x| x.as_ref()).unwrap_or(&[])) + )) + .finish(), + } + } +} + +pub struct ResolveResult { + pub region: Region, + pub checkpoint: TimeStamp, + pub checkpoint_type: CheckpointType, +} + +impl std::fmt::Debug for ResolveResult { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ResolveResult") + .field("region", &self.region.get_id()) + .field("checkpoint", &self.checkpoint) + .field("checkpoint_type", &self.checkpoint_type) + .finish() } +} +impl ResolveResult { + fn resolve(sub: &mut ActiveSubscription, min_ts: TimeStamp) -> Self { + let ts = sub.resolver.resolve(min_ts); + let ty = if ts == min_ts { + CheckpointType::MinTs + } else if sub.resolver.in_phase_one() { + CheckpointType::StartTsOfInitialScan + } else { + CheckpointType::StartTsOfTxn(sub.resolver.sample_far_lock()) + }; + Self { + region: sub.meta.clone(), + checkpoint: ts, + checkpoint_type: ty, + } + } +} + +impl SubscriptionTracer { /// clear the current `SubscriptionTracer`. pub fn clear(&self) { self.0.retain(|_, v| { - v.stop_observing(); - TRACK_REGION.with_label_values(&["dec"]).inc(); + if let SubscribeState::Running(s) = v { + s.stop(); + TRACK_REGION.dec(); + } false }); } + /// Add a pending region into the tracker. + /// A `PENDING` region is a region we are going to start subscribe however + /// there are still tiny impure things need to do. (e.g. getting the + /// checkpoint of this region.) 
+ /// + /// This state is a placeholder for those regions: once they failed in the + /// impure operations, this would be the evidence proofing they were here. + /// + /// So we can do better when we are doing refreshing, say: + /// ```no_run + /// match task { + /// Task::RefreshObserve(r) if is_pending(r) => { /* Execute the refresh. */ } + /// Task::RefreshObserve(r) if is_absent(r) => { /* Do nothing. Maybe stale. */ } + /// } + /// ``` + /// + /// We should execute the refresh when it is pending, because the start may + /// fail and then a refresh fires. + /// We should skip when we are going to refresh absent regions because there + /// may be some stale commands. + pub fn add_pending_region(&self, region: &Region) { + let r = self + .0 + .insert(region.get_id(), SubscribeState::Pending(region.clone())); + if let Some(s) = r { + warn!( + "excepted state transform: running | pending -> pending"; + "old" => ?s, utils::slog_region(region), + ) + } + } + // Register a region as tracing. // The `start_ts` is used to tracking the progress of initial scanning. - // (Note: the `None` case of `start_ts` is for testing / refresh region status when split / merge, - // maybe we'd better provide some special API for those cases and remove the `Option`?) + // Note: the `None` case of `start_ts` is for testing / refresh region status + // when split / merge, maybe we'd better provide some special API for those + // cases and remove the `Option`? 
pub fn register_region( &self, region: &Region, handle: ObserveHandle, start_ts: Option, ) { - info!("start listen stream from store"; "observer" => ?handle, "region_id" => %region.get_id()); - TRACK_REGION.with_label_values(&["inc"]).inc(); - if let Some(o) = self.0.insert( - region.get_id(), - RegionSubscription::new(region.clone(), handle, start_ts), - ) { - TRACK_REGION.with_label_values(&["dec"]).inc(); - warn!("register region which is already registered"; "region_id" => %region.get_id()); - o.stop_observing(); + info!("start listen stream from store"; "observer" => ?handle); + TRACK_REGION.inc(); + let e = self.0.entry(region.id); + match e { + Entry::Occupied(o) => { + let sub = ActiveSubscription::new(region.clone(), handle, start_ts); + let (_, s) = o.replace_entry(SubscribeState::Running(sub)); + if !s.is_pending() { + // If there is another subscription already (perhaps repeated Start), + // don't add the counter. + warn!("excepted state transform: running -> running"; "old" => ?s, utils::slog_region(region)); + TRACK_REGION.dec(); + } + } + Entry::Vacant(e) => { + warn!("excepted state transform: absent -> running"; utils::slog_region(region)); + let sub = ActiveSubscription::new(region.clone(), handle, start_ts); + e.insert(SubscribeState::Running(sub)); + } } } + pub fn current_regions(&self) -> Vec { + self.0.iter().map(|s| *s.key()).collect() + } + /// try advance the resolved ts with the min ts of in-memory locks. - pub fn resolve_with(&self, min_ts: TimeStamp) -> TimeStamp { + /// returns the regions and theirs resolved ts. + pub fn resolve_with( + &self, + min_ts: TimeStamp, + regions: impl IntoIterator, + ) -> Vec { + let rs = regions.into_iter().collect::>(); self.0 .iter_mut() - .map(|mut s| s.resolver.resolve(min_ts)) - .min() - // If there isn't any region observed, the `min_ts` can be used as resolved ts safely. 
- .unwrap_or(min_ts) - } - - #[inline(always)] - pub fn warn_if_gap_too_huge(&self, ts: TimeStamp) { - let gap = TimeStamp::physical_now() - ts.physical(); - if gap >= 10 * 60 * 1000 - /* 10 mins */ - { - let far_resolver = self - .0 - .iter() - .min_by_key(|r| r.value().resolver.resolved_ts()); - warn!("log backup resolver ts advancing too slow"; - "far_resolver" => %{match far_resolver { - Some(r) => format!("{:?}", r.value().resolver), - None => "BUG[NoResolverButResolvedTSDoesNotAdvance]".to_owned() - }}, - "gap" => ?Duration::from_millis(gap), - ); - } + // Don't advance the checkpoint ts of pending region. + .filter_map(|mut s| { + let region_id = *s.key(); + match s.value_mut() { + SubscribeState::Running(sub) => { + let contains = rs.contains(®ion_id); + if !contains { + crate::metrics::MISC_EVENTS.skip_resolve_non_leader.inc(); + } + contains.then(|| ResolveResult::resolve(sub, min_ts)) + } + SubscribeState::Pending(r) => {warn!("pending region, skip resolving"; utils::slog_region(r)); None}, + } + }) + .collect() } /// try to mark a region no longer be tracked by this observer. - /// returns whether success (it failed if the region hasn't been observed when calling this.) - pub fn deregister_region( + /// returns whether success (it failed if the region hasn't been observed + /// when calling this.) 
+ pub fn deregister_region_if( &self, region: &Region, - if_cond: impl FnOnce(&RegionSubscription, &Region) -> bool, + if_cond: impl FnOnce(&ActiveSubscription, &Region) -> bool, ) -> bool { let region_id = region.get_id(); - let remove_result = self - .0 - .remove_if(®ion_id, |_, old_region| if_cond(old_region, region)); + let remove_result = self.0.entry(region_id); match remove_result { - Some(o) => { - TRACK_REGION.with_label_values(&["dec"]).inc(); - o.1.stop_observing(); - info!("stop listen stream from store"; "observer" => ?o.1, "region_id"=> %region_id); - true - } - None => { - warn!("trying to deregister region not registered"; "region_id" => %region_id); - false - } + Entry::Vacant(_) => false, + Entry::Occupied(mut o) => match o.get_mut() { + SubscribeState::Pending(r) => { + info!("remove pending subscription"; "region_id"=> %region_id, utils::slog_region(r)); + + o.remove(); + true + } + SubscribeState::Running(s) => { + if if_cond(s, region) { + TRACK_REGION.dec(); + s.stop(); + info!("stop listen stream from store"; "observer" => ?s, "region_id"=> %region_id); + + o.remove(); + return true; + } + false + } + }, } } @@ -159,13 +298,14 @@ impl SubscriptionTracer { /// /// # return /// - /// Whether the status can be updated internally without deregister-and-register. + /// Whether the status can be updated internally without + /// deregister-and-register. pub fn try_update_region(&self, new_region: &Region) -> bool { let mut sub = match self.get_subscription_of(new_region.get_id()) { Some(sub) => sub, None => { - warn!("backup stream observer refreshing void subscription."; "new_region" => ?new_region); - return true; + warn!("backup stream observer refreshing pending / absent subscription."; utils::slog_region(new_region)); + return false; } }; @@ -183,32 +323,87 @@ impl SubscriptionTracer { /// check whether the region_id should be observed by this observer. 
pub fn is_observing(&self, region_id: u64) -> bool { - let mut exists = false; - - // The region traced, check it whether is still be observing, - // if not, remove it. - let still_observing = self - .0 - // Assuming this closure would be called iff the key exists. - // So we can elide a `contains` check. - .remove_if(®ion_id, |_, o| { - exists = true; - !o.is_observing() - }) - .is_none(); - exists && still_observing + let sub = self.0.get_mut(®ion_id); + match sub { + Some(mut s) => match s.value_mut() { + SubscribeState::Pending(_) => false, + SubscribeState::Running(s) => s.is_observing(), + }, + None => false, + } } pub fn get_subscription_of( &self, region_id: u64, - ) -> Option> { - self.0.get_mut(®ion_id) + ) -> Option + '_> { + self.0 + .get_mut(®ion_id) + .and_then(|x| SubscriptionRef::try_from_dash(x)) } } -/// This enhanced version of `Resolver` allow some unorder of lock events. -/// The name "2-phase" means this is used for 2 *concurrency* phases of observing a region: +pub trait Ref { + type Key; + type Value; + + fn key(&self) -> &Self::Key; + fn value(&self) -> &Self::Value; +} + +pub trait RefMut: Ref { + fn value_mut(&mut self) -> &mut ::Value; +} + +impl<'a> Ref for SubscriptionRef<'a> { + type Key = u64; + type Value = ActiveSubscription; + + fn key(&self) -> &Self::Key { + DashRefMut::key(&self.0) + } + + fn value(&self) -> &Self::Value { + self.sub() + } +} + +impl<'a> RefMut for SubscriptionRef<'a> { + fn value_mut(&mut self) -> &mut ::Value { + self.sub_mut() + } +} + +struct SubscriptionRef<'a>(DashRefMut<'a, u64, SubscribeState>); + +impl<'a> SubscriptionRef<'a> { + fn try_from_dash(mut d: DashRefMut<'a, u64, SubscribeState>) -> Option { + match d.value_mut() { + SubscribeState::Pending(_) => None, + SubscribeState::Running(_) => Some(Self(d)), + } + } + + fn sub(&self) -> &ActiveSubscription { + match self.0.value() { + // Panic Safety: the constructor would prevent us from creating pending subscription + // ref. 
+ SubscribeState::Pending(_) => unreachable!(), + SubscribeState::Running(s) => s, + } + } + + fn sub_mut(&mut self) -> &mut ActiveSubscription { + match self.0.value_mut() { + SubscribeState::Pending(_) => unreachable!(), + SubscribeState::Running(s) => s, + } + } +} + +/// This enhanced version of `Resolver` allow some unordered lock events. +/// The name "2-phase" means this is used for 2 *concurrency* phases of +/// observing a region: /// 1. Doing the initial scanning. /// 2. Listening at the incremental data. /// @@ -216,29 +411,35 @@ impl SubscriptionTracer { /// +->(Start TS Of Task) +->(Task registered to KV) /// +--------------------------------+------------------------> /// ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ ^~~~~~~~~~~~~~~~~~~~~~~~~ -/// | +-> Phase 2: Listening incremtnal data. +/// | +-> Phase 2: Listening incremental data. /// +-> Phase 1: Initial scanning scans writes between start ts and now. /// ``` /// -/// In backup-stream, we execute these two tasks parallelly. Which may make some race conditions: -/// - When doing initial scanning, there may be a flush triggered, but the defult resolver -/// would probably resolved to the tip of incremental events. -/// - When doing initial scanning, we meet and track a lock already meet by the incremental events, -/// then the default resolver cannot untrack this lock any more. +/// In backup-stream, we execute these two tasks parallel. Which may make some +/// race conditions: +/// - When doing initial scanning, there may be a flush triggered, but the +/// default resolver would probably resolved to the tip of incremental events. +/// - When doing initial scanning, we meet and track a lock already meet by the +/// incremental events, then the default resolver cannot untrack this lock any +/// more. /// -/// This version of resolver did some change for solve these problmes: -/// - The resolver won't advance the resolved ts to greater than `stable_ts` if there is some. 
This -/// can help us prevent resolved ts from advancing when initial scanning hasn't finished yet. -/// - When we `untrack` a lock haven't been tracked, this would record it, and skip this lock if we want to track it then. -/// This would be safe because: +/// This version of resolver did some change for solve these problems: +/// - The resolver won't advance the resolved ts to greater than `stable_ts` if +/// there is some. This can help us prevent resolved ts from advancing when +/// initial scanning hasn't finished yet. +/// - When we `untrack` a lock haven't been tracked, this would record it, and +/// skip this lock if we want to track it then. This would be safe because: /// - untracking a lock not be tracked is no-op for now. -/// - tracking a lock have already being untracked (unordered call of `track` and `untrack`) wouldn't happen at phase 2 for same region. -/// but only when phase 1 and phase 2 happend concurrently, at that time, we wouldn't and cannot advance the resolved ts. +/// - tracking a lock have already being untracked (unordered call of `track` +/// and `untrack`) wouldn't happen at phase 2 for same region. but only when +/// phase 1 and phase 2 happened concurrently, at that time, we wouldn't and +/// cannot advance the resolved ts. pub struct TwoPhaseResolver { resolver: Resolver, future_locks: Vec, /// When `Some`, is the start ts of the initial scanning. - /// And implies the phase 1 (initial scanning) is keep running asynchronously. + /// And implies the phase 1 (initial scanning) is keep running + /// asynchronously. stable_ts: Option, } @@ -264,6 +465,12 @@ impl std::fmt::Debug for FutureLock { } impl TwoPhaseResolver { + /// try to get one of the key of the oldest lock in the resolver. 
+ pub fn sample_far_lock(&self) -> Option> { + let (_, keys) = self.resolver.locks().first_key_value()?; + keys.iter().next().cloned() + } + pub fn in_phase_one(&self) -> bool { self.stable_ts.is_some() } @@ -328,7 +535,18 @@ impl TwoPhaseResolver { for lock in std::mem::take(&mut self.future_locks).into_iter() { self.handle_future_lock(lock); } - self.stable_ts = None + let ts = self.stable_ts.take(); + match ts { + Some(ts) => { + // advance the internal resolver. + // the start ts of initial scanning would be a safe ts for min ts + // -- because is used to be a resolved ts. + self.resolver.resolve(ts); + } + None => { + warn!("BUG: a two-phase resolver is executing phase_one_done when not in phase one"; "resolver" => ?self) + } + } } } @@ -343,9 +561,14 @@ impl std::fmt::Debug for TwoPhaseResolver { #[cfg(test)] mod test { + use std::sync::Arc; + + use kvproto::metapb::{Region, RegionEpoch}; + use raftstore::coprocessor::ObserveHandle; use txn_types::TimeStamp; - use super::TwoPhaseResolver; + use super::{SubscriptionTracer, TwoPhaseResolver}; + use crate::subscription_track::RefMut; #[test] fn test_two_phase_resolver() { @@ -372,4 +595,81 @@ mod test { r.untrack_lock(&key[..]); assert_eq!(r.resolve(ts(57)), ts(57)); } + + fn region(id: u64, version: u64, conf_version: u64) -> Region { + let mut r = Region::new(); + let mut e = RegionEpoch::new(); + e.set_version(version); + e.set_conf_ver(conf_version); + r.set_id(id); + r.set_region_epoch(e); + r + } + + #[test] + fn test_delay_remove() { + let subs = SubscriptionTracer::default(); + let handle = ObserveHandle::new(); + subs.register_region(®ion(1, 1, 1), handle, Some(TimeStamp::new(42))); + assert!(subs.get_subscription_of(1).is_some()); + assert!(subs.is_observing(1)); + subs.deregister_region_if(®ion(1, 1, 1), |_, _| true); + assert!(!subs.is_observing(1)); + } + + #[test] + fn test_cal_checkpoint() { + let subs = SubscriptionTracer::default(); + subs.register_region( + ®ion(1, 1, 1), + 
ObserveHandle::new(), + Some(TimeStamp::new(42)), + ); + subs.register_region(®ion(2, 2, 1), ObserveHandle::new(), None); + subs.register_region( + ®ion(3, 4, 1), + ObserveHandle::new(), + Some(TimeStamp::new(88)), + ); + subs.get_subscription_of(3) + .unwrap() + .value_mut() + .resolver + .phase_one_done(); + subs.register_region( + ®ion(4, 8, 1), + ObserveHandle::new(), + Some(TimeStamp::new(92)), + ); + let mut region4_sub = subs.get_subscription_of(4).unwrap(); + region4_sub.value_mut().resolver.phase_one_done(); + region4_sub + .value_mut() + .resolver + .track_lock(TimeStamp::new(128), b"Alpi".to_vec()); + subs.register_region(®ion(5, 8, 1), ObserveHandle::new(), None); + subs.deregister_region_if(®ion(5, 8, 1), |_, _| true); + drop(region4_sub); + + let mut rs = subs + .resolve_with(TimeStamp::new(1000), vec![1, 2, 3, 4]) + .into_iter() + .map(|r| (r.region, r.checkpoint, r.checkpoint_type)) + .collect::>(); + rs.sort_by_key(|k| k.0.get_id()); + use crate::subscription_track::CheckpointType::*; + assert_eq!( + rs, + vec![ + (region(1, 1, 1), 42.into(), StartTsOfInitialScan), + (region(2, 2, 1), 1000.into(), MinTs), + (region(3, 4, 1), 1000.into(), MinTs), + ( + region(4, 8, 1), + 128.into(), + StartTsOfTxn(Some(Arc::from(b"Alpi".as_slice()))) + ), + ] + ); + } } diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index c104a100b56..d94ba59b2d5 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -1,24 +1,53 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+use core::pin::Pin; use std::{ borrow::Borrow, + cell::RefCell, collections::{hash_map::RandomState, BTreeMap, HashMap}, ops::{Bound, RangeBounds}, + path::Path, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, + task::Context, time::Duration, }; +use async_compression::{tokio::write::ZstdEncoder, Level}; +use engine_rocks::ReadPerfInstant; use engine_traits::{CfName, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; -use futures::{channel::mpsc, executor::block_on, StreamExt}; -use kvproto::raft_cmdpb::{CmdType, Request}; +use futures::{channel::mpsc, executor::block_on, ready, task::Poll, FutureExt, StreamExt}; +use kvproto::{ + brpb::CompressionType, + metapb::Region, + raft_cmdpb::{CmdType, Request}, +}; use raft::StateRole; use raftstore::{coprocessor::RegionInfoProvider, RegionInfo}; use tikv::storage::CfStatistics; -use tikv_util::{box_err, time::Instant, warn, worker::Scheduler, Either}; -use tokio::sync::{Mutex, RwLock}; +use tikv_util::{ + box_err, + sys::inspector::{ + self_thread_inspector, IoStat, ThreadInspector, ThreadInspectorImpl as OsInspector, + }, + time::Instant, + warn, + worker::Scheduler, + Either, +}; +use tokio::{ + fs::File, + io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter}, + sync::{oneshot, Mutex, RwLock}, +}; use txn_types::{Key, Lock, LockType}; use crate::{ errors::{Error, Result}, + metadata::store::BoxFuture, + router::TaskSelector, Task, }; @@ -30,8 +59,9 @@ pub fn wrap_key(v: Vec) -> Vec { } /// Transform a str to a [`engine_traits::CfName`]\(`&'static str`). -/// If the argument isn't one of `""`, `"DEFAULT"`, `"default"`, `"WRITE"`, `"write"`, `"LOCK"`, `"lock"`... -/// returns "ERR_CF". (Which would be ignored then.) +/// If the argument isn't one of `""`, `"DEFAULT"`, `"default"`, `"WRITE"`, +/// `"write"`, `"LOCK"`, `"lock"`... returns "ERR_CF". (Which would be ignored +/// then.) 
pub fn cf_name(s: &str) -> CfName { match s { "" | "DEFAULT" | "default" => CF_DEFAULT, @@ -114,7 +144,7 @@ pub struct StopWatch(Instant); impl StopWatch { /// Create a new stopwatch via current time. - pub fn new() -> Self { + pub fn by_now() -> Self { Self(Instant::now_coarse()) } @@ -133,7 +163,8 @@ pub type Slot = Mutex; /// NOTE: Maybe we can use dashmap for replacing the RwLock. pub type SlotMap = RwLock, S>>; -/// Like `..=val`(a.k.a. `RangeToInclusive`), but allows `val` being a reference to DSTs. +/// Like `..=val`(a.k.a. `RangeToInclusive`), but allows `val` being a reference +/// to DSTs. struct RangeToInclusiveRef<'a, T: ?Sized>(&'a T); impl<'a, T: ?Sized> RangeBounds for RangeToInclusiveRef<'a, T> { @@ -175,7 +206,8 @@ pub type SegmentSet = SegmentMap; impl SegmentMap { /// Try to add a element into the segment tree, with default value. - /// (This is useful when using the segment tree as a `Set`, i.e. `SegmentMap`) + /// (This is useful when using the segment tree as a `Set`, i.e. + /// `SegmentMap`) /// /// - If no overlapping, insert the range into the tree and returns `true`. /// - If overlapping detected, do nothing and return `false`. @@ -251,8 +283,8 @@ impl SegmentMap { return Some(overlap_with_start); } // |--s----+-----+----e----| - // Otherwise, the possibility of being overlapping would be there are some sub range - // of the queried range... + // Otherwise, the possibility of being overlapping would be there are some sub + // range of the queried range... // |--s----+----e----+-----| // ...Or the end key is contained by some Range. // For faster query, we merged the two cases together. @@ -270,7 +302,8 @@ impl SegmentMap { covered_by_the_range.map(|(k, v)| (k, &v.range_end, &v.item)) } - /// Check whether the range is overlapping with any range in the segment tree. + /// Check whether the range is overlapping with any range in the segment + /// tree. 
pub fn is_overlapping(&self, range: (&R, &R)) -> bool where K: Borrow, @@ -282,11 +315,15 @@ impl SegmentMap { pub fn get_inner(&mut self) -> &mut BTreeMap> { &mut self.0 } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } } /// transform a [`RaftCmdRequest`] to `(key, value, cf)` triple. -/// once it contains a write request, extract it, and return `Left((key, value, cf))`, -/// otherwise return the request itself via `Right`. +/// once it contains a write request, extract it, and return `Left((key, value, +/// cf))`, otherwise return the request itself via `Right`. pub fn request_to_triple(mut req: Request) -> Either<(Vec, Vec, CfName), Request> { let (key, value, cf) = match req.get_cmd_type() { CmdType::Put => { @@ -303,11 +340,11 @@ pub fn request_to_triple(mut req: Request) -> Either<(Vec, Vec, CfName), } /// `try_send!(s: Scheduler, task: T)` tries to send a task to the scheduler, -/// once meet an error, would report it, with the current file and line (so it is made as a macro). -/// returns whether it success. +/// once meet an error, would report it, with the current file and line (so it +/// is made as a macro). returns whether it success. #[macro_export(crate)] macro_rules! try_send { - ($s: expr, $task: expr) => { + ($s:expr, $task:expr) => { match $s.schedule($task) { Err(err) => { $crate::errors::Error::from(err).report(concat!( @@ -325,9 +362,10 @@ macro_rules! try_send { }; } -/// a hacky macro which allow us enable all debug log via the feature `backup_stream_debug`. -/// because once we enable debug log for all crates, it would soon get too verbose to read. -/// using this macro now we can enable debug log level for the crate only (even compile time...). +/// a hacky macro which allow us enable all debug log via the feature +/// `backup_stream_debug`. because once we enable debug log for all crates, it +/// would soon get too verbose to read. 
using this macro now we can enable debug +/// log level for the crate only (even compile time...). #[macro_export(crate)] macro_rules! debug { ($($t: tt)+) => { @@ -375,14 +413,15 @@ pub fn record_cf_stat(cf_name: &str, stat: &CfStatistics) { ); } -/// a shortcut for handing the result return from `Router::on_events`, when any faliure, send a fatal error to the `doom_messenger`. +/// a shortcut for handing the result return from `Router::on_events`, when any +/// failure, send a fatal error to the `doom_messenger`. pub fn handle_on_event_result(doom_messenger: &Scheduler, result: Vec<(String, Result<()>)>) { for (task, res) in result.into_iter() { if let Err(err) = res { try_send!( doom_messenger, Task::FatalError( - task, + TaskSelector::ByName(task), Box::new(err.context("failed to record event to local temporary files")) ) ); @@ -401,9 +440,509 @@ pub fn should_track_lock(l: &Lock) -> bool { } } +pub struct CallbackWaitGroup { + running: AtomicUsize, + on_finish_all: std::sync::Mutex>>, +} + +impl CallbackWaitGroup { + pub fn new() -> Arc { + Arc::new(Self { + running: AtomicUsize::new(0), + on_finish_all: std::sync::Mutex::default(), + }) + } + + fn work_done(&self) { + let last = self.running.fetch_sub(1, Ordering::SeqCst); + if last == 1 { + self.on_finish_all + .lock() + .unwrap() + .drain(..) + .for_each(|x| x()) + } + } + + /// wait until all running tasks done. + pub fn wait(&self) -> BoxFuture<()> { + // Fast path: no uploading. + if self.running.load(Ordering::SeqCst) == 0 { + return Box::pin(futures::future::ready(())); + } + + let (tx, rx) = oneshot::channel(); + self.on_finish_all.lock().unwrap().push(Box::new(move || { + // The waiter may timed out. + let _ = tx.send(()); + })); + // try to acquire the lock again. + if self.running.load(Ordering::SeqCst) == 0 { + return Box::pin(futures::future::ready(())); + } + Box::pin(rx.map(|_| ())) + } + + /// make a work, as long as the return value held, mark a work in the group + /// is running. 
+ pub fn work(self: Arc) -> Work { + self.running.fetch_add(1, Ordering::SeqCst); + Work(self) + } +} + +pub struct Work(Arc); + +impl Drop for Work { + fn drop(&mut self) { + self.0.work_done(); + } +} + +struct ReadThroughputRecorder { + // The system tool set. + ins: Option, + begin: Option, + // Once the system tool set get unavailable, + // we would use the "ejector" -- RocksDB perf context. + // NOTE: In fact I'm not sure whether we need the result of system level tool set -- + // but this is the current implement of cdc. We'd better keep consistent with them. + ejector: ReadPerfInstant, +} + +impl ReadThroughputRecorder { + fn start() -> Self { + let r = self_thread_inspector().ok().and_then(|insp| { + let stat = insp.io_stat().ok()??; + Some((insp, stat)) + }); + match r { + Some((ins, begin)) => Self { + ins: Some(ins), + begin: Some(begin), + ejector: ReadPerfInstant::new(), + }, + _ => Self { + ins: None, + begin: None, + ejector: ReadPerfInstant::new(), + }, + } + } + + fn try_get_delta_from_unix(&self) -> Option { + let ins = self.ins.as_ref()?; + let begin = self.begin.as_ref()?; + let end = ins.io_stat().ok()??; + let bytes_read = end.read - begin.read; + // FIXME: In our test environment, there may be too many caches hence the + // `bytes_read` is always zero. + // For now, we eject here and let rocksDB prove that we did read something when + // the proc think we don't touch the block device (even in fact we didn't). + // NOTE: In the real-world, we would accept the zero `bytes_read` value since + // the cache did exists. + #[cfg(test)] + if bytes_read == 0 { + // use println here so we can get this message even log doesn't enabled. + println!("ejecting in test since no read recorded in procfs"); + return None; + } + Some(bytes_read) + } + + fn end(self) -> u64 { + self.try_get_delta_from_unix() + .unwrap_or_else(|| self.ejector.delta().block_read_byte) + } +} + +/// try to record read throughput. 
+/// this uses the `proc` fs in the linux for recording the throughput. +/// if that failed, we would use the RocksDB perf context. +pub fn with_record_read_throughput(f: impl FnOnce() -> T) -> (T, u64) { + let recorder = ReadThroughputRecorder::start(); + let r = f(); + (r, recorder.end()) +} + +/// test whether a key is in the range. +/// end key is exclusive. +/// empty end key means infinity. +pub fn is_in_range(key: &[u8], range: (&[u8], &[u8])) -> bool { + match range { + (start, b"") => key >= start, + (start, end) => key >= start && key < end, + } +} + +/// test whether two ranges overlapping. +/// end key is exclusive. +/// empty end key means infinity. +pub fn is_overlapping(range: (&[u8], &[u8]), range2: (&[u8], &[u8])) -> bool { + let (x1, y1) = range; + let (x2, y2) = range2; + match (x1, y1, x2, y2) { + // 1: |__________________| + // 2: |______________________| + (_, b"", _, b"") => true, + // 1: (x1)|__________________| + // 2: |_________________|(y2) + (x1, b"", _, y2) => x1 < y2, + // 1: |________________|(y1) + // 2: (x2)|_________________| + (_, y1, x2, b"") => x2 < y1, + // 1: (x1)|________|(y1) + // 2: (x2)|__________|(y2) + (x1, y1, x2, y2) => x2 < y1 && x1 < y2, + } +} + +/// read files asynchronously in sequence +pub struct FilesReader { + files: Vec, + index: usize, +} + +impl FilesReader { + pub fn new(files: Vec) -> Self { + FilesReader { files, index: 0 } + } +} + +impl AsyncRead for FilesReader { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> Poll> { + let me = self.get_mut(); + + while me.index < me.files.len() { + let rem = buf.remaining(); + ready!(Pin::new(&mut me.files[me.index]).poll_read(cx, buf))?; + if buf.remaining() == rem { + me.index += 1; + } else { + return Poll::Ready(Ok(())); + } + } + + Poll::Ready(Ok(())) + } +} + +/// a wrapper for different compression type +#[async_trait::async_trait] +pub trait CompressionWriter: AsyncWrite + Sync + Send { + /// call the 
`File.sync_all()` to flush immediately to disk. + async fn done(mut self: Pin<&mut Self>) -> Result<()>; +} + +/// a writer dispatcher for different compression type. +/// regard `Compression::Unknown` as uncompressed type +/// to be compatible with v6.2.0. +pub async fn compression_writer_dispatcher( + local_path: impl AsRef, + compression_type: CompressionType, +) -> Result>> { + let inner = BufWriter::with_capacity(128 * 1024, File::create(local_path.as_ref()).await?); + match compression_type { + CompressionType::Unknown => Ok(Box::pin(NoneCompressionWriter::new(inner))), + CompressionType::Zstd => Ok(Box::pin(ZstdCompressionWriter::new(inner))), + _ => Err(Error::Other(box_err!(format!( + "the compression type is unimplemented, compression type id {:?}", + compression_type + )))), + } +} + +/// uncompressed type writer +pub struct NoneCompressionWriter { + inner: BufWriter, +} + +impl NoneCompressionWriter { + pub fn new(inner: BufWriter) -> Self { + NoneCompressionWriter { inner } + } +} + +impl AsyncWrite for NoneCompressionWriter { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + src: &[u8], + ) -> Poll> { + let me = self.get_mut(); + Pin::new(&mut me.inner).poll_write(cx, src) + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.inner).poll_flush(cx) + } + + fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.inner).poll_shutdown(cx) + } +} + +#[async_trait::async_trait] +impl CompressionWriter for NoneCompressionWriter { + async fn done(mut self: Pin<&mut Self>) -> Result<()> { + let bufwriter = &mut self.inner; + bufwriter.flush().await?; + bufwriter.get_ref().sync_all().await?; + Ok(()) + } +} + +/// use zstd compression algorithm +pub struct ZstdCompressionWriter { + inner: ZstdEncoder>, +} + +impl ZstdCompressionWriter { + pub fn new(inner: BufWriter) -> Self { + ZstdCompressionWriter { + inner: ZstdEncoder::with_quality(inner, 
Level::Fastest), + } + } +} + +impl AsyncWrite for ZstdCompressionWriter { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + src: &[u8], + ) -> Poll> { + let me = self.get_mut(); + Pin::new(&mut me.inner).poll_write(cx, src) + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.inner).poll_flush(cx) + } + + fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.inner).poll_shutdown(cx) + } +} + +#[async_trait::async_trait] +impl CompressionWriter for ZstdCompressionWriter { + async fn done(mut self: Pin<&mut Self>) -> Result<()> { + let encoder = &mut self.inner; + encoder.shutdown().await?; + let bufwriter = encoder.get_mut(); + bufwriter.flush().await?; + bufwriter.get_ref().sync_all().await?; + Ok(()) + } +} + +/// make a pair of key range to impl Debug which prints [start_key,$end_key). +pub fn debug_key_range<'ret, 'a: 'ret, 'b: 'ret>( + start: &'a [u8], + end: &'b [u8], +) -> impl std::fmt::Debug + 'ret { + DebugKeyRange::<'a, 'b>(start, end) +} + +struct DebugKeyRange<'start, 'end>(&'start [u8], &'end [u8]); + +impl<'start, 'end> std::fmt::Debug for DebugKeyRange<'start, 'end> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let end_key = if self.1.is_empty() { + Either::Left("inf") + } else { + Either::Right(redact(&self.1)) + }; + let end_key: &dyn std::fmt::Display = match &end_key { + Either::Left(x) => x, + Either::Right(y) => y, + }; + write!(f, "[{},{})", redact(&self.0), end_key) + } +} + +/// make a [`Region`](kvproto::metapb::Region) implements [`slog::KV`], which +/// prints its fields like `[r.id=xxx] [r.ver=xxx] ...` +pub fn slog_region(r: &Region) -> impl slog::KV + '_ { + SlogRegion(r) +} + +/// make a [`Region`](kvproto::metapb::Region) implements +/// [`Debug`](std::fmt::Debug), which prints its essential fields. 
+pub fn debug_region(r: &Region) -> impl std::fmt::Debug + '_ { + DebugRegion(r) +} + +struct DebugRegion<'a>(&'a Region); + +impl<'a> std::fmt::Debug for DebugRegion<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let r = self.0; + f.debug_struct("Region") + .field("id", &r.get_id()) + .field("ver", &r.get_region_epoch().get_version()) + .field("conf_ver", &r.get_region_epoch().get_conf_ver()) + .field( + "range", + &debug_key_range(r.get_start_key(), r.get_end_key()), + ) + .field( + "peers", + &debug_iter(r.get_peers().iter().map(|p| p.store_id)), + ) + .finish() + } +} + +struct SlogRegion<'a>(&'a Region); + +impl<'a> slog::KV for SlogRegion<'a> { + fn serialize( + &self, + _record: &slog::Record<'_>, + serializer: &mut dyn slog::Serializer, + ) -> slog::Result { + let r = self.0; + serializer.emit_u64("r.id", r.get_id())?; + serializer.emit_u64("r.ver", r.get_region_epoch().get_version())?; + serializer.emit_u64("r.conf_ver", r.get_region_epoch().get_conf_ver())?; + serializer.emit_arguments( + "r.range", + &format_args!("{:?}", debug_key_range(r.get_start_key(), r.get_end_key())), + )?; + serializer.emit_arguments( + "r.peers", + &format_args!("{:?}", debug_iter(r.get_peers().iter().map(|p| p.store_id))), + )?; + Ok(()) + } +} + +/// A shortcut for making an opaque future type for return type or argument +/// type, which is sendable and not borrowing any variables. +/// +/// `future![T]` == `impl Future + Send + 'static` +#[macro_export] +macro_rules! 
future { + ($t:ty) => { impl core::future::Future + Send + 'static }; +} + +pub fn debug_iter(t: impl Iterator) -> impl std::fmt::Debug { + DebugIter(RefCell::new(t)) +} + +struct DebugIter>(RefCell); + +impl> std::fmt::Debug for DebugIter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let mut is_first = true; + while let Some(x) = self.0.borrow_mut().next() { + if !is_first { + write!(f, ",{:?}", x)?; + } else { + write!(f, "{:?}", x)?; + is_first = false; + } + } + Ok(()) + } +} + #[cfg(test)] mod test { - use crate::utils::SegmentMap; + use std::{ + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, + time::Duration, + }; + + use engine_traits::WriteOptions; + use futures::executor::block_on; + use kvproto::metapb::{Region, RegionEpoch}; + use tokio::io::{AsyncWriteExt, BufReader}; + + use crate::utils::{is_in_range, CallbackWaitGroup, SegmentMap}; + + #[test] + fn test_redact() { + log_wrappers::set_redact_info_log(true); + let mut region = Region::default(); + region.set_id(42); + region.set_start_key(b"TiDB".to_vec()); + region.set_end_key(b"TiDC".to_vec()); + region.set_region_epoch({ + let mut r = RegionEpoch::default(); + r.set_version(108); + r.set_conf_ver(352); + r + }); + + // Can we make a better way to test this? 
+ assert_eq!( + "Region { id: 42, ver: 108, conf_ver: 352, range: [?,?), peers: }", + format!("{:?}", super::debug_region(®ion)) + ); + + let range = super::debug_key_range(b"alpha", b"omega"); + assert_eq!("[?,?)", format!("{:?}", range)); + } + + #[test] + fn test_range_functions() { + #[derive(Debug)] + struct InRangeCase<'a> { + key: &'a [u8], + range: (&'a [u8], &'a [u8]), + expected: bool, + } + + let cases = [ + InRangeCase { + key: b"0001", + range: (b"0000", b"0002"), + expected: true, + }, + InRangeCase { + key: b"0003", + range: (b"0000", b"0002"), + expected: false, + }, + InRangeCase { + key: b"0002", + range: (b"0000", b"0002"), + expected: false, + }, + InRangeCase { + key: b"0000", + range: (b"0000", b"0002"), + expected: true, + }, + InRangeCase { + key: b"0018", + range: (b"0000", b""), + expected: true, + }, + InRangeCase { + key: b"0018", + range: (b"0019", b""), + expected: false, + }, + ]; + + for case in cases { + assert!( + is_in_range(case.key, case.range) == case.expected, + "case = {:?}", + case + ); + } + } #[test] fn test_segment_tree() { @@ -427,4 +966,193 @@ mod test { assert!(tree.is_overlapping((&2, &10))); assert!(tree.is_overlapping((&0, &9999999))); } + + #[test] + fn test_wait_group() { + #[derive(Debug)] + struct Case { + bg_task: usize, + repeat: usize, + } + + fn run_case(c: Case) { + for i in 0..c.repeat { + let wg = CallbackWaitGroup::new(); + let cnt = Arc::new(AtomicUsize::new(c.bg_task)); + for _ in 0..c.bg_task { + let cnt = cnt.clone(); + let work = wg.clone().work(); + tokio::spawn(async move { + cnt.fetch_sub(1, Ordering::SeqCst); + drop(work); + }); + } + block_on(tokio::time::timeout(Duration::from_secs(20), wg.wait())).unwrap(); + assert_eq!(cnt.load(Ordering::SeqCst), 0, "{:?}@{}", c, i); + } + } + + let cases = [ + Case { + bg_task: 200000, + repeat: 1, + }, + Case { + bg_task: 65535, + repeat: 1, + }, + Case { + bg_task: 512, + repeat: 1, + }, + Case { + bg_task: 2, + repeat: 100000, + }, + Case { + bg_task: 1, 
+ repeat: 100000, + }, + Case { + bg_task: 0, + repeat: 1, + }, + ]; + + let pool = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_time() + .build() + .unwrap(); + let _guard = pool.handle().enter(); + for case in cases { + run_case(case) + } + } + + #[test] + fn test_recorder() { + use engine_traits::{Iterable, KvEngine, Mutable, WriteBatch, WriteBatchExt, CF_DEFAULT}; + use tempdir::TempDir; + + let p = TempDir::new("test_db").unwrap(); + let engine = + engine_rocks::util::new_engine(p.path().to_str().unwrap(), &[CF_DEFAULT]).unwrap(); + let mut wb = engine.write_batch(); + for i in 0..100 { + wb.put_cf(CF_DEFAULT, format!("hello{}", i).as_bytes(), b"world") + .unwrap(); + } + let mut wopt = WriteOptions::new(); + wopt.set_sync(true); + wb.write_opt(&wopt).unwrap(); + // force memtable to disk. + engine.get_sync_db().compact_range(None, None); + + let (items, size) = super::with_record_read_throughput(|| { + let mut items = vec![]; + let snap = engine.snapshot(); + snap.scan(CF_DEFAULT, b"", b"", false, |k, v| { + items.push((k.to_owned(), v.to_owned())); + Ok(true) + }) + .unwrap(); + items + }); + + let items_size = items.iter().map(|(k, v)| k.len() + v.len()).sum::() as u64; + + // considering the compression, we may get at least 1/2 of the real size. + assert!( + size > items_size / 2, + "the size recorded is too small: {} vs {}", + size, + items_size + ); + // considering the read amplification, we may get at most 2x of the real size. 
+ assert!( + size < items_size * 2, + "the size recorded is too big: {} vs {}", + size, + items_size + ); + } + + #[tokio::test] + async fn test_files_reader() { + use tempdir::TempDir; + use tokio::{fs::File, io::AsyncReadExt}; + + use super::FilesReader; + + let dir = TempDir::new("test_files").unwrap(); + let files_num = 5; + let mut files_path = Vec::new(); + let mut expect_content = String::new(); + for i in 0..files_num { + let path = dir.path().join(format!("f{}", i)); + let mut file = File::create(&path).await.unwrap(); + let content = format!("{i}_{i}_{i}_{i}_{i}\n{i}{i}{i}{i}\n").repeat(10); + file.write_all(content.as_bytes()).await.unwrap(); + file.sync_all().await.unwrap(); + + files_path.push(path); + expect_content.push_str(&content); + } + + let mut files = Vec::new(); + for i in 0..files_num { + let file = File::open(&files_path[i]).await.unwrap(); + files.push(file); + } + + let mut files_reader = FilesReader::new(files); + let mut read_content = String::new(); + files_reader + .read_to_string(&mut read_content) + .await + .unwrap(); + assert_eq!(expect_content, read_content); + } + + #[tokio::test] + async fn test_compression_writer() { + use kvproto::brpb::CompressionType; + use tempdir::TempDir; + use tokio::{fs::File, io::AsyncReadExt}; + + use super::compression_writer_dispatcher; + + let dir = TempDir::new("test_files").unwrap(); + let content = "test for compression writer. 
try to write to local path, and read it back."; + + // uncompressed writer + let path1 = dir.path().join("f1"); + let mut writer = compression_writer_dispatcher(path1.clone(), CompressionType::Unknown) + .await + .unwrap(); + writer.write_all(content.as_bytes()).await.unwrap(); + writer.as_mut().done().await.unwrap(); + + let mut reader = BufReader::new(File::open(path1).await.unwrap()); + let mut read_content = String::new(); + reader.read_to_string(&mut read_content).await.unwrap(); + assert_eq!(content, read_content); + + // zstd compressed writer + let path2 = dir.path().join("f2"); + let mut writer = compression_writer_dispatcher(path2.clone(), CompressionType::Zstd) + .await + .unwrap(); + writer.write_all(content.as_bytes()).await.unwrap(); + writer.as_mut().done().await.unwrap(); + + use async_compression::tokio::bufread::ZstdDecoder; + let mut reader = ZstdDecoder::new(BufReader::new(File::open(path2).await.unwrap())); + let mut read_content = String::new(); + reader.read_to_string(&mut read_content).await.unwrap(); + + println!("1{}2,{}", read_content, read_content.len()); + assert_eq!(content, read_content); + } } diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index 064b954d7bf..9dc38e36320 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -9,20 +9,32 @@ use std::{ time::Duration, }; +use async_compression::futures::write::ZstdDecoder; use backup_stream::{ - metadata::{store::SlashEtcStore, MetadataClient, StreamTask}, + errors::Result, + metadata::{ + keys::{KeyValue, MetaKey}, + store::{MetaStore, SlashEtcStore}, + MetadataClient, StreamTask, + }, observer::BackupStreamObserver, router::Router, - Endpoint, Task, + utils, BackupStreamResolver, Endpoint, GetCheckpointResult, RegionCheckpointOperation, + RegionSet, Service, Task, }; -use futures::{executor::block_on, Future}; -use grpcio::ChannelBuilder; +use futures::{executor::block_on, AsyncWriteExt, Future, 
Stream, StreamExt}; +use grpcio::{ChannelBuilder, Server, ServerBuilder}; use kvproto::{ - brpb::{Local, StorageBackend}, + brpb::{CompressionType, Local, Metadata, StorageBackend}, kvrpcpb::*, + logbackuppb::{SubscribeFlushEventRequest, SubscribeFlushEventResponse}, + logbackuppb_grpc::{create_log_backup, LogBackupClient}, tikvpb::*, }; use pd_client::PdClient; +use protobuf::parse_from_bytes; +use raftstore::router::CdcRaftRouter; +use resolved_ts::LeadershipResolver; use tempdir::TempDir; use test_raftstore::{new_server_cluster, Cluster, ServerCluster}; use test_util::retry; @@ -40,8 +52,12 @@ use txn_types::{Key, TimeStamp, WriteRef}; use walkdir::WalkDir; fn mutation(k: Vec, v: Vec) -> Mutation { + mutation_op(k, v, Op::Put) +} + +fn mutation_op(k: Vec, v: Vec, op: Op) -> Mutation { let mut mutation = Mutation::default(); - mutation.set_op(Op::Put); + mutation.set_op(op); mutation.key = k; mutation.value = v; mutation @@ -78,13 +94,148 @@ fn make_encoded_record_key(table_id: i64, handle: u64, ts: u64) -> Vec { key.append_ts(TimeStamp::new(ts)).into_encoded() } +#[derive(Clone)] +struct ErrorStore { + inner: S, + + error_provider: Arc Result<()> + Send + Sync>, +} + +pub struct SuiteBuilder { + name: String, + nodes: usize, + metastore_error: Box Result<()> + Send + Sync>, + cfg: Box, +} + +impl SuiteBuilder { + pub fn new_named(s: &str) -> Self { + Self { + name: s.to_owned(), + nodes: 4, + metastore_error: Box::new(|_| Ok(())), + cfg: Box::new(|cfg| { + cfg.enable = true; + }), + } + } + + pub fn nodes(mut self, n: usize) -> Self { + self.nodes = n; + self + } + + pub fn inject_meta_store_error(mut self, f: F) -> Self + where + F: Fn(&str) -> Result<()> + Send + Sync + 'static, + { + self.metastore_error = Box::new(f); + self + } + + pub fn cfg(mut self, f: impl FnOnce(&mut BackupStreamConfig) + 'static) -> Self { + let old_f = self.cfg; + self.cfg = Box::new(move |cfg| { + old_f(cfg); + f(cfg); + }); + self + } + + pub fn build(self) -> Suite { + let Self { 
+ name: case, + nodes: n, + metastore_error, + cfg: cfg_f, + } = self; + + info!("start test"; "case" => %case, "nodes" => %n); + let cluster = new_server_cluster(42, n); + let mut suite = Suite { + endpoints: Default::default(), + meta_store: ErrorStore { + inner: Default::default(), + + error_provider: Arc::from(metastore_error), + }, + obs: Default::default(), + tikv_cli: Default::default(), + log_backup_cli: Default::default(), + servers: Default::default(), + env: Arc::new(grpcio::Environment::new(1)), + cluster, + + temp_files: TempDir::new("temp").unwrap(), + flushed_files: TempDir::new("flush").unwrap(), + case_name: case, + }; + for id in 1..=(n as u64) { + let worker = suite.start_br_stream_on(id); + suite.endpoints.insert(id, worker); + } + suite.cluster.run(); + let mut cfg = BackupStreamConfig::default(); + cfg_f(&mut cfg); + for id in 1..=(n as u64) { + suite.start_endpoint(id, cfg.clone()); + let cli = suite.start_log_backup_client_on(id); + suite.log_backup_cli.insert(id, cli); + } + // We must wait until the endpoints get ready to watching the metastore, or some + // modifies may be lost. Either make Endpoint::with_client wait until watch did + // start or make slash_etc support multi-version, then we can get rid of this + // sleep. 
+ std::thread::sleep(Duration::from_secs(1)); + suite + } +} + +#[async_trait::async_trait] +impl MetaStore for ErrorStore { + type Snap = S::Snap; + + async fn snapshot(&self) -> backup_stream::errors::Result { + (self.error_provider)("snapshot")?; + self.inner.snapshot().await + } + + async fn watch( + &self, + keys: backup_stream::metadata::store::Keys, + start_rev: i64, + ) -> backup_stream::errors::Result { + (self.error_provider)("watch")?; + self.inner.watch(keys, start_rev).await + } + + async fn txn( + &self, + txn: backup_stream::metadata::store::Transaction, + ) -> backup_stream::errors::Result<()> { + (self.error_provider)("txn")?; + self.inner.txn(txn).await + } + + async fn txn_cond( + &self, + txn: backup_stream::metadata::store::CondTransaction, + ) -> backup_stream::errors::Result<()> { + (self.error_provider)("txn_cond")?; + self.inner.txn_cond(txn).await + } +} + pub struct Suite { endpoints: HashMap>, - meta_store: SlashEtcStore, + meta_store: ErrorStore, cluster: Cluster, tikv_cli: HashMap, + log_backup_cli: HashMap, obs: HashMap, env: Arc, + // The place to make services live as long as suite. + servers: Vec, temp_files: TempDir, flushed_files: TempDir, @@ -103,6 +254,7 @@ impl Suite { storage.set_local(local); task.info.set_storage(storage); task.info.set_table_filter(vec!["*.*".to_owned()].into()); + task.info.set_compression_type(CompressionType::Zstd); task } @@ -123,17 +275,87 @@ impl Suite { worker } - fn start_endpoint(&mut self, id: u64) { + /// create a subscription stream. this has simply asserted no error, because + /// in theory observing flushing should not emit error. change that if + /// needed. 
+ fn flush_stream( + &self, + panic_while_fail: bool, + ) -> impl Stream { + let streams = self + .log_backup_cli + .iter() + .map(|(id, cli)| { + let stream = cli + .subscribe_flush_event(&{ + let mut r = SubscribeFlushEventRequest::default(); + r.set_client_id(format!("test-{}", id)); + r + }) + .unwrap_or_else(|err| panic!("failed to subscribe on {} because {}", id, err)); + let id = *id; + stream.filter_map(move |x| { + futures::future::ready(match x { + Ok(x) => Some((id, x)), + Err(err) => { + if panic_while_fail { + panic!("failed to rec from {} because {}", id, err) + } else { + println!("[WARN] failed to rec from {} because {}", id, err); + None + } + } + }) + }) + }) + .collect::>(); + + futures::stream::select_all(streams) + } + + fn start_log_backup_client_on(&mut self, id: u64) -> LogBackupClient { + let endpoint = self + .endpoints + .get(&id) + .expect("must register endpoint first"); + + let serv = Service::new(endpoint.scheduler()); + let builder = + ServerBuilder::new(self.env.clone()).register_service(create_log_backup(serv)); + let mut server = builder.bind("127.0.0.1", 0).build().unwrap(); + server.start(); + let (_, port) = server.bind_addrs().next().unwrap(); + let addr = format!("127.0.0.1:{}", port); + let channel = ChannelBuilder::new(self.env.clone()).connect(&addr); + println!("connecting channel to {} for store {}", addr, id); + let client = LogBackupClient::new(channel); + self.servers.push(server); + client + } + + fn start_endpoint(&mut self, id: u64, mut cfg: BackupStreamConfig) { let cluster = &mut self.cluster; let worker = self.endpoints.get_mut(&id).unwrap(); let sim = cluster.sim.wl(); let raft_router = sim.get_server_router(id); + let raft_router = CdcRaftRouter(raft_router); let cm = sim.get_concurrency_manager(id); let regions = sim.region_info_accessors.get(&id).unwrap().clone(); - let mut cfg = BackupStreamConfig::default(); + let ob = self.obs.get(&id).unwrap().clone(); cfg.enable = true; cfg.temp_path = format!("/{}/{}", 
self.temp_files.path().display(), id); - let ob = self.obs.get(&id).unwrap().clone(); + let resolver = LeadershipResolver::new( + id, + cluster.pd_client.clone(), + Arc::clone(&self.env), + Arc::clone(&sim.security_mgr), + cluster.store_metas[&id] + .lock() + .unwrap() + .region_read_progress + .clone(), + Duration::from_secs(60), + ); let endpoint = Endpoint::new( id, self.meta_store.clone(), @@ -144,41 +366,12 @@ impl Suite { raft_router, cluster.pd_client.clone(), cm, + BackupStreamResolver::V1(resolver), ); worker.start(endpoint); } - pub fn new(case: &str, n: usize) -> Self { - let cluster = new_server_cluster(42, n); - let mut suite = Self { - endpoints: Default::default(), - meta_store: Default::default(), - obs: Default::default(), - tikv_cli: Default::default(), - env: Arc::new(grpcio::Environment::new(1)), - cluster, - - temp_files: TempDir::new("temp").unwrap(), - flushed_files: TempDir::new("flush").unwrap(), - case_name: case.to_owned(), - }; - for id in 1..=(n as u64) { - let worker = suite.start_br_stream_on(id); - suite.endpoints.insert(id, worker); - } - suite.cluster.run(); - for id in 1..=(n as u64) { - suite.start_endpoint(id); - } - // TODO: The current mock metastore (slash_etc) doesn't supports multi-version. - // We must wait until the endpoints get ready to watching the metastore, or some modifies may be lost. - // Either make Endpoint::with_client wait until watch did start or make slash_etc support multi-version, - // then we can get rid of this sleep. - std::thread::sleep(Duration::from_secs(1)); - suite - } - - fn get_meta_cli(&self) -> MetadataClient { + fn get_meta_cli(&self) -> MetadataClient> { MetadataClient::new(self.meta_store.clone(), 0) } @@ -201,6 +394,47 @@ impl Suite { self.wait_with(move |r| block_on(r.get_task_info(&name)).is_ok()) } + /// This function tries to calculate the global checkpoint from the flush + /// status of nodes. 
+ /// + /// NOTE: this won't check the region consistency for now, the checkpoint + /// may be weaker than expected. + fn global_checkpoint(&self) -> u64 { + let (tx, rx) = std::sync::mpsc::channel(); + self.run(|| { + let tx = tx.clone(); + Task::RegionCheckpointsOp(RegionCheckpointOperation::Get( + RegionSet::Universal, + Box::new(move |rs| rs.into_iter().for_each(|x| tx.send(x).unwrap())), + )) + }); + drop(tx); + + rx.into_iter() + .map(|r| match r { + GetCheckpointResult::Ok { checkpoint, region } => { + info!("getting checkpoint"; "checkpoint" => %checkpoint, utils::slog_region(®ion)); + checkpoint.into_inner() + } + GetCheckpointResult::NotFound { .. } + | GetCheckpointResult::EpochNotMatch { .. } => { + unreachable!() + } + }) + .min() + .unwrap_or(0) + } + + async fn advance_global_checkpoint(&self, task: &str) -> Result<()> { + let cp = self.global_checkpoint(); + self.meta_store + .set(KeyValue( + MetaKey::central_global_checkpoint_of(task), + cp.to_be_bytes().to_vec(), + )) + .await + } + async fn write_records(&mut self, from: usize, n: usize, for_table: i64) -> HashSet> { let mut inserted = HashSet::default(); for ts in (from..(from + n)).map(|x| x * 2) { @@ -222,6 +456,19 @@ impl Suite { inserted } + fn commit_keys(&mut self, keys: Vec>, start_ts: TimeStamp, commit_ts: TimeStamp) { + let mut region_keys = HashMap::>>::new(); + for k in keys { + let enc_key = Key::from_raw(&k).into_encoded(); + let region = self.cluster.get_region_id(&enc_key); + region_keys.entry(region).or_default().push(k); + } + + for (region, keys) in region_keys { + self.must_kv_commit(region, keys, start_ts, commit_ts); + } + } + fn just_commit_a_key(&mut self, key: Vec, start_ts: TimeStamp, commit_ts: TimeStamp) { let enc_key = Key::from_raw(&key).into_encoded(); let region = self.cluster.get_region_id(&enc_key); @@ -239,7 +486,9 @@ impl Suite { } fn force_flush_files(&self, task: &str) { - self.run(|| Task::ForceFlush(task.to_owned())) + // TODO: use the callback to make the 
test more stable. + self.run(|| Task::ForceFlush(task.to_owned())); + self.sync(); } fn run(&self, mut t: impl FnMut() -> Task) { @@ -248,7 +497,42 @@ impl Suite { } } - fn check_for_write_records<'a>( + fn load_metadata_for_write_records(&self, path: &Path) -> HashMap> { + let mut meta_map: HashMap> = HashMap::new(); + for entry in WalkDir::new(path) { + let entry = entry.unwrap(); + if entry.file_type().is_file() + && entry + .file_name() + .to_str() + .map_or(false, |s| s.ends_with(".meta")) + { + let content = std::fs::read(entry.path()).unwrap(); + let meta = parse_from_bytes::(content.as_ref()).unwrap(); + for g in meta.file_groups.into_iter() { + let path = g.path.split('/').last().unwrap(); + for f in g.data_files_info.into_iter() { + let file_info = meta_map.get_mut(path); + if let Some(v) = file_info { + v.push(( + f.range_offset as usize, + (f.range_offset + f.range_length) as usize, + )); + } else { + let v = vec![( + f.range_offset as usize, + (f.range_offset + f.range_length) as usize, + )]; + meta_map.insert(String::from(path), v); + } + } + } + } + } + meta_map + } + + async fn check_for_write_records<'a>( &self, path: &Path, key_set: impl std::iter::Iterator, @@ -257,6 +541,7 @@ impl Suite { let n = remain_keys.len(); let mut extra_key = 0; let mut extra_len = 0; + let meta_map = self.load_metadata_for_write_records(path); for entry in WalkDir::new(path) { let entry = entry.unwrap(); println!("checking: {:?}", entry); @@ -266,21 +551,31 @@ impl Suite { .to_str() .map_or(false, |s| s.ends_with(".log")) { - let content = std::fs::read(entry.path()).unwrap(); - let mut iter = EventIterator::new(content); - loop { - if !iter.valid() { - break; - } - iter.next().unwrap(); - if !remain_keys.remove(iter.key()) { - extra_key += 1; - extra_len += iter.key().len() + iter.value().len(); - } + let buf = std::fs::read(entry.path()).unwrap(); + let file_infos = meta_map.get(entry.file_name().to_str().unwrap()).unwrap(); + for &file_info in file_infos { + let mut 
decoder = ZstdDecoder::new(Vec::new()); + let pbuf: &[u8] = &buf[file_info.0..file_info.1]; + decoder.write_all(pbuf).await.unwrap(); + decoder.flush().await.unwrap(); + decoder.close().await.unwrap(); + let content = decoder.into_inner(); + + let mut iter = EventIterator::new(&content); + loop { + if !iter.valid() { + break; + } + iter.next().unwrap(); + if !remain_keys.remove(iter.key()) { + extra_key += 1; + extra_len += iter.key().len() + iter.value().len(); + } - let value = iter.value(); - let wf = WriteRef::parse(value).unwrap(); - assert_eq!(wf.short_value, Some(b"hello, world" as &[u8])); + let value = iter.value(); + let wf = WriteRef::parse(value).unwrap(); + assert_eq!(wf.short_value, Some(b"hello, world" as &[u8])); + } } } } @@ -310,6 +605,36 @@ impl Suite { // Copy & Paste from cdc::tests::TestSuite, maybe make it a mixin? impl Suite { + pub fn tso(&self) -> TimeStamp { + run_async_test(self.cluster.pd_client.get_tso()).unwrap() + } + + pub fn must_kv_pessimistic_lock( + &mut self, + region_id: u64, + keys: Vec>, + ts: TimeStamp, + pk: Vec, + ) { + let mut lock_req = PessimisticLockRequest::new(); + lock_req.set_context(self.get_context(region_id)); + let mut mutations = vec![]; + for key in keys { + mutations.push(mutation_op(key, vec![], Op::PessimisticLock)); + } + lock_req.set_mutations(mutations.into()); + lock_req.primary_lock = pk; + lock_req.start_version = ts.into_inner(); + lock_req.lock_ttl = ts.into_inner() + 1; + let resp = self + .get_tikv_client(region_id) + .kv_pessimistic_lock(&lock_req) + .unwrap(); + + assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); + assert!(resp.errors.is_empty(), "{:?}", resp.get_errors()); + } + pub fn must_kv_prewrite( &mut self, region_id: u64, @@ -452,6 +777,10 @@ impl Suite { pub fn wait_for_flush(&self) { use std::ffi::OsString; + std::fs::File::open(&self.temp_files) + .unwrap() + .sync_all() + .unwrap(); for _ in 0..100 { if !walkdir::WalkDir::new(&self.temp_files) .into_iter() @@ 
-478,7 +807,7 @@ impl Suite { let leader = self.cluster.leader_of_region(region_id); for peer in region.get_peers() { if leader.as_ref().map(|p| p.id != peer.id).unwrap_or(true) { - self.cluster.transfer_leader(region_id, peer.clone()); + self.cluster.must_transfer_leader(region_id, peer.clone()); self.cluster.reset_leader_of_region(region_id); return; } @@ -497,18 +826,32 @@ fn run_async_test(test: impl Future) -> T { #[cfg(test)] mod test { - use std::time::Duration; + use std::time::{Duration, Instant}; - use backup_stream::{errors::Error, metadata::MetadataClient, Task}; + use backup_stream::{ + errors::Error, + metadata::{ + keys::MetaKey, + store::{Keys, MetaStore}, + }, + router::TaskSelector, + GetCheckpointResult, RegionCheckpointOperation, RegionSet, Task, + }; + use futures::{executor::block_on, Stream, StreamExt}; + use pd_client::PdClient; + use test_raftstore::IsolationFilterFactory; use tikv_util::{box_err, defer, info, HandyRwLock}; - use txn_types::TimeStamp; + use tokio::time::timeout; + use txn_types::{Key, TimeStamp}; - use crate::{make_record_key, make_split_key_at_record, run_async_test}; + use crate::{ + make_record_key, make_split_key_at_record, mutation, run_async_test, SuiteBuilder, + }; #[test] fn basic() { - // test_util::init_log_for_test(); - let mut suite = super::Suite::new("basic", 4); + let mut suite = super::SuiteBuilder::new_named("basic").build(); + fail::cfg("try_start_observe", "1*return").unwrap(); run_async_test(async { // write data before the task starting, for testing incremental scanning. 
@@ -518,18 +861,19 @@ mod test { let round2 = suite.write_records(256, 128, 1).await; suite.force_flush_files("test_basic"); suite.wait_for_flush(); - suite.check_for_write_records( - suite.flushed_files.path(), - round1.union(&round2).map(Vec::as_slice), - ); + suite + .check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(Vec::as_slice), + ) + .await; }); suite.cluster.shutdown(); } #[test] fn with_split() { - // test_util::init_log_for_test(); - let mut suite = super::Suite::new("with_split", 4); + let mut suite = super::SuiteBuilder::new_named("with_split").build(); run_async_test(async { let round1 = suite.write_records(0, 128, 1).await; suite.must_split(&make_split_key_at_record(1, 42)); @@ -537,19 +881,112 @@ mod test { let round2 = suite.write_records(256, 128, 1).await; suite.force_flush_files("test_with_split"); suite.wait_for_flush(); - suite.check_for_write_records( - suite.flushed_files.path(), - round1.union(&round2).map(Vec::as_slice), + suite + .check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(Vec::as_slice), + ) + .await; + }); + suite.cluster.shutdown(); + } + + /// This test tests whether we can handle some weird transactions and their + /// race with initial scanning. + /// Generally, those transactions: + /// - Has N mutations, which's values are all short enough to be inlined in + /// the `Write` CF. (N > 1024) + /// - Commit the mutation set M first. (for all m in M: Nth-Of-Key(m) > + /// 1024) + /// ```text + /// |--...-----^------*---*-*--*-*-*-> (The line is the Key Space - from "" to inf) + /// +The 1024th key (* = committed mutation) + /// ``` + /// - Before committing remaining mutations, PiTR triggered initial + /// scanning. + /// - The remaining mutations are committed before the instant when initial + /// scanning get the snapshot. 
+ #[test] + fn with_split_txn() { + let mut suite = super::SuiteBuilder::new_named("split_txn").build(); + run_async_test(async { + let start_ts = suite.cluster.pd_client.get_tso().await.unwrap(); + let keys = (1..1960).map(|i| make_record_key(1, i)).collect::>(); + suite.must_kv_prewrite( + 1, + keys.clone() + .into_iter() + .map(|k| mutation(k, b"hello, world".to_vec())) + .collect(), + make_record_key(1, 1913), + start_ts, ); + let commit_ts = suite.cluster.pd_client.get_tso().await.unwrap(); + suite.commit_keys(keys[1913..].to_vec(), start_ts, commit_ts); + suite.must_register_task(1, "test_split_txn"); + suite.commit_keys(keys[..1913].to_vec(), start_ts, commit_ts); + suite.force_flush_files("test_split_txn"); + suite.wait_for_flush(); + let keys_encoded = keys + .iter() + .map(|v| { + Key::from_raw(v.as_slice()) + .append_ts(commit_ts) + .into_encoded() + }) + .collect::>(); + suite + .check_for_write_records( + suite.flushed_files.path(), + keys_encoded.iter().map(Vec::as_slice), + ) + .await; }); suite.cluster.shutdown(); } + #[test] + fn frequent_initial_scan() { + let mut suite = super::SuiteBuilder::new_named("frequent_initial_scan") + .cfg(|c| c.num_threads = 1) + .build(); + let keys = (1..1024).map(|i| make_record_key(1, i)).collect::>(); + let start_ts = suite.tso(); + suite.must_kv_prewrite( + 1, + keys.clone() + .into_iter() + .map(|k| mutation(k, b"hello, world".to_vec())) + .collect(), + make_record_key(1, 886), + start_ts, + ); + fail::cfg("scan_after_get_snapshot", "pause").unwrap(); + suite.must_register_task(1, "frequent_initial_scan"); + let commit_ts = suite.tso(); + suite.commit_keys(keys, start_ts, commit_ts); + suite.run(|| { + Task::ModifyObserve(backup_stream::ObserveOp::Stop { + region: suite.cluster.get_region(&make_record_key(1, 886)), + }) + }); + suite.run(|| { + Task::ModifyObserve(backup_stream::ObserveOp::Start { + region: suite.cluster.get_region(&make_record_key(1, 886)), + }) + }); + fail::cfg("scan_after_get_snapshot", 
"off").unwrap(); + suite.force_flush_files("frequent_initial_scan"); + suite.wait_for_flush(); + std::thread::sleep(Duration::from_secs(1)); + let c = suite.global_checkpoint(); + assert!(c > commit_ts.into_inner(), "{} vs {}", c, commit_ts); + } + #[test] /// This case tests whether the backup can continue when the leader failes. fn leader_down() { - // test_util::init_log_for_test(); - let mut suite = super::Suite::new("leader_down", 4); + let mut suite = super::SuiteBuilder::new_named("leader_down").build(); suite.must_register_task(1, "test_leader_down"); suite.sync(); let round1 = run_async_test(suite.write_records(0, 128, 1)); @@ -558,19 +995,20 @@ mod test { let round2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("test_leader_down"); suite.wait_for_flush(); - suite.check_for_write_records( + run_async_test(suite.check_for_write_records( suite.flushed_files.path(), round1.union(&round2).map(Vec::as_slice), - ); + )); suite.cluster.shutdown(); } #[test] - /// This case tests whehter the checkpoint ts (next backup ts) can be advanced correctly - /// when async commit is enabled. + /// This case tests whether the checkpoint ts (next backup ts) can be + /// advanced correctly when async commit is enabled. 
fn async_commit() { - // test_util::init_log_for_test(); - let mut suite = super::Suite::new("async_commit", 3); + let mut suite = super::SuiteBuilder::new_named("async_commit") + .nodes(3) + .build(); run_async_test(async { suite.must_register_task(1, "test_async_commit"); suite.sync(); @@ -579,20 +1017,11 @@ mod test { suite.write_records(258, 128, 1).await; suite.force_flush_files("test_async_commit"); std::thread::sleep(Duration::from_secs(4)); - let cli = MetadataClient::new(suite.meta_store.clone(), 1); - assert_eq!( - cli.global_progress_of_task("test_async_commit") - .await - .unwrap(), - 256 - ); + assert_eq!(suite.global_checkpoint(), 256); suite.just_commit_a_key(make_record_key(1, 256), TimeStamp::new(256), ts); suite.force_flush_files("test_async_commit"); suite.wait_for_flush(); - let cp = cli - .global_progress_of_task("test_async_commit") - .await - .unwrap(); + let cp = suite.global_checkpoint(); assert!(cp > 256, "it is {:?}", cp); }); suite.cluster.shutdown(); @@ -600,99 +1029,469 @@ mod test { #[test] fn fatal_error() { - // test_util::init_log_for_test(); - let mut suite = super::Suite::new("fatal_error", 3); + let mut suite = super::SuiteBuilder::new_named("fatal_error") + .nodes(3) + .build(); suite.must_register_task(1, "test_fatal_error"); suite.sync(); run_async_test(suite.write_records(0, 1, 1)); suite.force_flush_files("test_fatal_error"); suite.wait_for_flush(); + run_async_test(suite.advance_global_checkpoint("test_fatal_error")).unwrap(); let (victim, endpoint) = suite.endpoints.iter().next().unwrap(); endpoint .scheduler() .schedule(Task::FatalError( - "test_fatal_error".to_owned(), + TaskSelector::ByName("test_fatal_error".to_owned()), Box::new(Error::Other(box_err!("everything is alright"))), )) .unwrap(); - let meta_cli = suite.get_meta_cli(); suite.sync(); - let err = run_async_test(meta_cli.get_last_error("test_fatal_error", *victim)) - .unwrap() - .unwrap(); + let err = run_async_test( + suite + .get_meta_cli() + 
.get_last_error("test_fatal_error", *victim), + ) + .unwrap() + .unwrap(); info!("err"; "err" => ?err); assert_eq!(err.error_code, error_code::backup_stream::OTHER.code); assert!(err.error_message.contains("everything is alright")); assert_eq!(err.store_id, *victim); - let paused = run_async_test(meta_cli.check_task_paused("test_fatal_error")).unwrap(); + let paused = + run_async_test(suite.get_meta_cli().check_task_paused("test_fatal_error")).unwrap(); assert!(paused); let safepoints = suite.cluster.pd_client.gc_safepoints.rl(); - let checkpoint = run_async_test( - suite - .get_meta_cli() - .global_progress_of_task("test_fatal_error"), - ) - .unwrap(); - assert_eq!(safepoints.len(), 4, "{:?}", safepoints); + let checkpoint = suite.global_checkpoint(); + assert!( - safepoints - .iter() - .take(3) - // They are choosing the lock safepoint, it must greater than the global checkpoint. - .all(|sp| { sp.safepoint.into_inner() >= checkpoint }), + safepoints.iter().any(|sp| { + sp.serivce.contains(&format!("{}", victim)) + && sp.ttl >= Duration::from_secs(60 * 60 * 24) + && sp.safepoint.into_inner() == checkpoint - 1 + }), "{:?}", safepoints ); + } - let sp = &safepoints[3]; - assert!(sp.serivce.contains(&format!("{}", victim)), "{:?}", sp); - assert!(sp.ttl >= Duration::from_secs(60 * 60 * 24), "{:?}", sp); + #[test] + fn region_checkpoint_info() { + let mut suite = super::SuiteBuilder::new_named("checkpoint_info") + .nodes(1) + .build(); + suite.must_register_task(1, "checkpoint_info"); + suite.must_split(&make_split_key_at_record(1, 42)); + run_async_test(suite.write_records(0, 128, 1)); + suite.force_flush_files("checkpoint_info"); + suite.wait_for_flush(); + std::thread::sleep(Duration::from_secs(1)); + let (tx, rx) = std::sync::mpsc::channel(); + suite.run(|| { + let tx = tx.clone(); + Task::RegionCheckpointsOp(RegionCheckpointOperation::Get( + RegionSet::Universal, + Box::new(move |rs| { + tx.send(rs).unwrap(); + }), + )) + }); + let checkpoints = 
rx.recv().unwrap(); + assert!(!checkpoints.is_empty(), "{:?}", checkpoints); assert!( - sp.safepoint.into_inner() == checkpoint, - "{:?} vs {}", - sp, - checkpoint + checkpoints + .iter() + .all(|cp| matches!(cp, GetCheckpointResult::Ok { checkpoint, .. } if checkpoint.into_inner() > 256)), + "{:?}", + checkpoints ); } #[test] - fn inflight_messages() { - test_util::init_log_for_test(); - // We should remove the failpoints when paniked or we may get stucked. + fn region_failure() { defer! {{ - fail::remove("delay_on_start_observe"); - fail::remove("delay_on_flush"); + fail::remove("try_start_observe"); }} - let mut suite = super::Suite::new("inflight_message", 3); - suite.must_register_task(1, "inflight_message"); - run_async_test(suite.write_records(0, 128, 1)); - fail::cfg("delay_on_flush", "pause").unwrap(); - suite.force_flush_files("inflight_message"); - fail::cfg("delay_on_start_observe", "pause").unwrap(); + let mut suite = SuiteBuilder::new_named("region_failure").build(); + let keys = run_async_test(suite.write_records(0, 128, 1)); + fail::cfg("try_start_observe", "1*return").unwrap(); + suite.must_register_task(1, "region_failure"); suite.must_shuffle_leader(1); - // Handling the `StartObserve` message and doing flush are executed asynchronously. - // Make a delay of unblocking flush thread for make sure we have handled the `StartObserve`. - std::thread::sleep(Duration::from_secs(1)); - fail::cfg("delay_on_flush", "off").unwrap(); + let keys2 = run_async_test(suite.write_records(256, 128, 1)); + suite.force_flush_files("region_failure"); suite.wait_for_flush(); - let checkpoint = run_async_test( - suite - .get_meta_cli() - .global_progress_of_task("inflight_message"), + run_async_test(suite.check_for_write_records( + suite.flushed_files.path(), + keys.union(&keys2).map(|s| s.as_slice()), + )); + } + + #[test] + fn initial_scan_failure() { + defer! 
{{ + fail::remove("scan_and_async_send"); + }} + + let mut suite = SuiteBuilder::new_named("initial_scan_failure") + .nodes(1) + .build(); + let keys = run_async_test(suite.write_records(0, 128, 1)); + fail::cfg( + "scan_and_async_send", + "1*return(dive into the temporary dream, where the SLA never bothers)", + ) + .unwrap(); + suite.must_register_task(1, "initial_scan_failure"); + let keys2 = run_async_test(suite.write_records(256, 128, 1)); + suite.force_flush_files("initial_scan_failure"); + suite.wait_for_flush(); + run_async_test(suite.check_for_write_records( + suite.flushed_files.path(), + keys.union(&keys2).map(|s| s.as_slice()), + )); + } + + #[test] + fn upload_checkpoint_exits_in_time() { + defer! {{ + std::env::remove_var("LOG_BACKUP_UGC_SLEEP_AND_RETURN"); + }} + let suite = SuiteBuilder::new_named("upload_checkpoint_exits_in_time") + .nodes(1) + .build(); + std::env::set_var("LOG_BACKUP_UGC_SLEEP_AND_RETURN", "meow"); + let (_, victim) = suite.endpoints.iter().next().unwrap(); + let sched = victim.scheduler(); + sched + .schedule(Task::UpdateGlobalCheckpoint("greenwoods".to_owned())) + .unwrap(); + let start = Instant::now(); + let (tx, rx) = tokio::sync::oneshot::channel(); + sched + .schedule(Task::Sync( + Box::new(move || { + tx.send(Instant::now()).unwrap(); + }), + Box::new(|_| true), + )) + .unwrap(); + let end = run_async_test(rx).unwrap(); + assert!( + end - start < Duration::from_secs(10), + "take = {:?}", + end - start + ); + } + + #[test] + fn failed_during_refresh_region() { + defer! 
{ + fail::remove("get_last_checkpoint_of") + } + + let mut suite = SuiteBuilder::new_named("fail_to_refresh_region") + .nodes(1) + .build(); + + suite.must_register_task(1, "fail_to_refresh_region"); + let keys = run_async_test(suite.write_records(0, 128, 1)); + fail::cfg( + "get_last_checkpoint_of", + "1*return(the stream handler wants to become a batch processor, and the batch processor wants to be a stream handler.)", + ).unwrap(); + + suite.must_split(b"SOLE"); + let keys2 = run_async_test(suite.write_records(256, 128, 1)); + suite.force_flush_files("fail_to_refresh_region"); + suite.wait_for_flush(); + run_async_test(suite.check_for_write_records( + suite.flushed_files.path(), + keys.union(&keys2).map(|s| s.as_slice()), + )); + let leader = suite.cluster.leader_of_region(1).unwrap().store_id; + let (tx, rx) = std::sync::mpsc::channel(); + suite.endpoints[&leader] + .scheduler() + .schedule(Task::RegionCheckpointsOp(RegionCheckpointOperation::Get( + RegionSet::Universal, + Box::new(move |rs| { + let _ = tx.send(rs); + }), + ))) + .unwrap(); + + let regions = rx.recv_timeout(Duration::from_secs(10)).unwrap(); + assert!( + regions.iter().all(|item| { + matches!(item, GetCheckpointResult::Ok { checkpoint, .. } if checkpoint.into_inner() > 500) + }), + "{:?}", + regions ); - fail::cfg("delay_on_start_observe", "off").unwrap(); - // The checkpoint should not advance if there are inflight messages. - assert_eq!(checkpoint.unwrap(), 0); - run_async_test(suite.write_records(256, 128, 1)); - suite.force_flush_files("inflight_message"); + } + + /// This test case tests whether we correctly handle the pessimistic locks. 
+ #[test] + fn pessimistic_lock() { + let mut suite = SuiteBuilder::new_named("pessimistic_lock").nodes(3).build(); + suite.must_kv_pessimistic_lock( + 1, + vec![make_record_key(1, 42)], + suite.tso(), + make_record_key(1, 42), + ); + suite.must_register_task(1, "pessimistic_lock"); + suite.must_kv_pessimistic_lock( + 1, + vec![make_record_key(1, 43)], + suite.tso(), + make_record_key(1, 43), + ); + let expected_tso = suite.tso().into_inner(); + suite.force_flush_files("pessimistic_lock"); suite.wait_for_flush(); + std::thread::sleep(Duration::from_secs(1)); + run_async_test(suite.advance_global_checkpoint("pessimistic_lock")).unwrap(); let checkpoint = run_async_test( suite .get_meta_cli() - .global_progress_of_task("inflight_message"), + .global_progress_of_task("pessimistic_lock"), ) .unwrap(); - // The checkpoint should be advanced as expection when the inflight message has been consumed. - assert!(checkpoint > 512, "checkpoint = {}", checkpoint); + // The checkpoint should be advanced: because PiTR is "Read" operation, + // which shouldn't be blocked by pessimistic locks. 
+ assert!( + checkpoint > expected_tso, + "expected = {}; checkpoint = {}", + expected_tso, + checkpoint + ); + } + + async fn collect_all_current( + mut s: impl Stream + Unpin, + max_gap: Duration, + ) -> Vec { + let mut r = vec![]; + while let Ok(Some(x)) = timeout(max_gap, s.next()).await { + r.push(x); + } + r + } + + async fn collect_current(mut s: impl Stream + Unpin, goal: usize) -> Vec { + let mut r = vec![]; + while let Ok(Some(x)) = timeout(Duration::from_secs(10), s.next()).await { + r.push(x); + if r.len() >= goal { + return r; + } + } + r + } + + #[test] + fn subscribe_flushing() { + let mut suite = super::SuiteBuilder::new_named("sub_flush").build(); + let stream = suite.flush_stream(true); + for i in 1..10 { + let split_key = make_split_key_at_record(1, i * 20); + suite.must_split(&split_key); + suite.must_shuffle_leader(suite.cluster.get_region_id(&split_key)); + } + + let round1 = run_async_test(suite.write_records(0, 128, 1)); + suite.must_register_task(1, "sub_flush"); + let round2 = run_async_test(suite.write_records(256, 128, 1)); + suite.sync(); + suite.force_flush_files("sub_flush"); + + let mut items = run_async_test(async { + collect_current( + stream.flat_map(|(_, r)| futures::stream::iter(r.events.into_iter())), + 10, + ) + .await + }); + + items.sort_by(|x, y| x.start_key.cmp(&y.start_key)); + + println!("{:?}", items); + assert_eq!(items.len(), 10); + + assert_eq!(items.first().unwrap().start_key, Vec::::default()); + for w in items.windows(2) { + let a = &w[0]; + let b = &w[1]; + assert!(a.checkpoint > 512); + assert!(b.checkpoint > 512); + assert_eq!(a.end_key, b.start_key); + } + assert_eq!(items.last().unwrap().end_key, Vec::::default()); + + run_async_test(suite.check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(|x| x.as_slice()), + )); + } + + #[test] + fn failure_and_split() { + let mut suite = super::SuiteBuilder::new_named("failure_and_split") + .nodes(1) + .build(); + 
fail::cfg("try_start_observe0", "pause").unwrap(); + + // write data before the task starting, for testing incremental scanning. + let round1 = run_async_test(suite.write_records(0, 128, 1)); + suite.must_register_task(1, "failure_and_split"); + suite.sync(); + + suite.must_split(&make_split_key_at_record(1, 42)); + suite.sync(); + std::thread::sleep(Duration::from_millis(200)); + fail::cfg("try_start_observe", "2*return").unwrap(); + fail::cfg("try_start_observe0", "off").unwrap(); + + let round2 = run_async_test(suite.write_records(256, 128, 1)); + suite.force_flush_files("failure_and_split"); + suite.wait_for_flush(); + run_async_test(suite.check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(Vec::as_slice), + )); + let cp = suite.global_checkpoint(); + assert!(cp > 512, "it is {}", cp); + suite.cluster.shutdown(); + } + + #[test] + fn resolved_follower() { + let mut suite = super::SuiteBuilder::new_named("r").build(); + let round1 = run_async_test(suite.write_records(0, 128, 1)); + suite.must_register_task(1, "r"); + suite.run(|| Task::RegionCheckpointsOp(RegionCheckpointOperation::PrepareMinTsForResolve)); + suite.sync(); + std::thread::sleep(Duration::from_secs(1)); + + let leader = suite.cluster.leader_of_region(1).unwrap(); + suite.must_shuffle_leader(1); + let round2 = run_async_test(suite.write_records(256, 128, 1)); + suite + .endpoints + .get(&leader.store_id) + .unwrap() + .scheduler() + .schedule(Task::ForceFlush("r".to_owned())) + .unwrap(); + suite.sync(); + std::thread::sleep(Duration::from_secs(2)); + run_async_test(suite.check_for_write_records( + suite.flushed_files.path(), + round1.iter().map(|x| x.as_slice()), + )); + assert!(suite.global_checkpoint() > 256); + suite.force_flush_files("r"); + suite.wait_for_flush(); + assert!(suite.global_checkpoint() > 512); + run_async_test(suite.check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(|x| x.as_slice()), + )); + } + + #[test] + fn 
network_partition() { + let mut suite = super::SuiteBuilder::new_named("network_partition") + .nodes(3) + .build(); + let stream = suite.flush_stream(true); + suite.must_register_task(1, "network_partition"); + let leader = suite.cluster.leader_of_region(1).unwrap(); + let round1 = run_async_test(suite.write_records(0, 64, 1)); + + suite + .cluster + .add_send_filter(IsolationFilterFactory::new(leader.store_id)); + suite.cluster.reset_leader_of_region(1); + suite + .cluster + .must_wait_for_leader_expire(leader.store_id, 1); + let leader2 = suite.cluster.leader_of_region(1).unwrap(); + assert_ne!(leader.store_id, leader2.store_id, "leader not switched."); + let ts = suite.tso(); + suite.must_kv_prewrite( + 1, + vec![mutation(make_record_key(1, 778), b"generator".to_vec())], + make_record_key(1, 778), + ts, + ); + suite.sync(); + suite.force_flush_files("network_partition"); + suite.wait_for_flush(); + + let cps = run_async_test(collect_all_current(stream, Duration::from_secs(2))); + assert!( + cps.iter() + .flat_map(|(_s, cp)| cp.events.iter().map(|resp| resp.checkpoint)) + .all(|cp| cp <= ts.into_inner()), + "ts={} cps={:?}", + ts, + cps + ); + run_async_test(suite.check_for_write_records( + suite.flushed_files.path(), + round1.iter().map(|k| k.as_slice()), + )) + } + + #[test] + fn test_retry_abort() { + let mut suite = super::SuiteBuilder::new_named("retry_abort") + .nodes(1) + .build(); + defer! 
{ + fail::list().into_iter().for_each(|(name, _)| fail::remove(name)) + }; + + suite.must_register_task(1, "retry_abort"); + fail::cfg("subscribe_mgr_retry_start_observe_delay", "return(10)").unwrap(); + fail::cfg("try_start_observe", "return()").unwrap(); + + suite.must_split(&make_split_key_at_record(1, 42)); + std::thread::sleep(Duration::from_secs(2)); + + let error = run_async_test(suite.get_meta_cli().get_last_error("retry_abort", 1)).unwrap(); + let error = error.expect("no error uploaded"); + error + .get_error_message() + .find("retry") + .expect("error doesn't contain retry"); + fail::cfg("try_start_observe", "10*return()").unwrap(); + // Resume the task manually... + run_async_test(async { + suite + .meta_store + .delete(Keys::Key(MetaKey::pause_of("retry_abort"))) + .await?; + suite + .meta_store + .delete(Keys::Prefix(MetaKey::last_errors_of("retry_abort"))) + .await?; + backup_stream::errors::Result::Ok(()) + }) + .unwrap(); + + suite.sync(); + suite.wait_with(move |r| block_on(r.get_task_info("retry_abort")).is_ok()); + let items = run_async_test(suite.write_records(0, 128, 1)); + suite.force_flush_files("retry_abort"); + suite.wait_for_flush(); + run_async_test( + suite.check_for_write_records( + suite.flushed_files.path(), + items.iter().map(Vec::as_slice), + ), + ); } } diff --git a/components/backup/Cargo.toml b/components/backup/Cargo.toml index effe13c4e08..4f12dd04c36 100644 --- a/components/backup/Cargo.toml +++ b/components/backup/Cargo.toml @@ -33,46 +33,48 @@ mem-profiling = ["tikv/mem-profiling"] failpoints = ["tikv/failpoints"] [dependencies] -api_version = { path = "../api_version", default-features = false } +api_version = { workspace = true } async-channel = "1.4" -collections = { path = "../collections" } -concurrency_manager = { path = "../concurrency_manager", default-features = false } +aws = { workspace = true } +causal_ts = { workspace = true } +collections = { workspace = true } +concurrency_manager = { workspace = true } 
crc64fast = "0.1" -encryption = { path = "../encryption", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -error_code = { path = "../error_code", default-features = false } -external_storage = { path = "../external_storage", default-features = false } -external_storage_export = { path = "../external_storage/export", default-features = false } -file_system = { path = "../file_system", default-features = false } +encryption = { workspace = true } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } +error_code = { workspace = true } +external_storage = { workspace = true } +external_storage_export = { workspace = true } +file_system = { workspace = true } futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } hex = "0.4" -keys = { path = "../keys", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +keys = { workspace = true } +kvproto = { workspace = true } lazy_static = "1.3" -log_wrappers = { path = "../log_wrappers" } -online_config = { path = "../online_config" } -pd_client = { path = "../pd_client", default-features = false } +log_wrappers = { workspace = true } +online_config = { workspace = true } +pd_client = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raftstore = { path = "../raftstore", default-features = false } -security = { path = "../security", default-features = false } +raftstore = { workspace = true } +security = { workspace = true } serde = "1.0" serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", 
"release_max_level_debug"] } +slog = { workspace = true } # better to not use slog-global, but pass in the logger -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog-global = { workspace = true } thiserror = "1.0" -tidb_query_common = { path = "../tidb_query_common", default-features = false } -tikv = { path = "../../", default-features = false } -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tidb_query_common = { workspace = true } +tikv = { workspace = true } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } tokio-stream = "0.1" -txn_types = { path = "../txn_types", default-features = false } -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +txn_types = { workspace = true } +yatp = { workspace = true } [dev-dependencies] rand = "0.8" diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 3a737ba52d2..4fb1705ebab 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -9,25 +9,25 @@ use std::{ }; use async_channel::SendError; +use causal_ts::{CausalTsProvider, CausalTsProviderImpl}; use concurrency_manager::ConcurrencyManager; -use engine_rocks::raw::DB; -use engine_traits::{name_to_cf, raw_ttl::ttl_current_ts, CfName, SstCompressionType}; +use engine_traits::{name_to_cf, raw_ttl::ttl_current_ts, CfName, KvEngine, SstCompressionType}; use external_storage::{BackendConfig, HdfsConfig}; use external_storage_export::{create_storage, ExternalStorage}; -use futures::channel::mpsc::*; +use futures::{channel::mpsc::*, executor::block_on}; use kvproto::{ brpb::*, encryptionpb::EncryptionMethod, - kvrpcpb::{ApiVersion, Context, IsolationLevel}, + kvrpcpb::{ApiVersion, Context, IsolationLevel, KeyRange}, metapb::*, }; use online_config::OnlineConfig; use 
raft::StateRole; -use raftstore::{coprocessor::RegionInfoProvider, store::util::find_peer}; +use raftstore::coprocessor::RegionInfoProvider; use tikv::{ config::BackupConfig, storage::{ - kv::{CursorBuilder, Engine, ScanMode, SnapContext}, + kv::{CursorBuilder, Engine, LocalTablets, ScanMode, SnapContext}, mvcc::Error as MvccError, raw::raw_mvcc::RawMvccSnapshot, txn::{EntryBatch, Error as TxnError, SnapshotStore, TxnEntryScanner, TxnEntryStore}, @@ -36,6 +36,7 @@ use tikv::{ }; use tikv_util::{ box_err, debug, error, error_unknown, impl_display_as_debug, info, + store::find_peer, time::{Instant, Limiter}, warn, worker::Runnable, @@ -57,6 +58,7 @@ const BACKUP_BATCH_LIMIT: usize = 1024; struct Request { start_key: Vec, end_key: Vec, + sub_ranges: Vec, start_ts: TimeStamp, end_ts: TimeStamp, limiter: Limiter, @@ -68,6 +70,7 @@ struct Request { compression_type: CompressionType, compression_level: i32, cipher: CipherInfo, + replica_read: bool, } /// Backup Task. @@ -117,6 +120,7 @@ impl Task { request: Request { start_key: req.get_start_key().to_owned(), end_key: req.get_end_key().to_owned(), + sub_ranges: req.get_sub_ranges().to_owned(), start_ts: req.get_start_version().into(), end_ts: req.get_end_version().into(), backend: req.get_storage_backend().clone(), @@ -127,6 +131,7 @@ impl Task { cf, compression_type: req.get_compression_type(), compression_level: req.get_compression_level(), + replica_read: req.get_replica_read(), cipher: req.cipher_info.unwrap_or_else(|| { let mut cipher = CipherInfo::default(); cipher.set_cipher_type(EncryptionMethod::Plaintext); @@ -149,19 +154,20 @@ pub struct BackupRange { start_key: Option, end_key: Option, region: Region, - leader: Peer, + peer: Peer, codec: KeyValueCodec, cf: CfName, + uses_replica_read: bool, } /// The generic saveable writer. for generic `InMemBackupFiles`. /// Maybe what we really need is make Writer a trait... 
-enum KvWriter { - Txn(BackupWriter), - Raw(BackupRawKvWriter), +enum KvWriter { + Txn(BackupWriter), + Raw(BackupRawKvWriter), } -impl std::fmt::Debug for KvWriter { +impl std::fmt::Debug for KvWriter { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::Txn(_) => f.debug_tuple("Txn").finish(), @@ -170,7 +176,7 @@ impl std::fmt::Debug for KvWriter { } } -impl KvWriter { +impl KvWriter { async fn save(self, storage: &dyn ExternalStorage) -> Result> { match self { Self::Txn(writer) => writer.save(storage).await, @@ -187,8 +193,8 @@ impl KvWriter { } #[derive(Debug)] -struct InMemBackupFiles { - files: KvWriter, +struct InMemBackupFiles { + files: KvWriter, start_key: Vec, end_key: Vec, start_version: TimeStamp, @@ -196,8 +202,8 @@ struct InMemBackupFiles { region: Region, } -async fn save_backup_file_worker( - rx: async_channel::Receiver, +async fn save_backup_file_worker( + rx: async_channel::Receiver>, tx: UnboundedSender, storage: Arc, codec: KeyValueCodec, @@ -206,20 +212,30 @@ async fn save_backup_file_worker( let files = if msg.files.need_flush_keys() { match msg.files.save(&storage).await { Ok(mut split_files) => { + let mut has_err = false; for file in split_files.iter_mut() { // In the case that backup from v1 and restore to v2, // the file range need be encoded as v2 format. // And range in response keep in v1 format. 
- let (start, end) = codec.convert_key_range_to_dst_version( + let ret = codec.convert_key_range_to_dst_version( msg.start_key.clone(), msg.end_key.clone(), ); + if ret.is_err() { + has_err = true; + break; + } + let (start, end) = ret.unwrap(); file.set_start_key(start); file.set_end_key(end); file.set_start_version(msg.start_version.into_inner()); file.set_end_version(msg.end_version.into_inner()); } - Ok(split_files) + if has_err { + Err(box_err!("backup convert key range failed")) + } else { + Ok(split_files) + } } Err(e) => { error_unknown!(?e; "backup save file failed"); @@ -259,10 +275,10 @@ async fn save_backup_file_worker( /// Send the save task to the save worker. /// Record the wait time at the same time. -async fn send_to_worker_with_metrics( - tx: &async_channel::Sender, - files: InMemBackupFiles, -) -> std::result::Result<(), SendError> { +async fn send_to_worker_with_metrics( + tx: &async_channel::Sender>, + files: InMemBackupFiles, +) -> std::result::Result<(), SendError>> { let files = match tx.try_send(files) { Ok(_) => return Ok(()), Err(e) => e.into_inner(), @@ -277,46 +293,58 @@ impl BackupRange { /// Get entries from the scanner and save them to storage async fn backup( &self, - writer_builder: BackupWriterBuilder, - engine: E, + writer_builder: BackupWriterBuilder, + mut engine: E, concurrency_manager: ConcurrencyManager, backup_ts: TimeStamp, begin_ts: TimeStamp, - saver: async_channel::Sender, + saver: async_channel::Sender>, + storage_name: &str, ) -> Result { assert!(!self.codec.is_raw_kv); let mut ctx = Context::default(); ctx.set_region_id(self.region.get_id()); ctx.set_region_epoch(self.region.get_region_epoch().to_owned()); - ctx.set_peer(self.leader.clone()); - - // Update max_ts and check the in-memory lock table before getting the snapshot - concurrency_manager.update_max_ts(backup_ts); - concurrency_manager - .read_range_check( - self.start_key.as_ref(), - self.end_key.as_ref(), - |key, lock| { - Lock::check_ts_conflict( - 
Cow::Borrowed(lock), - key, - backup_ts, - &Default::default(), - IsolationLevel::Si, - ) - }, - ) - .map_err(MvccError::from) - .map_err(TxnError::from)?; + ctx.set_peer(self.peer.clone()); + ctx.set_replica_read(self.uses_replica_read); + ctx.set_isolation_level(IsolationLevel::Si); - // Currently backup always happens on the leader, so we don't need - // to set key ranges and start ts to check. - assert!(!ctx.get_replica_read()); - let snap_ctx = SnapContext { + let mut snap_ctx = SnapContext { pb_ctx: &ctx, + allowed_in_flashback: self.region.is_in_flashback, ..Default::default() }; + if self.uses_replica_read { + snap_ctx.start_ts = Some(backup_ts); + let mut key_range = KeyRange::default(); + if let Some(start_key) = self.start_key.as_ref() { + key_range.set_start_key(start_key.clone().into_encoded()); + } + if let Some(end_key) = self.end_key.as_ref() { + key_range.set_end_key(end_key.clone().into_encoded()); + } + snap_ctx.key_ranges = vec![key_range]; + } else { + // Update max_ts and check the in-memory lock table before getting the snapshot + concurrency_manager.update_max_ts(backup_ts); + concurrency_manager + .read_range_check( + self.start_key.as_ref(), + self.end_key.as_ref(), + |key, lock| { + Lock::check_ts_conflict( + Cow::Borrowed(lock), + key, + backup_ts, + &Default::default(), + IsolationLevel::Si, + ) + }, + ) + .map_err(MvccError::from) + .map_err(TxnError::from)?; + } let start_snapshot = Instant::now(); let snapshot = match engine.snapshot(snap_ctx) { @@ -333,7 +361,7 @@ impl BackupRange { snapshot, backup_ts, IsolationLevel::Si, - false, /* fill_cache */ + false, // fill_cache Default::default(), Default::default(), false, @@ -352,7 +380,7 @@ impl BackupRange { .start_key .clone() .map_or_else(Vec::new, |k| k.into_raw().unwrap()); - let mut writer = writer_builder.build(next_file_start_key.clone())?; + let mut writer = writer_builder.build(next_file_start_key.clone(), storage_name)?; loop { if let Err(e) = scanner.scan_entries(&mut batch) 
{ error!(?e; "backup scan entries failed"); @@ -386,7 +414,7 @@ impl BackupRange { send_to_worker_with_metrics(&saver, msg).await?; next_file_start_key = this_end_key; writer = writer_builder - .build(next_file_start_key.clone()) + .build(next_file_start_key.clone(), storage_name) .map_err(|e| { error_unknown!(?e; "backup writer failed"); e @@ -431,9 +459,9 @@ impl BackupRange { Ok(stat) } - fn backup_raw( + fn backup_raw( &self, - writer: &mut BackupRawKvWriter, + writer: &mut BackupRawKvWriter, snapshot: &S, ) -> Result { assert!(self.codec.is_raw_kv); @@ -443,6 +471,7 @@ impl BackupRange { let mut cursor = CursorBuilder::new(snapshot, self.cf) .range(None, self.end_key.clone()) .scan_mode(ScanMode::Forward) + .fill_cache(false) .build()?; if let Some(begin) = self.start_key.clone() { if !cursor.seek(&begin, cfstatistics)? { @@ -457,7 +486,7 @@ impl BackupRange { while cursor.valid()? && batch.len() < BACKUP_BATCH_LIMIT { let key = cursor.key(cfstatistics); let value = cursor.value(cfstatistics); - let is_valid = self.codec.is_valid_raw_value(key, value, current_ts)?; + let (is_valid, expired) = self.codec.is_valid_raw_value(key, value, current_ts)?; if is_valid { batch.push(Ok(( self.codec @@ -465,6 +494,8 @@ impl BackupRange { .into_encoded(), self.codec.convert_encoded_value_to_dst_version(value)?, ))); + } else if expired { + cfstatistics.raw_value_tombstone += 1; }; debug!("backup raw key"; "key" => &log_wrappers::Value::key(&self.codec.convert_encoded_key_to_dst_version(key)?.into_encoded()), @@ -491,15 +522,15 @@ impl BackupRange { async fn backup_raw_kv_to_file( &self, - engine: E, - db: Arc, + mut engine: E, + db: E::Local, limiter: &Limiter, file_name: String, cf: CfNameWrap, compression_type: Option, compression_level: i32, cipher: CipherInfo, - saver_tx: async_channel::Sender, + saver_tx: async_channel::Sender>, ) -> Result { let mut writer = match BackupRawKvWriter::new( db, @@ -521,7 +552,8 @@ impl BackupRange { let mut ctx = Context::default(); 
ctx.set_region_id(self.region.get_id()); ctx.set_region_epoch(self.region.get_region_epoch().to_owned()); - ctx.set_peer(self.leader.clone()); + ctx.set_peer(self.peer.clone()); + let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -565,8 +597,7 @@ pub struct ConfigManager(Arc>); impl online_config::ConfigManager for ConfigManager { fn dispatch(&mut self, change: online_config::ConfigChange) -> online_config::Result<()> { - self.0.write().unwrap().update(change); - Ok(()) + self.0.write().unwrap().update(change) } } @@ -647,11 +678,12 @@ pub struct Endpoint { store_id: u64, pool: RefCell, io_pool: Runtime, - db: Arc, + tablets: LocalTablets, config_manager: ConfigManager, concurrency_manager: ConcurrencyManager, softlimit: SoftLimitKeeper, api_version: ApiVersion, + causal_ts_provider: Option>, // used in rawkv apiv2 only pub(crate) engine: E, pub(crate) region_info: R, @@ -660,6 +692,8 @@ pub struct Endpoint { /// The progress of a backup task pub struct Progress { store_id: u64, + ranges: Vec<(Option, Option)>, + next_index: usize, next_start: Option, end_key: Option, region_info: R, @@ -669,7 +703,7 @@ pub struct Progress { } impl Progress { - fn new( + fn new_with_range( store_id: u64, next_start: Option, end_key: Option, @@ -677,21 +711,48 @@ impl Progress { codec: KeyValueCodec, cf: CfName, ) -> Self { - Progress { + let ranges = vec![(next_start, end_key)]; + Self::new_with_ranges(store_id, ranges, region_info, codec, cf) + } + + fn new_with_ranges( + store_id: u64, + ranges: Vec<(Option, Option)>, + region_info: R, + codec: KeyValueCodec, + cf: CfName, + ) -> Self { + let mut prs = Progress { store_id, - next_start, - end_key, + ranges, + next_index: 0, + next_start: None, + end_key: None, region_info, finished: false, codec, cf, + }; + prs.try_next(); + prs + } + + /// try the next range. If all the ranges are consumed, + /// set self.finish true. 
+ fn try_next(&mut self) { + if self.ranges.len() > self.next_index { + (self.next_start, self.end_key) = self.ranges[self.next_index].clone(); + + self.next_index += 1; + } else { + self.finished = true; } } /// Forward the progress by `ranges` BackupRanges /// /// The size of the returned BackupRanges should <= `ranges` - fn forward(&mut self, limit: usize) -> Vec { + fn forward(&mut self, limit: usize, replica_read: bool) -> Vec { if self.finished { return Vec::new(); } @@ -721,18 +782,20 @@ impl Progress { break; } } - if info.role == StateRole::Leader { + let peer = find_peer(region, store_id).unwrap().to_owned(); + // Raft peer role has to match the replica read flag. + if replica_read || info.role == StateRole::Leader { let ekey = get_min_end_key(end_key.as_ref(), region); let skey = get_max_start_key(start_key.as_ref(), region); assert!(!(skey == ekey && ekey.is_some()), "{:?} {:?}", skey, ekey); - let leader = find_peer(region, store_id).unwrap().to_owned(); let backup_range = BackupRange { start_key: skey, end_key: ekey, region: region.clone(), - leader, + peer, codec, cf: cf_name, + uses_replica_read: info.role != StateRole::Leader, }; tx.send(backup_range).unwrap(); count += 1; @@ -754,11 +817,12 @@ impl Progress { // region, we need to set the `finished` flag here in case // we run with `next_start` set to None if b.region.get_end_key().is_empty() || b.end_key == self.end_key { - self.finished = true; + self.try_next(); + } else { + self.next_start = b.end_key.clone(); } - self.next_start = b.end_key.clone(); } else { - self.finished = true; + self.try_next(); } branges } @@ -769,10 +833,11 @@ impl Endpoint { store_id: u64, engine: E, region_info: R, - db: Arc, + tablets: LocalTablets, config: BackupConfig, concurrency_manager: ConcurrencyManager, api_version: ApiVersion, + causal_ts_provider: Option>, ) -> Endpoint { let pool = ControlThreadPool::new(); let rt = utils::create_tokio_runtime(config.io_thread_size, "backup-io").unwrap(); @@ -784,12 
+849,13 @@ impl Endpoint { engine, region_info, pool: RefCell::new(pool), - db, + tablets, io_pool: rt, softlimit, config_manager, concurrency_manager, api_version, + causal_ts_provider, } } @@ -818,14 +884,14 @@ impl Endpoint { &self, prs: Arc>>, request: Request, - saver_tx: async_channel::Sender, + saver_tx: async_channel::Sender>, resp_tx: UnboundedSender, _backend: Arc, ) { let start_ts = request.start_ts; let backup_ts = request.end_ts; let engine = self.engine.clone(); - let db = self.db.clone(); + let tablets = self.tablets.clone(); let store_id = self.store_id; let concurrency_manager = self.concurrency_manager.clone(); let batch_size = self.config_manager.0.read().unwrap().batch_size; @@ -834,8 +900,8 @@ impl Endpoint { self.pool.borrow_mut().spawn(async move { loop { - // when get the guard, release it until we finish scanning a batch, - // because if we were suspended during scanning, + // when get the guard, release it until we finish scanning a batch, + // because if we were suspended during scanning, // the region info have higher possibility to change (then we must compensate that by the fine-grained backup). let guard = limit.guard().await; if let Err(e) = guard { @@ -856,7 +922,7 @@ impl Endpoint { // (See https://tokio.rs/tokio/tutorial/shared-state) // Use &mut and mark the type for making rust-analyzer happy. 
let progress: &mut Progress<_> = &mut prs.lock().unwrap(); - let batch = progress.forward(batch_size); + let batch = progress.forward(batch_size, request.replica_read); if batch.is_empty() { return; } @@ -878,14 +944,21 @@ impl Endpoint { let input = brange.codec.decode_backup_key(Some(k)).unwrap_or_default(); file_system::sha256(&input).ok().map(hex::encode) }); - let name = backup_file_name(store_id, &brange.region, key); + let name = backup_file_name(store_id, &brange.region, key, _backend.name()); let ct = to_sst_compression_type(request.compression_type); + let db = match tablets.get(brange.region.id) { + Some(t) => t, + None => { + warn!("backup region not found"; "region" => ?brange.region.id); + return; + } + }; let stat = if is_raw_kv { brange .backup_raw_kv_to_file( engine, - db.clone(), + db.into_owned(), &request.limiter, name, cf.into(), @@ -900,7 +973,7 @@ impl Endpoint { store_id, request.limiter.clone(), brange.region.clone(), - db.clone(), + db.into_owned(), ct, request.compression_level, sst_max_size, @@ -914,6 +987,7 @@ impl Endpoint { backup_ts, start_ts, saver_tx.clone(), + _backend.name(), ) .await }; @@ -927,6 +1001,7 @@ impl Endpoint { } } Ok(stat) => { + BACKUP_RAW_EXPIRED_COUNT.inc_by(stat.data.raw_value_tombstone as u64); // TODO: maybe add the stat to metrics? 
debug!("backup region finish"; "region" => ?brange.region, @@ -938,6 +1013,39 @@ impl Endpoint { }); } + fn get_progress_by_req( + &self, + request: &Request, + codec: KeyValueCodec, + ) -> Arc>> { + if request.sub_ranges.is_empty() { + let start_key = codec.encode_backup_key(request.start_key.clone()); + let end_key = codec.encode_backup_key(request.end_key.clone()); + Arc::new(Mutex::new(Progress::new_with_range( + self.store_id, + start_key, + end_key, + self.region_info.clone(), + codec, + request.cf, + ))) + } else { + let mut ranges = Vec::with_capacity(request.sub_ranges.len()); + for k in &request.sub_ranges { + let start_key = codec.encode_backup_key(k.start_key.clone()); + let end_key = codec.encode_backup_key(k.end_key.clone()); + ranges.push((start_key, end_key)); + } + Arc::new(Mutex::new(Progress::new_with_ranges( + self.store_id, + ranges, + self.region_info.clone(), + codec, + request.cf, + ))) + } + } + pub fn handle_backup_task(&self, task: Task) { let Task { request, resp } = task; let codec = KeyValueCodec::new(request.is_raw_kv, self.api_version, request.dst_api_ver); @@ -953,17 +1061,32 @@ impl Endpoint { } return; } - let start_key = codec.encode_backup_key(request.start_key.clone()); - let end_key = codec.encode_backup_key(request.end_key.clone()); + // Flush causal timestamp to make sure that future writes will have larger + // timestamps. And help TiKV-BR acquire a backup-ts with intact data + // smaller than it. (Note that intactness is not fully ensured now, + // until the safe-ts of RawKV is implemented. TiKV-BR need a workaround + // by rewinding backup-ts to a small "safe interval"). 
+ if request.is_raw_kv { + if let Err(e) = self + .causal_ts_provider + .as_ref() + .map_or(Ok(TimeStamp::new(0)), |provider| { + block_on(provider.async_flush()) + }) + { + error!("backup flush causal timestamp failed"; "err" => ?e); + let mut response = BackupResponse::default(); + let err_msg = format!("fail to flush causal ts, {:?}", e); + response.set_error(crate::Error::Other(box_err!(err_msg)).into()); + if let Err(err) = resp.unbounded_send(response) { + error_unknown!(?err; "backup failed to send response"); + } + return; + } + } + + let prs = self.get_progress_by_req(&request, codec); - let prs = Arc::new(Mutex::new(Progress::new( - self.store_id, - start_key, - end_key, - self.region_info.clone(), - codec, - request.cf, - ))); let backend = match create_storage(&request.backend, self.get_config()) { Ok(backend) => backend, Err(err) => { @@ -979,7 +1102,6 @@ impl Endpoint { let backend = Arc::::from(backend); let concurrency = self.config_manager.0.read().unwrap().num_threads; self.pool.borrow_mut().adjust_with(concurrency); - // make the buffer small enough to implement back pressure. let (tx, rx) = async_channel::bounded(1); for _ in 0..concurrency { self.spawn_backup_worker( @@ -1052,30 +1174,65 @@ fn get_max_start_key(start_key: Option<&Key>, region: &Region) -> Option { } } -/// Construct an backup file name based on the given store id, region, range start key and local unix timestamp. -/// A name consists with five parts: store id, region_id, a epoch version, the hash of range start key and timestamp. -/// range start key is used to keep the unique file name for file, to handle different tables exists on the same region. -/// local unix timestamp is used to keep the unique file name for file, to handle receive the same request after connection reset. -pub fn backup_file_name(store_id: u64, region: &Region, key: Option) -> String { +/// Construct an backup file name based on the given store id, region, range +/// start key and local unix timestamp. 
A name consists with five parts: store +/// id, region_id, a epoch version, the hash of range start key and timestamp. +/// range start key is used to keep the unique file name for file, to handle +/// different tables exists on the same region. local unix timestamp is used to +/// keep the unique file name for file, to handle receive the same request after +/// connection reset. +pub fn backup_file_name( + store_id: u64, + region: &Region, + key: Option, + storage_name: &str, +) -> String { let start = SystemTime::now(); let since_the_epoch = start .duration_since(UNIX_EPOCH) .expect("Time went backwards"); - match key { - Some(k) => format!( - "{}_{}_{}_{}_{}", - store_id, - region.get_id(), - region.get_region_epoch().get_version(), - k, - since_the_epoch.as_millis() - ), - None => format!( - "{}_{}_{}", - store_id, - region.get_id(), - region.get_region_epoch().get_version() - ), + + match (key, storage_name) { + // See https://github.com/pingcap/tidb/issues/30087 + // To avoid 503 Slow Down error, if the backup storage is s3, + // organize the backup files by store_id (use slash (/) as delimiter). 
+ (Some(k), aws::STORAGE_NAME | external_storage::local::STORAGE_NAME) => { + format!( + "{}/{}_{}_{}_{}", + store_id, + region.get_id(), + region.get_region_epoch().get_version(), + k, + since_the_epoch.as_millis() + ) + } + (Some(k), _) => { + format!( + "{}_{}_{}_{}_{}", + store_id, + region.get_id(), + region.get_region_epoch().get_version(), + k, + since_the_epoch.as_millis() + ) + } + + (None, aws::STORAGE_NAME | external_storage::local::STORAGE_NAME) => { + format!( + "{}/{}_{}", + store_id, + region.get_id(), + region.get_region_epoch().get_version() + ) + } + (None, _) => { + format!( + "{}_{}_{}", + store_id, + region.get_id(), + region.get_region_epoch().get_version() + ) + } } } @@ -1102,30 +1259,29 @@ pub mod tests { use std::{ fs, path::{Path, PathBuf}, - sync::Mutex, + sync::{Mutex, RwLock}, time::Duration, }; use api_version::{api_v2::RAW_KEY_PREFIX, dispatch_api_version, KvFormat, RawValue}; + use collections::HashSet; use engine_traits::MiscExt; use external_storage_export::{make_local_backend, make_noop_backend}; - use file_system::{IOOp, IORateLimiter, IOType}; + use file_system::{IoOp, IoRateLimiter, IoType}; use futures::{executor::block_on, stream::StreamExt}; use kvproto::metapb; - use raftstore::{ - coprocessor::{RegionCollector, Result as CopResult, SeekRegionCallback}, - store::util::new_peer, - }; + use raftstore::coprocessor::{RegionCollector, Result as CopResult, SeekRegionCallback}; use rand::Rng; use tempfile::TempDir; use tikv::{ coprocessor::checksum_crc64_xor, storage::{ + kv::LocalTablets, txn::tests::{must_commit, must_prewrite_put}, RocksEngine, TestEngineBuilder, }, }; - use tikv_util::config::ReadableSize; + use tikv_util::{config::ReadableSize, store::new_peer}; use tokio::time; use txn_types::SHORT_VALUE_MAX_LEN; @@ -1141,7 +1297,9 @@ pub mod tests { impl MockRegionInfoProvider { pub fn new(encode_key: bool) -> Self { MockRegionInfoProvider { - regions: Arc::new(Mutex::new(RegionCollector::new())), + regions: 
Arc::new(Mutex::new(RegionCollector::new(Arc::new(RwLock::new( + HashSet::default(), + ))))), cancel: None, need_encode_key: encode_key, } @@ -1171,6 +1329,38 @@ pub mod tests { map.create_region(r, StateRole::Leader); } } + pub fn add_region( + &self, + id: u64, + mut start_key: Vec, + mut end_key: Vec, + peer_role: metapb::PeerRole, + state_role: StateRole, + ) { + let mut region = metapb::Region::default(); + region.set_id(id); + if !start_key.is_empty() { + if self.need_encode_key { + start_key = Key::from_raw(&start_key).into_encoded(); + } else { + start_key = Key::from_encoded(start_key).into_encoded(); + } + } + if !end_key.is_empty() { + if self.need_encode_key { + end_key = Key::from_raw(&end_key).into_encoded(); + } else { + end_key = Key::from_encoded(end_key).into_encoded(); + } + } + region.set_start_key(start_key); + region.set_end_key(end_key); + let mut new_peer = new_peer(1, 1); + new_peer.set_role(peer_role); + region.mut_peers().push(new_peer); + let mut map = self.regions.lock().unwrap(); + map.create_region(region, state_role); + } fn canecl_on_seek(&mut self, cancel: Arc) { self.cancel = Some(cancel); } @@ -1189,18 +1379,19 @@ pub mod tests { } pub fn new_endpoint() -> (TempDir, Endpoint) { - new_endpoint_with_limiter(None, ApiVersion::V1, false) + new_endpoint_with_limiter(None, ApiVersion::V1, false, None) } pub fn new_endpoint_with_limiter( - limiter: Option>, + limiter: Option>, api_version: ApiVersion, is_raw_kv: bool, + causal_ts_provider: Option>, ) -> (TempDir, Endpoint) { let temp = TempDir::new().unwrap(); let rocks = TestEngineBuilder::new() .path(temp.path()) - .cfs(&[ + .cfs([ engine_traits::CF_DEFAULT, engine_traits::CF_LOCK, engine_traits::CF_WRITE, @@ -1211,14 +1402,14 @@ pub mod tests { .unwrap(); let concurrency_manager = ConcurrencyManager::new(1.into()); let need_encode_key = !is_raw_kv || api_version == ApiVersion::V2; - let db = rocks.get_rocksdb().get_sync_db(); + let db = rocks.get_rocksdb(); ( temp, Endpoint::new( 1, 
rocks, MockRegionInfoProvider::new(need_encode_key), - db, + LocalTablets::Singleton(db), BackupConfig { num_threads: 4, batch_size: 8, @@ -1227,6 +1418,7 @@ pub mod tests { }, concurrency_manager, api_version, + causal_ts_provider, ), ) } @@ -1304,17 +1496,9 @@ pub mod tests { // Test seek backup range. let test_seek_backup_range = |start_key: &[u8], end_key: &[u8], expect: Vec<(&[u8], &[u8])>| { - let start_key = if start_key.is_empty() { - None - } else { - Some(Key::from_raw(start_key)) - }; - let end_key = if end_key.is_empty() { - None - } else { - Some(Key::from_raw(end_key)) - }; - let mut prs = Progress::new( + let start_key = (!start_key.is_empty()).then_some(Key::from_raw(start_key)); + let end_key = (!end_key.is_empty()).then_some(Key::from_raw(end_key)); + let mut prs = Progress::new_with_range( endpoint.store_id, start_key, end_key, @@ -1326,7 +1510,7 @@ pub mod tests { let mut ranges = Vec::with_capacity(expect.len()); while ranges.len() != expect.len() { let n = (rand::random::() % 3) + 1; - let mut r = prs.forward(n); + let mut r = prs.forward(n, false); // The returned backup ranges should <= n assert!(r.len() <= n); @@ -1366,6 +1550,7 @@ pub mod tests { request: Request { start_key: start_key.to_vec(), end_key: end_key.to_vec(), + sub_ranges: Vec::new(), start_ts: 1.into(), end_ts: 1.into(), backend, @@ -1377,6 +1562,7 @@ pub mod tests { compression_type: CompressionType::Unknown, compression_level: 0, cipher: CipherInfo::default(), + replica_read: false, }, resp: tx, }; @@ -1432,12 +1618,298 @@ pub mod tests { } } + #[test] + fn test_backup_replica_read() { + let (_tmp, endpoint) = new_endpoint(); + + endpoint.region_info.add_region( + 1, + b"".to_vec(), + b"1".to_vec(), + metapb::PeerRole::Voter, + StateRole::Leader, + ); + endpoint.region_info.add_region( + 2, + b"1".to_vec(), + b"2".to_vec(), + metapb::PeerRole::Voter, + StateRole::Follower, + ); + endpoint.region_info.add_region( + 3, + b"2".to_vec(), + b"3".to_vec(), + 
metapb::PeerRole::Learner, + StateRole::Follower, + ); + + let tmp = TempDir::new().unwrap(); + let backend = make_local_backend(tmp.path()); + + let (tx, rx) = unbounded(); + let mut ranges = vec![]; + let key_range = KeyRange { + start_key: b"".to_vec(), + end_key: b"3".to_vec(), + ..Default::default() + }; + ranges.push(key_range); + let read_leader_task = Task { + request: Request { + start_key: b"1".to_vec(), + end_key: b"2".to_vec(), + sub_ranges: ranges.clone(), + start_ts: 1.into(), + end_ts: 1.into(), + backend: backend.clone(), + limiter: Limiter::new(f64::INFINITY), + cancel: Arc::default(), + is_raw_kv: false, + dst_api_ver: ApiVersion::V1, + cf: engine_traits::CF_DEFAULT, + compression_type: CompressionType::Unknown, + compression_level: 0, + cipher: CipherInfo::default(), + replica_read: false, + }, + resp: tx, + }; + endpoint.handle_backup_task(read_leader_task); + let resps: Vec<_> = block_on(rx.collect()); + assert_eq!(resps.len(), 1); + for a in &resps { + assert_eq!(a.get_start_key(), b""); + assert_eq!(a.get_end_key(), b"1"); + } + + let (tx, rx) = unbounded(); + let replica_read_task = Task { + request: Request { + start_key: b"".to_vec(), + end_key: b"3".to_vec(), + sub_ranges: ranges.clone(), + start_ts: 1.into(), + end_ts: 1.into(), + backend, + limiter: Limiter::new(f64::INFINITY), + cancel: Arc::default(), + is_raw_kv: false, + dst_api_ver: ApiVersion::V1, + cf: engine_traits::CF_DEFAULT, + compression_type: CompressionType::Unknown, + compression_level: 0, + cipher: CipherInfo::default(), + replica_read: true, + }, + resp: tx, + }; + endpoint.handle_backup_task(replica_read_task); + let resps: Vec<_> = block_on(rx.collect()); + let expected: Vec<(&[u8], &[u8])> = vec![(b"", b"1"), (b"1", b"2"), (b"2", b"3")]; + assert_eq!(resps.len(), 3); + for a in &resps { + assert!( + expected + .iter() + .any(|b| { a.get_start_key() == b.0 && a.get_end_key() == b.1 }), + "{:?} {:?}", + resps, + expected + ); + } + } + + #[test] + fn test_seek_ranges() 
{ + let (_tmp, endpoint) = new_endpoint(); + + endpoint.region_info.set_regions(vec![ + (b"".to_vec(), b"1".to_vec(), 1), + (b"1".to_vec(), b"2".to_vec(), 2), + (b"3".to_vec(), b"4".to_vec(), 3), + (b"7".to_vec(), b"9".to_vec(), 4), + (b"9".to_vec(), b"".to_vec(), 5), + ]); + // Test seek backup range. + let test_seek_backup_ranges = + |sub_ranges: Vec<(&[u8], &[u8])>, expect: Vec<(&[u8], &[u8])>| { + let mut ranges = Vec::with_capacity(sub_ranges.len()); + for &(start_key, end_key) in &sub_ranges { + let start_key = (!start_key.is_empty()).then_some(Key::from_raw(start_key)); + let end_key = (!end_key.is_empty()).then_some(Key::from_raw(end_key)); + ranges.push((start_key, end_key)); + } + let mut prs = Progress::new_with_ranges( + endpoint.store_id, + ranges, + endpoint.region_info.clone(), + KeyValueCodec::new(false, ApiVersion::V1, ApiVersion::V1), + engine_traits::CF_DEFAULT, + ); + + let mut ranges = Vec::with_capacity(expect.len()); + while ranges.len() != expect.len() { + let n = (rand::random::() % 3) + 1; + let mut r = prs.forward(n, false); + // The returned backup ranges should <= n + assert!(r.len() <= n); + + if r.is_empty() { + // if return a empty vec then the progress is finished + assert_eq!( + ranges.len(), + expect.len(), + "got {:?}, expect {:?}", + ranges, + expect + ); + } + ranges.append(&mut r); + } + + for (a, b) in ranges.into_iter().zip(expect) { + assert_eq!( + a.start_key.map_or_else(Vec::new, |k| k.into_raw().unwrap()), + b.0 + ); + assert_eq!( + a.end_key.map_or_else(Vec::new, |k| k.into_raw().unwrap()), + b.1 + ); + } + }; + + // Test whether responses contain correct range. 
+ #[allow(clippy::blocks_in_if_conditions)] + let test_handle_backup_task_ranges = + |sub_ranges: Vec<(&[u8], &[u8])>, expect: Vec<(&[u8], &[u8])>| { + let tmp = TempDir::new().unwrap(); + let backend = make_local_backend(tmp.path()); + let (tx, rx) = unbounded(); + + let mut ranges = Vec::with_capacity(sub_ranges.len()); + for &(start_key, end_key) in &sub_ranges { + let key_range = KeyRange { + start_key: start_key.to_vec(), + end_key: end_key.to_vec(), + ..Default::default() + }; + ranges.push(key_range); + } + let task = Task { + request: Request { + start_key: b"1".to_vec(), + end_key: b"2".to_vec(), + sub_ranges: ranges, + start_ts: 1.into(), + end_ts: 1.into(), + backend, + limiter: Limiter::new(f64::INFINITY), + cancel: Arc::default(), + is_raw_kv: false, + dst_api_ver: ApiVersion::V1, + cf: engine_traits::CF_DEFAULT, + compression_type: CompressionType::Unknown, + compression_level: 0, + cipher: CipherInfo::default(), + replica_read: false, + }, + resp: tx, + }; + endpoint.handle_backup_task(task); + let resps: Vec<_> = block_on(rx.collect()); + for a in &resps { + assert!( + expect + .iter() + .any(|b| { a.get_start_key() == b.0 && a.get_end_key() == b.1 }), + "{:?} {:?}", + resps, + expect + ); + } + assert_eq!(resps.len(), expect.len()); + }; + + // Backup range from case.0 to case.1, + // the case.2 is the expected results. 
+ type Case<'a> = (Vec<(&'a [u8], &'a [u8])>, Vec<(&'a [u8], &'a [u8])>); + + let case: Vec> = vec![ + ( + vec![(b"", b"1"), (b"1", b"2")], + vec![(b"", b"1"), (b"1", b"2")], + ), + ( + vec![(b"", b"2"), (b"3", b"4")], + vec![(b"", b"1"), (b"1", b"2"), (b"3", b"4")], + ), + ( + vec![(b"7", b"8"), (b"8", b"9")], + vec![(b"7", b"8"), (b"8", b"9")], + ), + ( + vec![(b"8", b"9"), (b"6", b"8")], + vec![(b"8", b"9"), (b"7", b"8")], + ), + ( + vec![(b"8", b"85"), (b"88", b"89"), (b"7", b"8")], + vec![(b"8", b"85"), (b"88", b"89"), (b"7", b"8")], + ), + ( + vec![(b"8", b"85"), (b"", b"35"), (b"88", b"89"), (b"7", b"8")], + vec![ + (b"8", b"85"), + (b"", b"1"), + (b"1", b"2"), + (b"3", b"35"), + (b"88", b"89"), + (b"7", b"8"), + ], + ), + (vec![(b"", b"1")], vec![(b"", b"1")]), + (vec![(b"", b"2")], vec![(b"", b"1"), (b"1", b"2")]), + (vec![(b"1", b"2")], vec![(b"1", b"2")]), + (vec![(b"1", b"3")], vec![(b"1", b"2")]), + (vec![(b"1", b"4")], vec![(b"1", b"2"), (b"3", b"4")]), + (vec![(b"4", b"5")], vec![]), + (vec![(b"4", b"6")], vec![]), + (vec![(b"4", b"6"), (b"6", b"7")], vec![]), + (vec![(b"2", b"3"), (b"4", b"6"), (b"6", b"7")], vec![]), + (vec![(b"2", b"7")], vec![(b"3", b"4")]), + (vec![(b"7", b"8")], vec![(b"7", b"8")]), + ( + vec![(b"3", b"")], + vec![(b"3", b"4"), (b"7", b"9"), (b"9", b"")], + ), + (vec![(b"5", b"")], vec![(b"7", b"9"), (b"9", b"")]), + (vec![(b"7", b"")], vec![(b"7", b"9"), (b"9", b"")]), + (vec![(b"8", b"91")], vec![(b"8", b"9"), (b"9", b"91")]), + (vec![(b"8", b"")], vec![(b"8", b"9"), (b"9", b"")]), + ( + vec![(b"", b"")], + vec![ + (b"", b"1"), + (b"1", b"2"), + (b"3", b"4"), + (b"7", b"9"), + (b"9", b""), + ], + ), + ]; + for (ranges, expect_ranges) in case { + test_seek_backup_ranges(ranges.clone(), expect_ranges.clone()); + test_handle_backup_task_ranges(ranges, expect_ranges); + } + } + #[test] fn test_handle_backup_task() { - let limiter = Arc::new(IORateLimiter::new_for_test()); + let limiter = Arc::new(IoRateLimiter::new_for_test()); 
let stats = limiter.statistics().unwrap(); - let (tmp, endpoint) = new_endpoint_with_limiter(Some(limiter), ApiVersion::V1, false); - let engine = endpoint.engine.clone(); + let (tmp, endpoint) = new_endpoint_with_limiter(Some(limiter), ApiVersion::V1, false, None); + let mut engine = endpoint.engine.clone(); endpoint .region_info @@ -1453,24 +1925,24 @@ pub mod tests { let commit = alloc_ts(); let key = format!("{}", i); must_prewrite_put( - &engine, + &mut engine, key.as_bytes(), &vec![i; *len], key.as_bytes(), start, ); - must_commit(&engine, key.as_bytes(), start, commit); + must_commit(&mut engine, key.as_bytes(), start, commit); backup_tss.push((alloc_ts(), len)); } } // flush to disk so that read requests can be traced by TiKV limiter. engine .get_rocksdb() - .flush_cf(engine_traits::CF_DEFAULT, true /*sync*/) + .flush_cf(engine_traits::CF_DEFAULT, true /* sync */) .unwrap(); engine .get_rocksdb() - .flush_cf(engine_traits::CF_WRITE, true /*sync*/) + .flush_cf(engine_traits::CF_WRITE, true /* sync */) .unwrap(); // TODO: check key number for each snapshot. @@ -1505,14 +1977,14 @@ pub mod tests { info!("{:?}", files); assert_eq!( files.len(), - file_len, /* default and write */ + file_len, // default and write "{:?}", resp ); let (none, _rx) = block_on(rx.into_future()); assert!(none.is_none(), "{:?}", none); - assert_eq!(stats.fetch(IOType::Export, IOOp::Write), 0); - assert_ne!(stats.fetch(IOType::Export, IOOp::Read), 0); + assert_eq!(stats.fetch(IoType::Export, IoOp::Write), 0); + assert_ne!(stats.fetch(IoType::Export, IoOp::Read), 0); } } @@ -1524,7 +1996,10 @@ pub mod tests { format!("k{:0>10}", idx) }; if api_ver == ApiVersion::V2 { - key.insert(0, RAW_KEY_PREFIX as char); + // [0, 0, 0] is the default key space id. 
+ let mut apiv2_key = [RAW_KEY_PREFIX, 0, 0, 0].to_vec(); + apiv2_key.extend(key.as_bytes()); + key = String::from_utf8(apiv2_key).unwrap(); } key } @@ -1543,10 +2018,10 @@ pub mod tests { }) } - fn generate_engine_test_value(user_value: String, api_ver: ApiVersion) -> Vec { + fn generate_engine_test_value(user_value: String, api_ver: ApiVersion, ttl: u64) -> Vec { let raw_value = RawValue { user_value: user_value.into_bytes(), - expire_ts: Some(u64::MAX), + expire_ts: Some(ttl), is_delete: false, }; dispatch_api_version!(api_ver, { @@ -1561,22 +2036,30 @@ pub mod tests { ) -> Key { if (cur_ver == ApiVersion::V1 || cur_ver == ApiVersion::V1ttl) && dst_ver == ApiVersion::V2 { - raw_key.insert(0, RAW_KEY_PREFIX as char); + // [0, 0, 0] is the default key space id. + let mut apiv2_key = [RAW_KEY_PREFIX, 0, 0, 0].to_vec(); + apiv2_key.extend(raw_key.as_bytes()); + raw_key = String::from_utf8(apiv2_key).unwrap(); } Key::from_encoded(raw_key.into_bytes()) } - fn test_handle_backup_raw_task_impl(cur_api_ver: ApiVersion, dst_api_ver: ApiVersion) -> bool { - let limiter = Arc::new(IORateLimiter::new_for_test()); + fn test_handle_backup_raw_task_impl( + cur_api_ver: ApiVersion, + dst_api_ver: ApiVersion, + test_ttl: bool, + ) -> bool { + let limiter = Arc::new(IoRateLimiter::new_for_test()); let stats = limiter.statistics().unwrap(); - let (tmp, endpoint) = new_endpoint_with_limiter(Some(limiter), cur_api_ver, true); + let (tmp, endpoint) = new_endpoint_with_limiter(Some(limiter), cur_api_ver, true, None); let engine = endpoint.engine.clone(); let start_key_idx: u64 = 100; let end_key_idx: u64 = 110; + let ttl_expire_cnt = 2; endpoint.region_info.set_regions(vec![( - vec![], //generate_test_raw_key(start_key_idx).into_bytes(), - vec![], //generate_test_raw_key(end_key_idx).into_bytes(), + vec![], // generate_test_raw_key(start_key_idx).into_bytes(), + vec![], // generate_test_raw_key(end_key_idx).into_bytes(), 1, )]); let ctx = Context::default(); @@ -1586,49 +2069,60 @@ pub 
mod tests { while i < end_key_idx { let key_str = generate_test_raw_key(i, cur_api_ver); let value_str = generate_test_raw_value(i, cur_api_ver); - let key = generate_engine_test_key(key_str.clone(), None, cur_api_ver); - let value = generate_engine_test_value(value_str.clone(), cur_api_ver); + let ttl = if test_ttl && i >= end_key_idx - ttl_expire_cnt { + 1 // let last `ttl_expire_cnt` value expired when backup + } else { + u64::MAX + }; + // engine do not append ts anymore, need write ts encoded key into engine. + let key = generate_engine_test_key(key_str.clone(), Some(i.into()), cur_api_ver); + let value = generate_engine_test_value(value_str.clone(), cur_api_ver, ttl); let dst_user_key = convert_test_backup_user_key(key_str, cur_api_ver, dst_api_ver); let dst_value = value_str.as_bytes(); - checksum = checksum_crc64_xor( - checksum, - digest.clone(), - dst_user_key.as_encoded(), - dst_value, - ); - let ret = engine.put(&ctx, key, value); - assert!(ret.is_ok()); + if ttl != 1 { + checksum = checksum_crc64_xor( + checksum, + digest.clone(), + dst_user_key.as_encoded(), + dst_value, + ); + } + engine.put(&ctx, key, value).unwrap(); i += 1; } // flush to disk so that read requests can be traced by TiKV limiter. engine .get_rocksdb() - .flush_cf(engine_traits::CF_DEFAULT, true /*sync*/) + .flush_cf(engine_traits::CF_DEFAULT, true /* sync */) .unwrap(); // TODO: check key number for each snapshot. stats.reset(); let mut req = BackupRequest::default(); let backup_start = if cur_api_ver == ApiVersion::V2 { - vec![RAW_KEY_PREFIX] + vec![RAW_KEY_PREFIX, 0, 0, 0] // key space id takes 3 bytes. } else { vec![] }; let backup_end = if cur_api_ver == ApiVersion::V2 { - vec![RAW_KEY_PREFIX + 1] + vec![RAW_KEY_PREFIX, 0, 0, 1] // [0, 0, 1] is the end of the file } else { vec![] }; let file_start = if dst_api_ver == ApiVersion::V2 { - vec![RAW_KEY_PREFIX] + vec![RAW_KEY_PREFIX, 0, 0, 0] // key space id takes 3 bytes. 
} else { vec![] }; let file_end = if dst_api_ver == ApiVersion::V2 { - vec![RAW_KEY_PREFIX + 1] + vec![RAW_KEY_PREFIX, 0, 0, 1] // [0, 0, 1] is the end of the file } else { vec![] }; + if test_ttl { + std::thread::sleep(Duration::from_secs(2)); // wait for ttl expired + } + let original_expire_cnt = BACKUP_RAW_EXPIRED_COUNT.get(); req.set_start_key(backup_start.clone()); req.set_end_key(backup_end.clone()); req.set_is_raw_kv(true); @@ -1648,14 +2142,26 @@ pub mod tests { assert!(resp.has_error()); return false; } + + let current_expire_cnt = BACKUP_RAW_EXPIRED_COUNT.get(); + let expect_expire_cnt = if test_ttl { + original_expire_cnt + ttl_expire_cnt + } else { + original_expire_cnt + }; + assert_eq!(expect_expire_cnt, current_expire_cnt); assert!(!resp.has_error(), "{:?}", resp); assert_eq!(resp.get_start_key(), backup_start); assert_eq!(resp.get_end_key(), backup_end); let file_len = 1; let files = resp.get_files(); info!("{:?}", files); - assert_eq!(files.len(), file_len /* default cf*/, "{:?}", resp); - assert_eq!(files[0].total_kvs, end_key_idx - start_key_idx); + let mut expect_cnt = end_key_idx - start_key_idx; + if test_ttl { + expect_cnt -= 2; + } + assert_eq!(files.len(), file_len /* default cf */, "{:?}", resp); + assert_eq!(files[0].total_kvs, expect_cnt); assert_eq!(files[0].crc64xor, checksum); assert_eq!(files[0].get_start_key(), file_start); assert_eq!(files[0].get_end_key(), file_end); @@ -1675,41 +2181,71 @@ pub mod tests { } as u64; assert_eq!( files[0].total_bytes, - (end_key_idx - start_key_idx - 1) * kv_backup_size + first_kv_backup_size + (expect_cnt - 1) * kv_backup_size + first_kv_backup_size ); let (none, _rx) = block_on(rx.into_future()); assert!(none.is_none(), "{:?}", none); - assert_eq!(stats.fetch(IOType::Export, IOOp::Write), 0); - assert_ne!(stats.fetch(IOType::Export, IOOp::Read), 0); + assert_eq!(stats.fetch(IoType::Export, IoOp::Write), 0); + assert_ne!(stats.fetch(IoType::Export, IoOp::Read), 0); true } #[test] fn 
test_handle_backup_raw() { - // (src_api_version, dst_api_version, result) + // (src_api_version, dst_api_version, test_ttl, result) let test_backup_cases = vec![ - (ApiVersion::V1, ApiVersion::V1, true), - (ApiVersion::V1ttl, ApiVersion::V1ttl, true), - (ApiVersion::V2, ApiVersion::V2, true), - (ApiVersion::V1, ApiVersion::V2, true), - (ApiVersion::V1ttl, ApiVersion::V2, true), - (ApiVersion::V1, ApiVersion::V1ttl, false), - (ApiVersion::V2, ApiVersion::V1, false), - (ApiVersion::V2, ApiVersion::V1ttl, false), - (ApiVersion::V1ttl, ApiVersion::V1, false), + (ApiVersion::V1, ApiVersion::V1, false, true), + (ApiVersion::V1ttl, ApiVersion::V1ttl, true, true), + (ApiVersion::V2, ApiVersion::V2, true, true), + (ApiVersion::V1, ApiVersion::V2, false, true), + (ApiVersion::V1ttl, ApiVersion::V2, false, true), + (ApiVersion::V1, ApiVersion::V1ttl, false, false), + (ApiVersion::V2, ApiVersion::V1, false, false), + (ApiVersion::V2, ApiVersion::V1ttl, false, false), + (ApiVersion::V1ttl, ApiVersion::V1, false, false), ]; - for test_case in test_backup_cases { + for (idx, (src_api, dst_api, test_ttl, result)) in test_backup_cases.into_iter().enumerate() + { assert_eq!( - test_handle_backup_raw_task_impl(test_case.0, test_case.1), - test_case.2 + test_handle_backup_raw_task_impl(src_api, dst_api, test_ttl), + result, + "case {}", + idx, ); } } + #[test] + fn test_backup_raw_apiv2_causal_ts() { + let limiter = Arc::new(IoRateLimiter::new_for_test()); + let ts_provider: Arc = + Arc::new(causal_ts::tests::TestProvider::default().into()); + let start_ts = block_on(ts_provider.async_get_ts()).unwrap(); + let (tmp, endpoint) = new_endpoint_with_limiter( + Some(limiter), + ApiVersion::V2, + true, + Some(ts_provider.clone()), + ); + + let mut req = BackupRequest::default(); + let (tx, _) = unbounded(); + let tmp1 = make_unique_dir(tmp.path()); + req.set_storage_backend(make_local_backend(&tmp1)); + req.set_start_key(b"r".to_vec()); + req.set_end_key(b"s".to_vec()); + 
req.set_is_raw_kv(true); + req.set_dst_api_version(ApiVersion::V2); + let (task, _) = Task::new(req, tx).unwrap(); + endpoint.handle_backup_task(task); + let end_ts = block_on(ts_provider.async_get_ts()).unwrap(); + assert_eq!(end_ts.into_inner(), start_ts.next().into_inner() + 101); + } + #[test] fn test_scan_error() { let (tmp, endpoint) = new_endpoint(); - let engine = endpoint.engine.clone(); + let mut engine = endpoint.engine.clone(); endpoint .region_info @@ -1720,7 +2256,7 @@ pub mod tests { let start = alloc_ts(); let key = format!("{}", start); must_prewrite_put( - &engine, + &mut engine, key.as_bytes(), key.as_bytes(), key.as_bytes(), @@ -1748,7 +2284,7 @@ pub mod tests { // Commit the perwrite. let commit = alloc_ts(); - must_commit(&engine, key.as_bytes(), start, commit); + must_commit(&mut engine, key.as_bytes(), start, commit); // Test whether it can correctly convert not leader to region error. engine.trigger_not_leader(); @@ -1774,7 +2310,7 @@ pub mod tests { #[test] fn test_cancel() { let (temp, mut endpoint) = new_endpoint(); - let engine = endpoint.engine.clone(); + let mut engine = endpoint.engine.clone(); endpoint .region_info @@ -1785,7 +2321,7 @@ pub mod tests { let start = alloc_ts(); let key = format!("{}", start); must_prewrite_put( - &engine, + &mut engine, key.as_bytes(), key.as_bytes(), key.as_bytes(), @@ -1793,7 +2329,7 @@ pub mod tests { ); // Commit the perwrite. let commit = alloc_ts(); - must_commit(&engine, key.as_bytes(), start, commit); + must_commit(&mut engine, key.as_bytes(), start, commit); let now = alloc_ts(); let mut req = BackupRequest::default(); @@ -1898,7 +2434,8 @@ pub mod tests { assert_eq!(responses.len(), 3, "{:?}", responses); // for testing whether dropping the pool before all tasks finished causes panic. - // but the panic must be checked manually... (It may panic at tokio runtime threads...) + // but the panic must be checked manually. 
(It may panic at tokio runtime + // threads) let mut pool = ControlThreadPool::new(); pool.adjust_with(1); pool.spawn(async { tokio::time::sleep(Duration::from_millis(100)).await }); @@ -1906,4 +2443,36 @@ pub mod tests { drop(pool); std::thread::sleep(Duration::from_millis(150)); } + + #[test] + fn test_backup_file_name() { + let region = metapb::Region::default(); + let store_id = 1; + let test_cases = vec!["s3", "local", "gcs", "azure", "hdfs"]; + let test_target = vec![ + "1/0_0_000", + "1/0_0_000", + "1_0_0_000", + "1_0_0_000", + "1_0_0_000", + ]; + + let delimiter = "_"; + for (storage_name, target) in test_cases.iter().zip(test_target.iter()) { + let key = Some(String::from("000")); + let filename = backup_file_name(store_id, ®ion, key, storage_name); + + let mut prefix_arr: Vec<&str> = filename.split(delimiter).collect(); + prefix_arr.remove(prefix_arr.len() - 1); + + assert_eq!(target.to_string(), prefix_arr.join(delimiter)); + } + + let test_target = vec!["1/0_0", "1/0_0", "1_0_0", "1_0_0", "1_0_0"]; + for (storage_name, target) in test_cases.iter().zip(test_target.iter()) { + let key = None; + let filename = backup_file_name(store_id, ®ion, key, storage_name); + assert_eq!(target.to_string(), filename); + } + } } diff --git a/components/backup/src/errors.rs b/components/backup/src/errors.rs index 4f290262c57..413f4ee77f9 100644 --- a/components/backup/src/errors.rs +++ b/components/backup/src/errors.rs @@ -24,7 +24,7 @@ impl From for ErrorPb { fn from(e: Error) -> ErrorPb { let mut err = ErrorPb::default(); match e { - Error::ClusterID { current, request } => { + Error::ClusterId { current, request } => { BACKUP_RANGE_ERROR_VEC .with_label_values(&["cluster_mismatch"]) .inc(); @@ -114,8 +114,8 @@ pub enum Error { EngineTrait(#[from] EngineTraitError), #[error("Transaction error {0}")] Txn(#[from] TxnError), - #[error("ClusterID error current {current}, request {request}")] - ClusterID { current: u64, request: u64 }, + #[error("ClusterId error current 
{current}, request {request}")] + ClusterId { current: u64, request: u64 }, #[error("Invalid cf {cf}")] InvalidCf { cf: String }, #[error("Failed to acquire the semaphore {0}")] diff --git a/components/backup/src/metrics.rs b/components/backup/src/metrics.rs index 2b92dc5b6b9..a24a1593e9f 100644 --- a/components/backup/src/metrics.rs +++ b/components/backup/src/metrics.rs @@ -58,4 +58,9 @@ lazy_static! { &["cf"], ) .unwrap(); + pub static ref BACKUP_RAW_EXPIRED_COUNT : IntCounter = register_int_counter!( + "tikv_backup_raw_expired_count", + "Total number of rawkv expired during scan", + ) + .unwrap(); } diff --git a/components/backup/src/service.rs b/components/backup/src/service.rs index 4d73dd0bb5f..237234c061e 100644 --- a/components/backup/src/service.rs +++ b/components/backup/src/service.rs @@ -2,27 +2,89 @@ use std::sync::atomic::*; +use engine_traits::{KvEngine, RaftEngine}; use futures::{channel::mpsc, FutureExt, SinkExt, StreamExt, TryFutureExt}; use grpcio::{self, *}; use kvproto::brpb::*; +use raftstore::store::{ + fsm::store::RaftRouter, + msg::{PeerMsg, SignificantMsg}, +}; use tikv_util::{error, info, worker::*}; use super::Task; /// Service handles the RPC messages for the `Backup` service. #[derive(Clone)] -pub struct Service { +pub struct Service { scheduler: Scheduler, + router: Option>, } -impl Service { - /// Create a new backup service. - pub fn new(scheduler: Scheduler) -> Service { - Service { scheduler } +impl Service +where + EK: KvEngine, + ER: RaftEngine, +{ + // Create a new backup service without router, this used for raftstore v2. + // because we don't have RaftStoreRouter any more. + pub fn new(scheduler: Scheduler) -> Self { + Service { + scheduler, + router: None, + } + } + + // Create a new backup service with router, this used for raftstore v1. 
+ pub fn with_router(scheduler: Scheduler, router: RaftRouter) -> Self { + Service { + scheduler, + router: Some(router), + } } } -impl Backup for Service { +impl Backup for Service +where + EK: KvEngine, + ER: RaftEngine, +{ + fn check_pending_admin_op( + &mut self, + ctx: RpcContext<'_>, + _req: CheckAdminRequest, + mut sink: ServerStreamingSink, + ) { + let (tx, rx) = mpsc::unbounded(); + match &self.router { + Some(router) => { + router.broadcast_normal(|| { + PeerMsg::SignificantMsg(SignificantMsg::CheckPendingAdmin(tx.clone())) + }); + let send_task = async move { + let mut s = rx.map(|resp| Ok((resp, WriteFlags::default()))); + sink.send_all(&mut s).await?; + sink.close().await?; + Ok(()) + } + .map(|res: Result<()>| match res { + Ok(_) => { + info!("check admin closed"); + } + Err(e) => { + error!("check admin canceled"; "error" => ?e); + } + }); + ctx.spawn(send_task); + } + None => { + // check pending admin reqeust is used for EBS Backup. + // for raftstore v2. we don't need it for now. 
so just return unimplemented + unimplemented_call!(ctx, sink) + } + } + } + fn backup( &mut self, ctx: RpcContext<'_>, @@ -81,6 +143,7 @@ impl Backup for Service { mod tests { use std::{sync::Arc, time::Duration}; + use engine_rocks::RocksEngine; use external_storage_export::make_local_backend; use tikv::storage::txn::tests::{must_commit, must_prewrite_put}; use tikv_util::worker::{dummy_scheduler, ReceiverWrapper}; @@ -92,7 +155,7 @@ mod tests { fn new_rpc_suite() -> (Server, BackupClient, ReceiverWrapper) { let env = Arc::new(EnvBuilder::new().build()); let (scheduler, rx) = dummy_scheduler(); - let backup_service = super::Service::new(scheduler); + let backup_service = super::Service::::new(scheduler); let builder = ServerBuilder::new(env.clone()).register_service(create_backup(backup_service)); let mut server = builder.bind("127.0.0.1", 0).build().unwrap(); @@ -109,7 +172,7 @@ mod tests { let (_server, client, mut rx) = new_rpc_suite(); let (tmp, endpoint) = new_endpoint(); - let engine = endpoint.engine.clone(); + let mut engine = endpoint.engine.clone(); endpoint.region_info.set_regions(vec![ (b"".to_vec(), b"2".to_vec(), 1), (b"2".to_vec(), b"5".to_vec(), 2), @@ -121,14 +184,14 @@ mod tests { let start = alloc_ts(); let key = format!("{}", i); must_prewrite_put( - &engine, + &mut engine, key.as_bytes(), key.as_bytes(), key.as_bytes(), start, ); let commit = alloc_ts(); - must_commit(&engine, key.as_bytes(), start, commit); + must_commit(&mut engine, key.as_bytes(), start, commit); } let now = alloc_ts(); diff --git a/components/backup/src/softlimit.rs b/components/backup/src/softlimit.rs index babc13326bd..c3a2fc7c796 100644 --- a/components/backup/src/softlimit.rs +++ b/components/backup/src/softlimit.rs @@ -89,9 +89,10 @@ impl SoftLimit { pub trait CpuStatistics { type Container: IntoIterator; // ThreadInfoStatistics needs &mut self to record the thread information. 
- // RefCell(internal mutability) would make SoftLimitByCpu !Sync, hence futures contains it become !Send (WHY?) - // Mutex would make this function async or blocking. - // Anyway, &mut here is acceptable, since SoftLimitByCpu won't be shared. (Even the &mut here is a little weird...) + // RefCell(internal mutability) would make SoftLimitByCpu !Sync, hence futures + // contains it become !Send (WHY?) Mutex would make this function async or + // blocking. Anyway, &mut here is acceptable, since SoftLimitByCpu won't be + // shared. (Even the &mut here is a little weird...) fn get_cpu_usages(&mut self) -> Self::Container; } @@ -119,7 +120,8 @@ impl SoftLimitByCpu { self.current_idle_exclude(|_| false) } - /// returns the current idle processor, ignoring threads with name matches the predicate. + /// returns the current idle processor, ignoring threads with name matches + /// the predicate. fn current_idle_exclude(&mut self, mut exclude: impl FnMut(&str) -> bool) -> f64 { let usages = self.metrics.get_cpu_usages(); let used = usages @@ -129,15 +131,17 @@ impl SoftLimitByCpu { self.total_time - used } - /// apply the limit to the soft limit according to the current CPU remaining. + /// apply the limit to the soft limit according to the current CPU + /// remaining. #[cfg(test)] pub async fn exec_over(&mut self, limit: &SoftLimit) -> Result<()> { self.exec_over_with_exclude(limit, |_| false).await } - /// apply the limit to the soft limit according to the current CPU remaining. - /// when calculating the CPU usage, ignore threads with name matched by the exclude predicate. - /// This would keep at least one thread working. + /// apply the limit to the soft limit according to the current CPU + /// remaining. when calculating the CPU usage, ignore threads with name + /// matched by the exclude predicate. This would keep at least one + /// thread working. 
#[cfg(test)] pub async fn exec_over_with_exclude( &mut self, diff --git a/components/backup/src/utils.rs b/components/backup/src/utils.rs index 4d01631817c..41af72e83d3 100644 --- a/components/backup/src/utils.rs +++ b/components/backup/src/utils.rs @@ -3,20 +3,22 @@ use std::sync::Arc; use api_version::{dispatch_api_version, ApiV2, KeyMode, KvFormat}; -use file_system::IOType; +use file_system::IoType; use futures::Future; use kvproto::kvrpcpb::ApiVersion; -use tikv_util::error; +use tikv_util::{error, sys::thread::ThreadBuildWrapper}; use tokio::{io::Result as TokioResult, runtime::Runtime}; use txn_types::{Key, TimeStamp}; use crate::{metrics::*, Result}; -// BACKUP_V1_TO_V2_TS is used as causal timestamp to backup RawKV api version V1/V1Ttl data and save to V2 format. -// Use 1 other than 0 because 0 is not a acceptable value for causal timestamp. See api_version::ApiV2::is_valid_ts. +// BACKUP_V1_TO_V2_TS is used as causal timestamp to backup RawKV api version +// V1/V1Ttl data and save to V2 format. Use 1 other than 0 because 0 is not a +// acceptable value for causal timestamp. See api_version::ApiV2::is_valid_ts. pub const BACKUP_V1_TO_V2_TS: u64 = 1; /// DaemonRuntime is a "background" runtime, which contains "daemon" tasks: -/// any task spawn into it would run until finish even the runtime isn't referenced. +/// any task spawn into it would run until finish even the runtime isn't +/// referenced. 
pub struct DaemonRuntime(Option); impl DaemonRuntime { @@ -90,11 +92,11 @@ pub fn create_tokio_runtime(thread_count: usize, thread_name: &str) -> TokioResu .thread_name(thread_name) .enable_io() .enable_time() - .on_thread_start(|| { + .after_start_wrapper(|| { tikv_alloc::add_thread_memory_accessor(); - file_system::set_io_type(IOType::Export); + file_system::set_io_type(IoType::Export); }) - .on_thread_stop(|| { + .before_stop_wrapper(|| { tikv_alloc::remove_thread_memory_accessor(); }) .worker_threads(thread_count) @@ -109,11 +111,12 @@ pub struct KeyValueCodec { } // Usage of the KeyValueCodec in backup process is as following: -// `new` -> `check_backup_api_version`, return false if not supported or input invalid. -// encode the backup range with `encode_backup_key` +// `new` -> `check_backup_api_version`, return false if not supported or input +// invalid. encode the backup range with `encode_backup_key` // In `backup_raw` process -> use `is_valid_raw_value` & // `convert_encoded_key_to_dst_version` & `convert_encoded_value_to_dst_version` -// In BackupResponse, call `decode_backup_key` & `convert_key_range_to_dst_version` +// In BackupResponse, call `decode_backup_key` & +// `convert_key_range_to_dst_version` impl KeyValueCodec { pub fn new(is_raw_kv: bool, cur_api_ver: ApiVersion, dst_api_ver: ApiVersion) -> Self { KeyValueCodec { @@ -141,18 +144,27 @@ impl KeyValueCodec { true } - // only the non-deleted, non-expired 'raw' key/value is valid. 
- pub fn is_valid_raw_value(&self, key: &[u8], value: &[u8], current_ts: u64) -> Result { + // only the non-deleted, non-expired 'raw' key/value is valid, return (is_valid, + // is_ttl_expired) + pub fn is_valid_raw_value( + &self, + key: &[u8], + value: &[u8], + current_ts: u64, + ) -> Result<(bool, bool)> { if !self.is_raw_kv { - return Ok(false); + return Ok((false, false)); } dispatch_api_version!(self.cur_api_ver, { let key_mode = API::parse_key_mode(key); if key_mode != KeyMode::Raw && key_mode != KeyMode::Unknown { - return Ok(false); + return Ok((false, false)); } let raw_value = API::decode_raw_value(value)?; - return Ok(raw_value.is_valid(current_ts)); + return Ok(( + raw_value.is_valid(current_ts), + raw_value.is_ttl_expired(current_ts), + )); }) } @@ -204,7 +216,8 @@ impl KeyValueCodec { }) } - // Input key is encoded key for rawkv apiv2 and txnkv. return the decode dst apiversion key. + // Input key is encoded key for rawkv apiv2 and txnkv. return the decode dst + // apiversion key. 
pub fn decode_backup_key(&self, key: Option) -> Result> { if key.is_none() { return Ok(vec![]); @@ -240,12 +253,14 @@ impl KeyValueCodec { &self, start_key: Vec, end_key: Vec, - ) -> (Vec, Vec) { + ) -> Result<(Vec, Vec)> { if !self.is_raw_kv { - return (start_key, end_key); + return Ok((start_key, end_key)); } dispatch_api_version!(self.dst_api_ver, { - API::convert_raw_user_key_range_version_from(self.cur_api_ver, start_key, end_key) + let (start, end) = + API::convert_raw_user_key_range_version_from(self.cur_api_ver, start_key, end_key)?; + Ok((start, end)) }) } } @@ -500,14 +515,14 @@ pub mod tests { ( ApiVersion::V1, ApiVersion::V2, - b"abc".to_vec(), - ApiV2::encode_raw_key_owned(b"rabc".to_vec(), ts), + [61, 62, 63].to_vec(), + ApiV2::encode_raw_key_owned([114, 0, 0, 0, 61, 62, 63].to_vec(), ts), ), ( ApiVersion::V1ttl, ApiVersion::V2, b"".to_vec(), - ApiV2::encode_raw_key_owned(b"r".to_vec(), ts), + ApiV2::encode_raw_key_owned([114, 0, 0, 0].to_vec(), ts), ), ]; @@ -526,6 +541,7 @@ pub mod tests { !codec .is_valid_raw_value(src_key, &deleted_encoded_value, 0) .unwrap() + .0 ); } for raw_value in &raw_values { @@ -550,4 +566,92 @@ pub mod tests { } } } + + #[test] + fn test_is_valid_raw_value() { + // api_version, key, value, expire_ts, is_delete, expect_valid, + // expect_ttl_expired + let test_cases = vec![ + ( + ApiVersion::V1, + b"m".to_vec(), + b"a".to_vec(), + 10_u64, + false, + true, + false, + ), + ( + ApiVersion::V2, + b"m".to_vec(), + b"a".to_vec(), + 10, + false, + false, + false, + ), + ( + ApiVersion::V1, + b"ra".to_vec(), + b"a".to_vec(), + 100, + true, + true, + false, + ), + ( + ApiVersion::V1ttl, + b"rz".to_vec(), + b"a".to_vec(), + 10, + true, + false, + true, + ), + ( + ApiVersion::V2, + b"ra".to_vec(), + b"a".to_vec(), + 10, + false, + false, + true, + ), + ( + ApiVersion::V2, + b"rz".to_vec(), + b"a".to_vec(), + 100, + true, + false, + false, + ), + ( + ApiVersion::V2, + b"rb".to_vec(), + b"a".to_vec(), + 100, + false, + true, + false, + 
), + ]; + + for (idx, (api_ver, key, value, expire_ts, is_delete, expect_valid, expect_ttl_expire)) in + test_cases.into_iter().enumerate() + { + let codec = KeyValueCodec::new(true, api_ver, api_ver); + let raw_value = RawValue { + user_value: value.clone(), + expire_ts: Some(expire_ts), + is_delete, + }; + let encoded_value = + dispatch_api_version!(api_ver, API::encode_raw_value_owned(raw_value)); + let (is_valid, ttl_expired) = + codec.is_valid_raw_value(&key, &encoded_value, 20).unwrap(); + assert_eq!(is_valid, expect_valid, "case {}", idx); + assert_eq!(ttl_expired, expect_ttl_expire, "case {}", idx); + } + } } diff --git a/components/backup/src/writer.rs b/components/backup/src/writer.rs index 8408fb7c002..715c4f68291 100644 --- a/components/backup/src/writer.rs +++ b/components/backup/src/writer.rs @@ -1,12 +1,11 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::{fmt::Display, io::Read, sync::Arc}; +use std::{fmt::Display, io::Read}; use encryption::{EncrypterReader, Iv}; -use engine_rocks::{raw::DB, RocksEngine, RocksSstWriter, RocksSstWriterBuilder}; use engine_traits::{ - CfName, ExternalSstFileInfo, SstCompressionType, SstWriter, SstWriterBuilder, CF_DEFAULT, - CF_WRITE, + CfName, ExternalSstFileInfo, KvEngine, SstCompressionType, SstExt, SstWriter, SstWriterBuilder, + CF_DEFAULT, CF_WRITE, }; use external_storage_export::{ExternalStorage, UnpinReader}; use file_system::Sha256Reader; @@ -26,9 +25,8 @@ use crate::{backup_file_name, metrics::*, utils::KeyValueCodec, Error, Result}; #[derive(Debug, Clone, Copy)] /// CfNameWrap wraps the CfName type. -/// For removing the 'static lifetime bound in the async function, -/// which doesn't compile due to 'captures lifetime that does not appear in bounds' :(. 
-/// see https://github.com/rust-lang/rust/issues/63033 +/// For removing the 'static lifetime bound in the async function, which doesn't +/// compile due to 'captures lifetime that does not appear in bounds', see https://github.com/rust-lang/rust/issues/63033 /// FIXME: remove this. pub struct CfNameWrap(pub &'static str); @@ -50,16 +48,16 @@ impl From for CfName { } } -struct Writer { - writer: RocksSstWriter, +struct Writer { + writer: W, total_kvs: u64, total_bytes: u64, checksum: u64, digest: crc64fast::Digest, } -impl Writer { - fn new(writer: RocksSstWriter) -> Self { +impl Writer { + fn new(writer: W) -> Self { Writer { writer, total_kvs: 0, @@ -99,9 +97,7 @@ impl Writer { Ok(()) } - // FIXME: we cannot get sst_info in [save_and_build_file], which may cause the !Send type - // [RocksEnternalSstFileInfo] sent between threads. - fn finish_read(writer: RocksSstWriter) -> Result<(u64, impl Read)> { + fn finish_read(writer: W) -> Result<(u64, impl Read)> { let (sst_info, sst_reader) = writer.finish_read()?; Ok((sst_info.file_size(), sst_reader)) } @@ -164,28 +160,28 @@ impl Writer { } } -pub struct BackupWriterBuilder { +pub struct BackupWriterBuilder { store_id: u64, limiter: Limiter, region: Region, - db: Arc, + db: EK, compression_type: Option, compression_level: i32, sst_max_size: u64, cipher: CipherInfo, } -impl BackupWriterBuilder { +impl BackupWriterBuilder { pub fn new( store_id: u64, limiter: Limiter, region: Region, - db: Arc, + db: EK, compression_type: Option, compression_level: i32, sst_max_size: u64, cipher: CipherInfo, - ) -> BackupWriterBuilder { + ) -> BackupWriterBuilder { Self { store_id, limiter, @@ -198,10 +194,10 @@ impl BackupWriterBuilder { } } - pub fn build(&self, start_key: Vec) -> Result { + pub fn build(&self, start_key: Vec, storage_name: &str) -> Result> { let key = file_system::sha256(&start_key).ok().map(hex::encode); let store_id = self.store_id; - let name = backup_file_name(store_id, &self.region, key); + let name = 
backup_file_name(store_id, &self.region, key, storage_name); BackupWriter::new( self.db.clone(), &name, @@ -215,37 +211,37 @@ impl BackupWriterBuilder { } /// A writer writes txn entries into SST files. -pub struct BackupWriter { +pub struct BackupWriter { name: String, - default: Writer, - write: Writer, + default: Writer<::SstWriter>, + write: Writer<::SstWriter>, limiter: Limiter, sst_max_size: u64, cipher: CipherInfo, } -impl BackupWriter { +impl BackupWriter { /// Create a new BackupWriter. pub fn new( - db: Arc, + db: EK, name: &str, compression_type: Option, compression_level: i32, limiter: Limiter, sst_max_size: u64, cipher: CipherInfo, - ) -> Result { - let default = RocksSstWriterBuilder::new() + ) -> Result> { + let default = ::SstWriterBuilder::new() .set_in_memory(true) .set_cf(CF_DEFAULT) - .set_db(RocksEngine::from_ref(&db)) + .set_db(&db) .set_compression_type(compression_type) .set_compression_level(compression_level) .build(name)?; - let write = RocksSstWriterBuilder::new() + let write = ::SstWriterBuilder::new() .set_in_memory(true) .set_cf(CF_WRITE) - .set_db(RocksEngine::from_ref(&db)) + .set_db(&db) .set_compression_type(compression_type) .set_compression_level(compression_level) .build(name)?; @@ -339,19 +335,19 @@ impl BackupWriter { } /// A writer writes Raw kv into SST files. -pub struct BackupRawKvWriter { +pub struct BackupRawKvWriter { name: String, cf: CfName, - writer: Writer, + writer: Writer<::SstWriter>, limiter: Limiter, cipher: CipherInfo, codec: KeyValueCodec, } -impl BackupRawKvWriter { +impl BackupRawKvWriter { /// Create a new BackupRawKvWriter. 
pub fn new( - db: Arc, + db: EK, name: &str, cf: CfNameWrap, limiter: Limiter, @@ -359,11 +355,11 @@ impl BackupRawKvWriter { compression_level: i32, cipher: CipherInfo, codec: KeyValueCodec, - ) -> Result { - let writer = RocksSstWriterBuilder::new() + ) -> Result> { + let writer = ::SstWriterBuilder::new() .set_in_memory(true) .set_cf(cf.into()) - .set_db(RocksEngine::from_ref(&db)) + .set_db(&db) .set_compression_type(compression_type) .set_compression_level(compression_level) .build(name)?; @@ -431,9 +427,9 @@ mod tests { use engine_traits::Iterable; use kvproto::encryptionpb; - use raftstore::store::util::new_peer; use tempfile::TempDir; use tikv::storage::TestEngineBuilder; + use tikv_util::store::new_peer; use txn_types::OldValue; use super::*; @@ -444,7 +440,7 @@ mod tests { let temp = TempDir::new().unwrap(); let rocks = TestEngineBuilder::new() .path(temp.path()) - .cfs(&[engine_traits::CF_DEFAULT, engine_traits::CF_WRITE]) + .cfs([engine_traits::CF_DEFAULT, engine_traits::CF_WRITE]) .build() .unwrap(); let db = rocks.get_rocksdb(); @@ -458,7 +454,7 @@ mod tests { } for (cf, kv) in kvs { let mut map = BTreeMap::new(); - db.scan_cf( + db.scan( cf, keys::DATA_MIN_KEY, keys::DATA_MAX_KEY, @@ -481,7 +477,7 @@ mod tests { let temp = TempDir::new().unwrap(); let rocks = TestEngineBuilder::new() .path(temp.path()) - .cfs(&[ + .cfs([ engine_traits::CF_DEFAULT, engine_traits::CF_LOCK, engine_traits::CF_WRITE, @@ -498,7 +494,7 @@ mod tests { r.set_id(1); r.mut_peers().push(new_peer(1, 1)); let mut writer = BackupWriter::new( - db.get_sync_db(), + db.clone(), "foo", None, 0, @@ -516,7 +512,7 @@ mod tests { // Test write only txn. let mut writer = BackupWriter::new( - db.get_sync_db(), + db.clone(), "foo1", None, 0, @@ -555,7 +551,7 @@ mod tests { // Test write and default. 
let mut writer = BackupWriter::new( - db.get_sync_db(), + db, "foo2", None, 0, diff --git a/components/batch-system/Cargo.toml b/components/batch-system/Cargo.toml index 03aabafe3ae..af57bbef930 100644 --- a/components/batch-system/Cargo.toml +++ b/components/batch-system/Cargo.toml @@ -8,20 +8,22 @@ default = ["test-runner"] test-runner = ["derive_more"] [dependencies] -collections = { path = "../collections" } +collections = { workspace = true } crossbeam = "0.8" derive_more = { version = "0.99", optional = true } fail = "0.5" -file_system = { path = "../file_system", default-features = false } +file_system = { workspace = true } +kvproto = { workspace = true } lazy_static = "1.3" -online_config = { path = "../online_config" } +online_config = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly"] } +resource_control = { workspace = true } serde = { version = "1.0", features = ["derive"] } serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_alloc = { path = "../tikv_alloc", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } +slog = { workspace = true } +slog-global = { workspace = true } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } [dev-dependencies] criterion = "0.3" diff --git a/components/batch-system/benches/batch-system.rs b/components/batch-system/benches/batch-system.rs index b4e3ffd03ac..9edf72f0ff9 100644 --- a/components/batch-system/benches/batch-system.rs +++ b/components/batch-system/benches/batch-system.rs @@ -20,7 +20,7 @@ fn end_hook(tx: &std::sync::mpsc::Sender<()>) -> Message { fn bench_spawn_many(c: &mut Criterion) { let (control_tx, control_fsm) = Runner::new(100000); let (router, mut system) = - 
batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); system.spawn("test".to_owned(), Builder::new()); const ID_LIMIT: u64 = 32; const MESSAGE_LIMIT: usize = 256; @@ -55,7 +55,7 @@ fn bench_spawn_many(c: &mut Criterion) { fn bench_imbalance(c: &mut Criterion) { let (control_tx, control_fsm) = Runner::new(100000); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); system.spawn("test".to_owned(), Builder::new()); const ID_LIMIT: u64 = 10; const MESSAGE_LIMIT: usize = 512; @@ -85,14 +85,14 @@ fn bench_imbalance(c: &mut Criterion) { system.shutdown(); } -/// Bench how it performs when scheduling a lot of quick tasks during an long-polling -/// tasks. +/// Bench how it performs when scheduling a lot of quick tasks during an +/// long-polling tasks. /// /// A good scheduling algorithm should not starve the quick tasks. 
fn bench_fairness(c: &mut Criterion) { let (control_tx, control_fsm) = Runner::new(100000); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); system.spawn("test".to_owned(), Builder::new()); let state_cnt = Arc::new(AtomicUsize::new(0)); for id in 0..10 { diff --git a/components/batch-system/benches/router.rs b/components/batch-system/benches/router.rs index 3dd7e282e15..e25ee58b94d 100644 --- a/components/batch-system/benches/router.rs +++ b/components/batch-system/benches/router.rs @@ -8,7 +8,7 @@ use criterion::*; fn bench_send(c: &mut Criterion) { let (control_tx, control_fsm) = Runner::new(100000); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); system.spawn("test".to_owned(), Builder::new()); let (normal_tx, normal_fsm) = Runner::new(100000); let normal_box = BasicMailbox::new(normal_tx, normal_fsm, Arc::default()); diff --git a/components/batch-system/src/batch.rs b/components/batch-system/src/batch.rs index 3f8d433aefd..19005ef2c43 100644 --- a/components/batch-system/src/batch.rs +++ b/components/batch-system/src/batch.rs @@ -1,9 +1,10 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -//! This is the core implementation of a batch system. Generally there will be two -//! different kind of FSMs in TiKV's FSM system. One is normal FSM, which usually -//! represents a peer, the other is control FSM, which usually represents something -//! that controls how the former is created or metrics are collected. +//! This is the core implementation of a batch system. Generally there will be +//! two different kind of FSMs in TiKV's FSM system. One is normal FSM, which +//! usually represents a peer, the other is control FSM, which usually +//! 
represents something that controls how the former is created or metrics are +//! collected. // #[PerformanceCriticalPath] use std::{ @@ -14,16 +15,23 @@ use std::{ time::Duration, }; -use crossbeam::channel::{self, SendError}; use fail::fail_point; -use file_system::{set_io_type, IOType}; -use tikv_util::{debug, error, info, mpsc, safe_panic, thd_name, time::Instant, warn}; +use file_system::{set_io_type, IoType}; +use resource_control::{ + channel::{unbounded, Receiver, Sender}, + ResourceController, +}; +use tikv_util::{ + debug, error, info, mpsc, safe_panic, sys::thread::StdThreadBuildWrapper, thd_name, + time::Instant, +}; use crate::{ config::Config, fsm::{Fsm, FsmScheduler, Priority}, mailbox::BasicMailbox, router::Router, + scheduler::{ControlScheduler, NormalScheduler}, }; /// A unify type for FSMs so that they can be sent to channel easily. @@ -33,60 +41,6 @@ pub enum FsmTypes { // Used as a signal that scheduler should be shutdown. Empty, } - -// A macro to introduce common definition of scheduler. -macro_rules! impl_sched { - ($name:ident, $ty:path, Fsm = $fsm:tt) => { - pub struct $name { - sender: channel::Sender>, - low_sender: channel::Sender>, - } - - impl Clone for $name { - #[inline] - fn clone(&self) -> $name { - $name { - sender: self.sender.clone(), - low_sender: self.low_sender.clone(), - } - } - } - - impl FsmScheduler for $name - where - $fsm: Fsm, - { - type Fsm = $fsm; - - #[inline] - fn schedule(&self, fsm: Box) { - let sender = match fsm.get_priority() { - Priority::Normal => &self.sender, - Priority::Low => &self.low_sender, - }; - match sender.send($ty(fsm)) { - Ok(()) => {} - // TODO: use debug instead. - Err(SendError($ty(fsm))) => warn!("failed to schedule fsm {:p}", fsm), - _ => unreachable!(), - } - } - - fn shutdown(&self) { - // TODO: close it explicitly once it's supported. - // Magic number, actually any number greater than poll pool size works. 
- for _ in 0..256 { - let _ = self.sender.send(FsmTypes::Empty); - let _ = self.low_sender.send(FsmTypes::Empty); - } - } - } - }; -} - -impl_sched!(NormalScheduler, FsmTypes::Normal, Fsm = N); -impl_sched!(ControlScheduler, FsmTypes::Control, Fsm = C); - pub struct NormalFsm { fsm: Box, timer: Instant, @@ -128,7 +82,7 @@ pub struct Batch { } impl Batch { - /// Create a a batch with given batch size. + /// Creates a batch with given batch size. pub fn with_capacity(cap: usize) -> Batch { Batch { normals: Vec::with_capacity(cap), @@ -159,15 +113,16 @@ impl Batch { self.control.take(); } - /// Put back the FSM located at index. + /// Releases the ownership of `fsm` so that it can be scheduled in another + /// poller. /// - /// Only when channel length is larger than `checked_len` will trigger - /// further notification. This function may fail if channel length is - /// larger than the given value before FSM is released. - fn release(&mut self, mut fsm: NormalFsm, checked_len: usize) -> Option> { + /// When pending messages of the FSM is different than `expected_len`, + /// attempts to schedule it in this poller again. Returns the `fsm` if the + /// re-scheduling succeeds. + fn release(&mut self, mut fsm: NormalFsm, expected_len: usize) -> Option> { let mailbox = fsm.take_mailbox().unwrap(); mailbox.release(fsm.fsm); - if mailbox.len() == checked_len { + if mailbox.len() == expected_len { None } else { match mailbox.take_fsm() { @@ -182,7 +137,7 @@ impl Batch { } } - /// Remove the normal FSM located at `index`. + /// Removes the normal FSM. /// /// This method should only be called when the FSM is stopped. /// If there are still messages in channel, the FSM is untouched and @@ -200,17 +155,11 @@ impl Batch { } } - /// Schedule the normal FSM located at `index`. - /// - /// If `inplace`, the relative position of all fsm will not be changed; otherwise, the fsm - /// will be popped and the last fsm will be swap in to reduce memory copy. 
- pub fn schedule(&mut self, router: &BatchRouter, index: usize, inplace: bool) { + /// Schedules the normal FSM located at `index`. + pub fn schedule(&mut self, router: &BatchRouter, index: usize) { let to_schedule = match self.normals[index].take() { Some(f) => f, None => { - if !inplace { - self.normals.swap_remove(index); - } return; } }; @@ -227,12 +176,19 @@ impl Batch { // failed to reschedule f.policy.take(); self.normals[index] = res; - } else if !inplace { + } + } + + /// Reclaims the slot storage if there is no FSM located at `index`. It will + /// alter the positions of some other FSMs with index larger than `index`. + #[inline] + pub fn swap_reclaim(&mut self, index: usize) { + if self.normals[index].is_none() { self.normals.swap_remove(index); } } - /// Same as `release`, but working on control FSM. + /// Same as [`release`], but works with control FSM. pub fn release_control(&mut self, control_box: &BasicMailbox, checked_len: usize) -> bool { let s = self.control.take().unwrap(); control_box.release(s); @@ -249,7 +205,7 @@ impl Batch { } } - /// Same as `remove`, but working on control FSM. + /// Same as [`remove`], but works with control FSM. pub fn remove_control(&mut self, control_box: &BasicMailbox) { if control_box.is_empty() { let s = self.control.take().unwrap(); @@ -260,14 +216,14 @@ impl Batch { /// The result for `PollHandler::handle_control`. pub enum HandleResult { - /// The Fsm still needs to be processed. + /// The FSM still needs to be handled in the next run. KeepProcessing, - /// The Fsm should stop at the progress. + /// The FSM should stop at the progress. StopAt { - /// The count of messages that have been acknowledged by handler. The fsm should be - /// released until new messages arrive. + /// The amount of messages acknowledged by the handler. The FSM + /// should be released unless new messages arrive. progress: usize, - /// Whether the fsm should be released before `end`. 
+ /// Whether the FSM should be passed in to `end` call. skip_end: bool, }, } @@ -279,9 +235,10 @@ impl HandleResult { } } -/// A handler that poll all FSM in ready. +/// A handler that polls all FSMs in ready. +/// +/// A general process works like the following: /// -/// A General process works like following: /// ```text /// loop { /// begin @@ -289,34 +246,34 @@ impl HandleResult { /// handle_control /// foreach ready normal: /// handle_normal +/// light_end /// end /// } /// ``` /// -/// Note that, every poll thread has its own handler, which doesn't have to be -/// Sync. +/// A [`PollHandler`] doesn't have to be [`Sync`] because each poll thread has +/// its own handler. pub trait PollHandler: Send + 'static { /// This function is called at the very beginning of every round. fn begin(&mut self, _batch_size: usize, update_cfg: F) where for<'a> F: FnOnce(&'a Config); - /// This function is called when handling readiness for control FSM. + /// This function is called when the control FSM is ready. + /// + /// If `Some(len)` is returned, this function will not be called again until + /// there are more than `len` pending messages in `control` FSM. /// - /// If returned value is Some, then it represents a length of channel. This - /// function will only be called for the same fsm after channel's lengh is - /// larger than the value. If it returns None, then this function will - /// still be called for the same FSM in the next loop unless the FSM is - /// stopped. + /// If `None` is returned, this function will be called again with the same + /// FSM `control` in the next round, unless it is stopped. fn handle_control(&mut self, control: &mut C) -> Option; - /// This function is called when handling readiness for normal FSM. - /// - /// The returned value is handled in the same way as `handle_control`. + /// This function is called when some normal FSMs are ready. 
fn handle_normal(&mut self, normal: &mut impl DerefMut) -> HandleResult; - /// This function is called after `handle_normal` is called for all fsm and before calling - /// `end`. The function is expected to run lightweight work. + /// This function is called after [`handle_normal`] is called for all FSMs + /// and before calling [`end`]. The function is expected to run lightweight + /// works. fn light_end(&mut self, _batch: &mut [Option>]) {} /// This function is called at the end of every round. @@ -334,7 +291,7 @@ pub trait PollHandler: Send + 'static { /// Internal poller that fetches batch and call handler hooks for readiness. pub struct Poller { pub router: Router, ControlScheduler>, - pub fsm_receiver: channel::Receiver>, + pub fsm_receiver: Receiver>, pub handler: Handler, pub max_batch_size: usize, pub reschedule_duration: Duration, @@ -378,7 +335,8 @@ impl> Poller { !batch.is_empty() } - // Poll for readiness and forward to handler. Remove stale peer if necessary. + /// Polls for readiness and forwards them to handler. Removes stale peers if + /// necessary. pub fn poll(&mut self) { fail_point!("poll"); let mut batch = Batch::with_capacity(self.max_batch_size); @@ -386,15 +344,16 @@ impl> Poller { let mut to_skip_end = Vec::with_capacity(self.max_batch_size); // Fetch batch after every round is finished. It's helpful to protect regions - // from becoming hungry if some regions are hot points. Since we fetch new fsm every time - // calling `poll`, we do not need to configure a large value for `self.max_batch_size`. + // from becoming hungry if some regions are hot points. Since we fetch new FSM + // every time calling `poll`, we do not need to configure a large value for + // `self.max_batch_size`. let mut run = true; while run && self.fetch_fsm(&mut batch) { - // If there is some region wait to be deal, we must deal with it even if it has overhead - // max size of batch. 
It's helpful to protect regions from becoming hungry - // if some regions are hot points. + // If there is some region wait to be deal, we must deal with it even if it has + // overhead max size of batch. It's helpful to protect regions from becoming + // hungry if some regions are hot points. let mut max_batch_size = std::cmp::max(self.max_batch_size, batch.normals.len()); - // update some online config if needed. + // Update some online config if needed. { // TODO: rust 2018 does not support capture disjoint field within a closure. // See https://github.com/rust-lang/rust/issues/53488 for more details. @@ -451,9 +410,11 @@ impl> Poller { if let Ok(fsm) = self.fsm_receiver.try_recv() { run = batch.push(fsm); } - // If we receive a ControlFsm, break this cycle and call `end`. Because ControlFsm - // may change state of the handler, we shall deal with it immediately after - // calling `begin` of `Handler`. + // When `fsm_cnt >= batch.normals.len()`: + // - No more FSMs in `fsm_receiver`. + // - We receive a control FSM. Break the loop because ControlFsm may change + // state of the handler, we shall deal with it immediately after calling + // `begin` of `Handler`. if !run || fsm_cnt >= batch.normals.len() { break; } @@ -472,17 +433,19 @@ impl> Poller { fsm_cnt += 1; } self.handler.light_end(&mut batch.normals); - for offset in &to_skip_end { - batch.schedule(&self.router, *offset, true); + for index in &to_skip_end { + batch.schedule(&self.router, *index); } to_skip_end.clear(); self.handler.end(&mut batch.normals); - // Because release use `swap_remove` internally, so using pop here - // to remove the correct FSM. - while let Some(r) = reschedule_fsms.pop() { - batch.schedule(&self.router, r, false); + // Iterate larger index first, so that `swap_reclaim` won't affect other FSMs + // in the list. 
+ for index in reschedule_fsms.iter().rev() { + batch.schedule(&self.router, *index); + batch.swap_reclaim(*index); } + reschedule_fsms.clear(); } if let Some(fsm) = batch.control.take() { self.router.control_scheduler.schedule(fsm); @@ -515,14 +478,14 @@ pub trait HandlerBuilder { /// A system that can poll FSMs concurrently and in batch. /// -/// To use the system, two type of FSMs and their PollHandlers need -/// to be defined: Normal and Control. Normal FSM handles the general -/// task while Control FSM creates normal FSM instances. +/// To use the system, two type of FSMs and their PollHandlers need to be +/// defined: Normal and Control. Normal FSM handles the general task while +/// Control FSM creates normal FSM instances. pub struct BatchSystem { name_prefix: Option, router: BatchRouter, - receiver: channel::Receiver>, - low_receiver: channel::Receiver>, + receiver: Receiver>, + low_receiver: Receiver>, pool_size: usize, max_batch_size: usize, workers: Arc>>>, @@ -581,9 +544,9 @@ where let props = tikv_util::thread_group::current_properties(); let t = thread::Builder::new() .name(name) - .spawn(move || { + .spawn_wrapper(move || { tikv_util::thread_group::set_properties(props); - set_io_type(IOType::ForegroundWrite); + set_io_type(IoType::ForegroundWrite); poller.poll(); }) .unwrap(); @@ -636,15 +599,15 @@ where } } -struct PoolStateBuilder { +struct PoolStateBuilder { max_batch_size: usize, reschedule_duration: Duration, - fsm_receiver: channel::Receiver>, - fsm_sender: channel::Sender>, + fsm_receiver: Receiver>, + fsm_sender: Sender>, pool_size: usize, } -impl PoolStateBuilder { +impl PoolStateBuilder { fn build>( self, name_prefix: String, @@ -670,11 +633,11 @@ impl PoolStateBuilder { } } -pub struct PoolState> { +pub struct PoolState> { pub name_prefix: String, pub handler_builder: H, - pub fsm_receiver: channel::Receiver>, - pub fsm_sender: channel::Sender>, + pub fsm_receiver: Receiver>, + pub fsm_sender: Sender>, pub low_priority_pool_size: usize, 
pub expected_pool_size: usize, pub workers: Arc>>>, @@ -688,37 +651,38 @@ pub type BatchRouter = Router, ControlSchedule /// Create a batch system with the given thread name prefix and pool size. /// -/// `sender` and `controller` should be paired. +/// `sender` and `controller` should be paired: all messages sent on the +/// `sender` will become available to the `controller`. pub fn create_system( cfg: &Config, sender: mpsc::LooseBoundedSender, controller: Box, + resource_ctl: Option>, ) -> (BatchRouter, BatchSystem) { let state_cnt = Arc::new(AtomicUsize::new(0)); let control_box = BasicMailbox::new(sender, controller, state_cnt.clone()); - let (tx, rx) = channel::unbounded(); - let (tx2, rx2) = channel::unbounded(); + let (sender, receiver) = unbounded(resource_ctl); + let (low_sender, low_receiver) = unbounded(None); // no resource control for low fsm let normal_scheduler = NormalScheduler { - sender: tx.clone(), - low_sender: tx2.clone(), + sender: sender.clone(), + low_sender, }; let control_scheduler = ControlScheduler { - sender: tx.clone(), - low_sender: tx2, + sender: sender.clone(), }; let pool_state_builder = PoolStateBuilder { max_batch_size: cfg.max_batch_size(), reschedule_duration: cfg.reschedule_duration.0, - fsm_receiver: rx.clone(), - fsm_sender: tx, + fsm_receiver: receiver.clone(), + fsm_sender: sender, pool_size: cfg.pool_size, }; let router = Router::new(control_box, normal_scheduler, control_scheduler, state_cnt); let system = BatchSystem { name_prefix: None, router: router.clone(), - receiver: rx, - low_receiver: rx2, + receiver, + low_receiver, pool_size: cfg.pool_size, max_batch_size: cfg.max_batch_size(), workers: Arc::new(Mutex::new(Vec::new())), diff --git a/components/batch-system/src/fsm.rs b/components/batch-system/src/fsm.rs index cee3a7b4020..3fa5ad15a64 100644 --- a/components/batch-system/src/fsm.rs +++ b/components/batch-system/src/fsm.rs @@ -10,46 +10,47 @@ use std::{ usize, }; -use crate::mailbox::BasicMailbox; +use 
resource_control::ResourceMetered; -// The FSM is notified. -const NOTIFYSTATE_NOTIFIED: usize = 0; -// The FSM is idle. -const NOTIFYSTATE_IDLE: usize = 1; -// The FSM is expected to be dropped. -const NOTIFYSTATE_DROP: usize = 2; +use crate::mailbox::BasicMailbox; -#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq)] pub enum Priority { Low, Normal, } -/// `FsmScheduler` schedules `Fsm` for later handles. +/// `FsmScheduler` schedules `Fsm` for later handling. pub trait FsmScheduler { type Fsm: Fsm; - /// Schedule a Fsm for later handles. + /// Schedule a Fsm for later handling. fn schedule(&self, fsm: Box); + /// Shutdown the scheduler, which indicates that resources like /// background thread pool should be released. fn shutdown(&self); + + /// Consume the resources of msg in resource controller if enabled, + /// otherwise do nothing. + fn consume_msg_resource(&self, msg: &::Message); } -/// A Fsm is a finite state machine. It should be able to be notified for +/// A `Fsm` is a finite state machine. It should be able to be notified for /// updating internal state according to incoming messages. -pub trait Fsm { - type Message: Send; +pub trait Fsm: Send + 'static { + type Message: Send + ResourceMetered; fn is_stopped(&self) -> bool; - /// Set a mailbox to Fsm, which should be used to send message to itself. + /// Set a mailbox to FSM, which should be used to send message to itself. fn set_mailbox(&mut self, _mailbox: Cow<'_, BasicMailbox>) where Self: Sized, { } - /// Take the mailbox from Fsm. Implementation should ensure there will be + + /// Take the mailbox from FSM. Implementation should ensure there will be /// no reference to mailbox after calling this method. fn take_mailbox(&mut self) -> Option> where @@ -63,17 +64,30 @@ pub trait Fsm { } } +/// A holder of FSM. +/// +/// There are three possible states: +/// +/// 1. NOTIFYSTATE_NOTIFIED: The FSM is taken by an external executor. `data` +/// holds a null pointer. 
+/// 2. NOTIFYSTATE_IDLE: No actor is using the FSM. `data` owns the FSM. +/// 3. NOTIFYSTATE_DROP: The FSM is dropped. `data` holds a null pointer. pub struct FsmState { status: AtomicUsize, data: AtomicPtr, + /// A counter shared with other `FsmState`s. state_cnt: Arc, } impl FsmState { + const NOTIFYSTATE_NOTIFIED: usize = 0; + const NOTIFYSTATE_IDLE: usize = 1; + const NOTIFYSTATE_DROP: usize = 2; + pub fn new(data: Box, state_cnt: Arc) -> FsmState { state_cnt.fetch_add(1, Ordering::Relaxed); FsmState { - status: AtomicUsize::new(NOTIFYSTATE_IDLE), + status: AtomicUsize::new(Self::NOTIFYSTATE_IDLE), data: AtomicPtr::new(Box::into_raw(data)), state_cnt, } @@ -82,8 +96,8 @@ impl FsmState { /// Take the fsm if it's IDLE. pub fn take_fsm(&self) -> Option> { let res = self.status.compare_exchange( - NOTIFYSTATE_IDLE, - NOTIFYSTATE_NOTIFIED, + Self::NOTIFYSTATE_IDLE, + Self::NOTIFYSTATE_NOTIFIED, Ordering::AcqRel, Ordering::Acquire, ); @@ -99,7 +113,7 @@ impl FsmState { } } - /// Notify fsm via a `FsmScheduler`. + /// Notifies FSM via a `FsmScheduler`. #[inline] pub fn notify>( &self, @@ -115,25 +129,25 @@ impl FsmState { } } - /// Put the owner back to the state. + /// Releases the FSM ownership back to this state. /// /// It's not required that all messages should be consumed before - /// releasing a fsm. However, a fsm is guaranteed to be notified only + /// releasing a FSM. However, a FSM is guaranteed to be notified only /// when new messages arrives after it's released. 
#[inline] pub fn release(&self, fsm: Box) { let previous = self.data.swap(Box::into_raw(fsm), Ordering::AcqRel); - let mut previous_status = NOTIFYSTATE_NOTIFIED; + let mut previous_status = Self::NOTIFYSTATE_NOTIFIED; if previous.is_null() { let res = self.status.compare_exchange( - NOTIFYSTATE_NOTIFIED, - NOTIFYSTATE_IDLE, + Self::NOTIFYSTATE_NOTIFIED, + Self::NOTIFYSTATE_IDLE, Ordering::AcqRel, Ordering::Acquire, ); previous_status = match res { Ok(_) => return, - Err(NOTIFYSTATE_DROP) => { + Err(Self::NOTIFYSTATE_DROP) => { let ptr = self.data.swap(ptr::null_mut(), Ordering::AcqRel); unsafe { Box::from_raw(ptr) }; return; @@ -144,18 +158,18 @@ impl FsmState { panic!("invalid release state: {:?} {}", previous, previous_status); } - /// Clear the fsm. + /// Clears the FSM. #[inline] pub fn clear(&self) { - match self.status.swap(NOTIFYSTATE_DROP, Ordering::AcqRel) { - NOTIFYSTATE_NOTIFIED | NOTIFYSTATE_DROP => return, + match self.status.swap(Self::NOTIFYSTATE_DROP, Ordering::AcqRel) { + Self::NOTIFYSTATE_NOTIFIED | Self::NOTIFYSTATE_DROP => return, _ => {} } let ptr = self.data.swap(ptr::null_mut(), Ordering::SeqCst); if !ptr.is_null() { unsafe { - Box::from_raw(ptr); + let _ = Box::from_raw(ptr); } } } diff --git a/components/batch-system/src/lib.rs b/components/batch-system/src/lib.rs index 9ca2953972d..2e59d42808c 100644 --- a/components/batch-system/src/lib.rs +++ b/components/batch-system/src/lib.rs @@ -6,6 +6,7 @@ mod fsm; mod mailbox; mod metrics; mod router; +mod scheduler; #[cfg(feature = "test-runner")] pub mod test_runner; @@ -16,7 +17,7 @@ pub use self::{ PollHandler, Poller, PoolState, }, config::Config, - fsm::{Fsm, Priority}, + fsm::{Fsm, FsmScheduler, Priority}, mailbox::{BasicMailbox, Mailbox}, router::Router, }; diff --git a/components/batch-system/src/mailbox.rs b/components/batch-system/src/mailbox.rs index 219edb2e2af..869031392af 100644 --- a/components/batch-system/src/mailbox.rs +++ b/components/batch-system/src/mailbox.rs @@ -13,12 
+13,21 @@ use crate::fsm::{Fsm, FsmScheduler, FsmState}; /// A basic mailbox. /// -/// Every mailbox should have one and only one owner, who will receive all -/// messages sent to this mailbox. +/// A mailbox holds an FSM owner, and the sending end of a channel to send +/// messages to that owner. Multiple producers share the same mailbox to +/// communicate with a FSM. /// -/// When a message is sent to a mailbox, its owner will be checked whether it's -/// idle. An idle owner will be scheduled via `FsmScheduler` immediately, which -/// will drive the fsm to poll for messages. +/// The mailbox's FSM owner needs to be scheduled to a [`Poller`] to handle its +/// pending messages. Therefore, the producer of messages also needs to provide +/// a channel to a poller ([`FsmScheduler`]), so that the mailbox can schedule +/// its FSM owner. When a message is sent to a mailbox, the mailbox will check +/// whether its FSM owner is idle, i.e. not already taken and scheduled. If the +/// FSM is idle, it will be scheduled immediately. By doing so, the mailbox +/// temporarily transfers its ownership of the FSM to the poller. The +/// implementation must make sure the same FSM is returned afterwards via the +/// [`release`] method. +/// +/// [`Poller`]: crate::batch::Poller pub struct BasicMailbox { sender: mpsc::LooseBoundedSender, state: Arc>, @@ -66,6 +75,7 @@ impl BasicMailbox { msg: Owner::Message, scheduler: &S, ) -> Result<(), SendError> { + scheduler.consume_msg_resource(&msg); self.sender.force_send(msg)?; self.state.notify(scheduler, Cow::Borrowed(self)); Ok(()) @@ -80,6 +90,7 @@ impl BasicMailbox { msg: Owner::Message, scheduler: &S, ) -> Result<(), TrySendError> { + scheduler.consume_msg_resource(&msg); self.sender.try_send(msg)?; self.state.notify(scheduler, Cow::Borrowed(self)); Ok(()) @@ -103,7 +114,7 @@ impl Clone for BasicMailbox { } } -/// A more high level mailbox. +/// A more high level mailbox that is paired with a [`FsmScheduler`]. 
pub struct Mailbox where Owner: Fsm, diff --git a/components/batch-system/src/metrics.rs b/components/batch-system/src/metrics.rs index 9edcd656bf4..a4728f32ad7 100644 --- a/components/batch-system/src/metrics.rs +++ b/components/batch-system/src/metrics.rs @@ -10,4 +10,11 @@ lazy_static! { &["type"] ) .unwrap(); + + pub static ref BROADCAST_NORMAL_DURATION: Histogram = + register_histogram!( + "tikv_broadcast_normal_duration_seconds", + "Duration of broadcasting normals.", + exponential_buckets(0.001, 1.59, 20).unwrap() // max 10s + ).unwrap(); } diff --git a/components/batch-system/src/router.rs b/components/batch-system/src/router.rs index 43067ecb202..119b7875506 100644 --- a/components/batch-system/src/router.rs +++ b/components/batch-system/src/router.rs @@ -12,12 +12,12 @@ use std::{ use collections::HashMap; use crossbeam::channel::{SendError, TrySendError}; -use tikv_util::{debug, info, lru::LruCache, Either}; +use tikv_util::{debug, info, lru::LruCache, time::Instant, Either}; use crate::{ fsm::{Fsm, FsmScheduler, FsmState}, mailbox::{BasicMailbox, Mailbox}, - metrics::CHANNEL_FULL_COUNTER_VEC, + metrics::*, }; /// A struct that traces the approximate memory usage of router. @@ -39,17 +39,20 @@ enum CheckDoResult { Valid(T), } -/// Router route messages to its target mailbox. -/// -/// Every fsm has a mailbox, hence it's necessary to have an address book -/// that can deliver messages to specified fsm, which is exact router. +/// Router routes messages to its target FSM's mailbox. /// /// In our abstract model, every batch system has two different kind of -/// fsms. First is normal fsm, which does the common work like peers in a -/// raftstore model or apply delegate in apply model. Second is control fsm, +/// FSMs. First is normal FSM, which does the common work like peers in a +/// raftstore model or apply delegate in apply model. 
Second is control FSM, /// which does some work that requires a global view of resources or creates -/// missing fsm for specified address. Normal fsm and control fsm can have -/// different scheduler, but this is not required. +/// missing FSM for specified address. +/// +/// There are one control FSM and multiple normal FSMs in a system. Each FSM +/// has its own mailbox. We maintain an address book to deliver messages to the +/// specified normal FSM. +/// +/// Normal FSM and control FSM can have different scheduler, but this is not +/// required. pub struct Router { normals: Arc>>, caches: Cell>>, @@ -60,8 +63,9 @@ pub struct Router { pub(crate) normal_scheduler: Ns, pub(crate) control_scheduler: Cs, - // Count of Mailboxes that is not destroyed. - // Added when a Mailbox created, and subtracted it when a Mailbox destroyed. + // Number of active mailboxes. + // Added when a mailbox is created, and subtracted it when a mailbox is + // destroyed. state_cnt: Arc, // Indicates the router is shutdown down or not. shutdown: Arc, @@ -170,6 +174,30 @@ where .store(normals.map.len(), Ordering::Relaxed); } + /// Same as send a message and then register the mailbox. + /// + /// The mailbox will not be registered if the message can't be sent. + pub fn send_and_register( + &self, + addr: u64, + mailbox: BasicMailbox, + msg: N::Message, + ) -> Result<(), (BasicMailbox, N::Message)> { + let mut normals = self.normals.lock().unwrap(); + // Send has to be done within lock, otherwise the message may be handled + // before the mailbox is register. 
+ if let Err(SendError(m)) = mailbox.force_send(msg, &self.normal_scheduler) { + return Err((mailbox, m)); + } + if let Some(mailbox) = normals.map.insert(addr, mailbox) { + mailbox.close(); + } + normals + .alive_cnt + .store(normals.map.len(), Ordering::Relaxed); + Ok(()) + } + pub fn register_all(&self, mailboxes: Vec<(u64, BasicMailbox)>) { let mut normals = self.normals.lock().unwrap(); normals.map.reserve(mailboxes.len()); @@ -198,7 +226,7 @@ where } } - /// Get the mailbox of control fsm. + /// Get the mailbox of control FSM. pub fn control_mailbox(&self) -> Mailbox { Mailbox::new(self.control_box.clone(), self.control_scheduler.clone()) } @@ -269,7 +297,7 @@ where } } - /// Force sending message to control fsm. + /// Sending message to control FSM. #[inline] pub fn send_control(&self, msg: C::Message) -> Result<(), TrySendError> { match self.control_box.try_send(msg, &self.control_scheduler) { @@ -284,15 +312,23 @@ where } } - /// Try to notify all normal fsm a message. + /// Force sending message to control FSM. + #[inline] + pub fn force_send_control(&self, msg: C::Message) -> Result<(), SendError> { + self.control_box.force_send(msg, &self.control_scheduler) + } + + /// Try to notify all normal FSMs a message. pub fn broadcast_normal(&self, mut msg_gen: impl FnMut() -> N::Message) { + let timer = Instant::now_coarse(); let mailboxes = self.normals.lock().unwrap(); for mailbox in mailboxes.map.values() { let _ = mailbox.force_send(msg_gen(), &self.normal_scheduler); } + BROADCAST_NORMAL_DURATION.observe(timer.saturating_elapsed_secs()); } - /// Try to notify all fsm that the cluster is being shutdown. + /// Try to notify all FSMs that the cluster is being shutdown. pub fn broadcast_shutdown(&self) { info!("broadcasting shutdown"); self.shutdown.store(true, Ordering::SeqCst); @@ -309,7 +345,7 @@ where /// Close the mailbox of address. 
pub fn close(&self, addr: u64) { - info!("[region {}] shutdown mailbox", addr); + info!("shutdown mailbox"; "region_id" => addr); unsafe { &mut *self.caches.as_ptr() }.remove(&addr); let mut mailboxes = self.normals.lock().unwrap(); if let Some(mb) = mailboxes.map.remove(&addr) { @@ -346,8 +382,8 @@ where let state_unit = mem::size_of::>(); // Every message in crossbeam sender needs 8 bytes to store state. let message_unit = mem::size_of::() + 8; - // crossbeam unbounded channel sender has a list of blocks. Every block has 31 unit - // and every sender has at least one sender. + // crossbeam unbounded channel sender has a list of blocks. Every block has 31 + // unit and every sender has at least one sender. let sender_block_unit = 31; RouterTrace { alive: (mailbox_unit * 8 / 7 // hashmap uses 7/8 of allocated memory. diff --git a/components/batch-system/src/scheduler.rs b/components/batch-system/src/scheduler.rs new file mode 100644 index 00000000000..723863249fb --- /dev/null +++ b/components/batch-system/src/scheduler.rs @@ -0,0 +1,105 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use crossbeam::channel::SendError; +use resource_control::channel::Sender; +use tikv_util::warn; + +use crate::{ + fsm::{Fsm, FsmScheduler, Priority}, + FsmTypes, +}; +pub struct NormalScheduler { + pub(crate) sender: Sender>, + pub(crate) low_sender: Sender>, +} + +impl Clone for NormalScheduler +where + N: Fsm, + C: Fsm, +{ + fn clone(&self) -> Self { + NormalScheduler { + sender: self.sender.clone(), + low_sender: self.low_sender.clone(), + } + } +} + +impl FsmScheduler for NormalScheduler +where + N: Fsm, + C: Fsm, +{ + type Fsm = N; + + fn consume_msg_resource(&self, msg: &::Message) { + self.sender.consume_msg_resource(msg); + } + + #[inline] + fn schedule(&self, fsm: Box) { + let sender = match fsm.get_priority() { + Priority::Normal => &self.sender, + Priority::Low => &self.low_sender, + }; + + match sender.send(FsmTypes::Normal(fsm), None) { + Ok(_) => {} + Err(SendError(FsmTypes::Normal(fsm))) => warn!("failed to schedule fsm {:p}", fsm), + _ => unreachable!(), + } + } + + fn shutdown(&self) { + // TODO: close it explicitly once it's supported. + // Magic number, actually any number greater than poll pool size works. + for _ in 0..256 { + let _ = self.sender.send(FsmTypes::Empty, None); + let _ = self.low_sender.send(FsmTypes::Empty, None); + } + } +} + +pub struct ControlScheduler { + pub(crate) sender: Sender>, +} + +impl Clone for ControlScheduler +where + N: Fsm, + C: Fsm, +{ + fn clone(&self) -> Self { + ControlScheduler { + sender: self.sender.clone(), + } + } +} + +impl FsmScheduler for ControlScheduler +where + N: Fsm, + C: Fsm, +{ + type Fsm = C; + + fn consume_msg_resource(&self, _msg: &::Message) {} + + #[inline] + fn schedule(&self, fsm: Box) { + match self.sender.send(FsmTypes::Control(fsm), None) { + Ok(_) => {} + Err(SendError(FsmTypes::Control(fsm))) => warn!("failed to schedule fsm {:p}", fsm), + _ => unreachable!(), + } + } + + fn shutdown(&self) { + // TODO: close it explicitly once it's supported. 
+ // Magic number, actually any number greater than poll pool size works. + for _ in 0..256 { + let _ = self.sender.send(FsmTypes::Empty, None); + } + } +} diff --git a/components/batch-system/src/test_runner.rs b/components/batch-system/src/test_runner.rs index 6be64d5d695..ad9c3f54d04 100644 --- a/components/batch-system/src/test_runner.rs +++ b/components/batch-system/src/test_runner.rs @@ -12,6 +12,7 @@ use std::{ }; use derive_more::{Add, AddAssign}; +use resource_control::{ResourceConsumeType, ResourceController, ResourceMetered}; use tikv_util::mpsc; use crate::*; @@ -22,6 +23,20 @@ pub enum Message { Loop(usize), /// `Runner` will call the callback directly. Callback(Box), + /// group name, write bytes + Resource(String, u64), +} + +impl ResourceMetered for Message { + fn consume_resource(&self, resource_ctl: &Arc) -> Option { + match self { + Message::Resource(group_name, bytes) => { + resource_ctl.consume(group_name.as_bytes(), ResourceConsumeType::IoBytes(*bytes)); + Some(group_name.to_owned()) + } + _ => None, + } + } } /// A simple runner used for benchmarking only. 
@@ -102,6 +117,7 @@ impl Handler { } } Ok(Message::Callback(cb)) => cb(self, r), + Ok(Message::Resource(..)) => {} Err(_) => break, } } diff --git a/components/batch-system/tests/cases/batch.rs b/components/batch-system/tests/cases/batch.rs index f950df68b8d..dc13affc363 100644 --- a/components/batch-system/tests/cases/batch.rs +++ b/components/batch-system/tests/cases/batch.rs @@ -7,13 +7,15 @@ use std::{ }; use batch_system::{test_runner::*, *}; +use kvproto::resource_manager::{GroupMode, GroupRawResourceSettings, ResourceGroup}; +use resource_control::ResourceGroupManager; use tikv_util::mpsc; #[test] fn test_batch() { let (control_tx, control_fsm) = Runner::new(10); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); let builder = Builder::new(); let metrics = builder.metrics.clone(); system.spawn("test".to_owned(), builder); @@ -55,7 +57,7 @@ fn test_batch() { fn test_priority() { let (control_tx, control_fsm) = Runner::new(10); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); let builder = Builder::new(); system.spawn("test".to_owned(), builder); let (tx, rx) = mpsc::unbounded(); @@ -101,3 +103,102 @@ fn test_priority() { .unwrap(); assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(3)); } + +#[test] +fn test_resource_group() { + let (control_tx, control_fsm) = Runner::new(10); + let resource_manager = ResourceGroupManager::default(); + + let get_group = |name: &str, read_tokens: u64, write_tokens: u64| -> ResourceGroup { + let mut group = ResourceGroup::new(); + group.set_name(name.to_string()); + group.set_mode(GroupMode::RawMode); + let mut resource_setting = GroupRawResourceSettings::new(); + resource_setting + .mut_cpu() + .mut_settings() + .set_fill_rate(read_tokens); + 
resource_setting + .mut_io_write() + .mut_settings() + .set_fill_rate(write_tokens); + group.set_raw_resource_settings(resource_setting); + group + }; + + resource_manager.add_resource_group(get_group("group1", 10, 10)); + resource_manager.add_resource_group(get_group("group2", 100, 100)); + + let mut cfg = Config::default(); + cfg.pool_size = 1; + let (router, mut system) = batch_system::create_system( + &cfg, + control_tx, + control_fsm, + Some(resource_manager.derive_controller("test".to_string(), false)), + ); + let builder = Builder::new(); + system.spawn("test".to_owned(), builder); + let (tx, rx) = mpsc::unbounded(); + let tx_ = tx.clone(); + let r = router.clone(); + let state_cnt = Arc::new(AtomicUsize::new(0)); + router + .send_control(Message::Callback(Box::new( + move |_: &Handler, _: &mut Runner| { + let (tx, runner) = Runner::new(10); + r.register(1, BasicMailbox::new(tx, runner, state_cnt.clone())); + let (tx2, runner2) = Runner::new(10); + r.register(2, BasicMailbox::new(tx2, runner2, state_cnt)); + tx_.send(0).unwrap(); + }, + ))) + .unwrap(); + assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(0)); + + let tx_ = tx.clone(); + let (tx1, rx1) = std::sync::mpsc::sync_channel(0); + // block the thread + router + .send_control(Message::Callback(Box::new( + move |_: &Handler, _: &mut Runner| { + tx_.send(0).unwrap(); + tx1.send(0).unwrap(); + }, + ))) + .unwrap(); + assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(0)); + + router + .send(1, Message::Resource("group1".to_string(), 1)) + .unwrap(); + let tx_ = tx.clone(); + router + .send( + 1, + Message::Callback(Box::new(move |_: &Handler, _: &mut Runner| { + tx_.send(1).unwrap(); + })), + ) + .unwrap(); + + router + .send(2, Message::Resource("group2".to_string(), 1)) + .unwrap(); + router + .send( + 2, + Message::Callback(Box::new(move |_: &Handler, _: &mut Runner| { + tx.send(2).unwrap(); + })), + ) + .unwrap(); + + // pause the blocking thread + 
assert_eq!(rx1.recv_timeout(Duration::from_secs(3)), Ok(0)); + + // should recv from group2 first, because group2 has more tokens and it would be + // handled with higher priority. + assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(2)); + assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(1)); +} diff --git a/components/batch-system/tests/cases/router.rs b/components/batch-system/tests/cases/router.rs index 543937fa8ef..d746dfad5cb 100644 --- a/components/batch-system/tests/cases/router.rs +++ b/components/batch-system/tests/cases/router.rs @@ -30,7 +30,7 @@ fn test_basic() { let (control_drop_tx, control_drop_rx) = mpsc::unbounded(); control_fsm.sender = Some(control_drop_tx); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); let builder = Builder::new(); system.spawn("test".to_owned(), builder); @@ -130,7 +130,7 @@ fn test_basic() { fn test_router_trace() { let (control_tx, control_fsm) = Runner::new(10); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); let builder = Builder::new(); system.spawn("test".to_owned(), builder); diff --git a/components/causal_ts/Cargo.toml b/components/causal_ts/Cargo.toml index 08027941f03..71af0419a68 100644 --- a/components/causal_ts/Cargo.toml +++ b/components/causal_ts/Cargo.toml @@ -4,30 +4,41 @@ version = "0.0.1" edition = "2018" publish = false +[features] +testexport = [] + [dependencies] -api_version = { path = "../api_version", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -error_code = { path = "../error_code", default-features = false } +api_version = { workspace = true } +async-trait = { version = "0.1" } +engine_rocks = { 
workspace = true } +engine_traits = { workspace = true } +enum_dispatch = "0.3.8" +error_code = { workspace = true } fail = "0.5" futures = { version = "0.3" } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } parking_lot = "0.12" -pd_client = { path = "../pd_client", default-features = false } +pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } +prometheus-static-metric = "0.5" raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raftstore = { path = "../raftstore", default-features = false } serde = "1.0" serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } +test_pd_client = { workspace = true } thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1", features = ["sync"] } -txn_types = { path = "../txn_types", default-features = false } +txn_types = { workspace = true } [dev-dependencies] -test_raftstore = { path = "../test_raftstore", default-features = false } +criterion = "0.3" + +[[bench]] +name = "tso" +path = "benches/tso.rs" +harness = false diff --git a/components/causal_ts/benches/tso.rs b/components/causal_ts/benches/tso.rs new file mode 100644 index 00000000000..f7e1980d15f --- /dev/null +++ b/components/causal_ts/benches/tso.rs @@ -0,0 +1,119 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{sync::Arc, time::Duration}; + +use causal_ts::{BatchTsoProvider, CausalTsProvider, TsoBatchList}; +use criterion::*; +use futures::executor::block_on; +use test_pd_client::TestPdClient; +use txn_types::TimeStamp; + +fn bench_batch_tso_list_pop(c: &mut Criterion) { + const CAPACITY: u64 = 10_000; + let cases = vec![("100", 100), ("10k", 10_000)]; // (id, batch_size) + + let bench_func = |b: &mut Bencher<'_>, batch_size: u64| { + let batch_list = TsoBatchList::new(CAPACITY as u32); + b.iter_batched( + || { + batch_list.flush(); + for i in 0..CAPACITY { + batch_list + .push(batch_size as u32, TimeStamp::compose(i, batch_size), false) + .unwrap(); + } + }, + |_| { + black_box(batch_list.pop(None).unwrap()); + }, + BatchSize::NumIterations(CAPACITY * batch_size), + ) + }; + + let mut group = c.benchmark_group("batch_tso_list_pop"); + for (id, batch_size) in cases { + group.bench_function(id, |b| { + bench_func(b, batch_size); + }); + } +} + +fn bench_batch_tso_list_push(c: &mut Criterion) { + const BATCH_SIZE: u64 = 8192; + let cases = vec![("50", 50), ("1024", 1024)]; // (id, capacity) + + let bench_func = |b: &mut Bencher<'_>, capacity: u64| { + let batch_list = TsoBatchList::new(capacity as u32); + let mut i = 0; + b.iter(|| { + i += 1; + black_box( + batch_list + .push( + BATCH_SIZE as u32, + TimeStamp::compose(i as u64, BATCH_SIZE), + false, + ) + .unwrap(), + ); + }) + }; + + let mut group = c.benchmark_group("batch_tso_list_push"); + for (id, capacity) in cases { + group.bench_function(id, |b| { + bench_func(b, capacity); + }); + } +} + +fn bench_batch_tso_provider_get_ts(c: &mut Criterion) { + let pd_cli = Arc::new(TestPdClient::new(1, false)); + + // Disable background renew by setting `renew_interval` to 0 to make test result + // stable. 
+ let provider = block_on(BatchTsoProvider::new_opt( + pd_cli, + Duration::ZERO, + Duration::from_secs(1), // cache_multiplier = 10 + 100, + 80000, + )) + .unwrap(); + + c.bench_function("bench_batch_tso_provider_get_ts", |b| { + b.iter(|| { + black_box(block_on(provider.async_get_ts()).unwrap()); + }) + }); +} + +fn bench_batch_tso_provider_flush(c: &mut Criterion) { + let pd_cli = Arc::new(TestPdClient::new(1, false)); + + // Disable background renew by setting `renew_interval` to 0 to make test result + // stable. + let provider = block_on(BatchTsoProvider::new_opt( + pd_cli, + Duration::ZERO, + Duration::from_secs(1), // cache_multiplier = 10 + 100, + 80000, + )) + .unwrap(); + + c.bench_function("bench_batch_tso_provider_flush", |b| { + b.iter(|| { + black_box(block_on(provider.async_flush())).unwrap(); + }) + }); +} + +criterion_group!( + benches, + bench_batch_tso_list_pop, + bench_batch_tso_list_push, + bench_batch_tso_provider_get_ts, + bench_batch_tso_provider_flush, +); +criterion_main!(benches); diff --git a/components/causal_ts/src/config.rs b/components/causal_ts/src/config.rs index a856b5b7358..17994344924 100644 --- a/components/causal_ts/src/config.rs +++ b/components/causal_ts/src/config.rs @@ -16,18 +16,41 @@ pub struct Config { /// The minimal renew batch size of BatchTsoProvider. /// /// Default is 100. - /// One TSO is required for every batch of Raft put messages, so by default 1K tso/s should be enough. - /// Benchmark showed that with a 8.6w raw_put per second, the TSO requirement is 600 per second. + /// One TSO is required for every batch of Raft put messages, so by default + /// 1K tso/s should be enough. Benchmark showed that with a 8.6w raw_put + /// per second, the TSO requirement is 600 per second. pub renew_batch_min_size: u32, + /// The maximum renew batch size of BatchTsoProvider. + /// + /// Default is 8192. + /// PD provides 262144 TSO per 50ms for the whole cluster. 
Exceed this space + /// will cause PD to sleep for 50ms, waiting for physical update + /// interval. The 50ms limitation can not be broken through now (see + /// `tso-update-physical-interval`). + pub renew_batch_max_size: u32, + /// The size (in duration) of TSO buffer allocated ahead for + /// BatchTsoProvider. + /// + /// Default is 3s. + /// The longer of the value will help to improve tolerance against PD + /// failure, but more overhead of `TsoBatchList` & pressure to TSO + /// service. + pub alloc_ahead_buffer: ReadableDuration, } impl Config { pub fn validate(&self) -> Result<(), Box> { if self.renew_interval.is_zero() { - return Err("causal-ts.renew_interval can't be zero".into()); + return Err("causal-ts.renew-interval can't be zero".into()); } if self.renew_batch_min_size == 0 { - return Err("causal-ts.renew_batch_init_size should be greater than 0".into()); + return Err("causal-ts.renew-batch-min-size should be greater than 0".into()); + } + if self.renew_batch_max_size == 0 { + return Err("causal-ts.renew-batch-max-size should be greater than 0".into()); + } + if self.alloc_ahead_buffer.is_zero() { + return Err("causal-ts.alloc-ahead-buffer can't be zero".into()); } Ok(()) } @@ -36,8 +59,14 @@ impl Config { impl Default for Config { fn default() -> Self { Self { - renew_interval: ReadableDuration::millis(crate::tso::TSO_BATCH_RENEW_INTERVAL_DEFAULT), - renew_batch_min_size: crate::tso::TSO_BATCH_MIN_SIZE_DEFAULT, + renew_interval: ReadableDuration::millis( + crate::tso::DEFAULT_TSO_BATCH_RENEW_INTERVAL_MS, + ), + renew_batch_min_size: crate::tso::DEFAULT_TSO_BATCH_MIN_SIZE, + renew_batch_max_size: crate::tso::DEFAULT_TSO_BATCH_MAX_SIZE, + alloc_ahead_buffer: ReadableDuration::millis( + crate::tso::DEFAULT_TSO_BATCH_ALLOC_AHEAD_BUFFER_MS, + ), } } } diff --git a/components/causal_ts/src/lib.rs b/components/causal_ts/src/lib.rs index 3507dc17926..ab57fbf734f 100644 --- a/components/causal_ts/src/lib.rs +++ b/components/causal_ts/src/lib.rs @@ -1,5 +1,7 
@@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +#![feature(div_duration)] + #[macro_use] extern crate tikv_util; @@ -10,22 +12,32 @@ pub use errors::*; mod tso; pub use tso::*; mod metrics; +use async_trait::async_trait; +use enum_dispatch::enum_dispatch; pub use metrics::*; -mod observer; -pub use observer::*; +#[cfg(any(test, feature = "testexport"))] +use test_pd_client::TestPdClient; use txn_types::TimeStamp; -use crate::errors::Result; - +pub use crate::errors::Result; /// Trait of causal timestamp provider. +#[async_trait] +#[enum_dispatch] pub trait CausalTsProvider: Send + Sync { /// Get a new timestamp. - fn get_ts(&self) -> Result; + async fn async_get_ts(&self) -> Result; - /// Flush (cached) timestamps to keep causality on some events, such as "leader transfer". - fn flush(&self) -> Result<()> { - Ok(()) - } + /// Flush (cached) timestamps and return first timestamp to keep causality + /// on some events, such as "leader transfer". + async fn async_flush(&self) -> Result; +} + +#[enum_dispatch(CausalTsProvider)] +pub enum CausalTsProviderImpl { + BatchTsoProvider(BatchTsoProvider), + #[cfg(any(test, feature = "testexport"))] + BatchTsoProviderTest(BatchTsoProvider), + TestProvider(tests::TestProvider), } pub mod tests { @@ -37,6 +49,7 @@ pub mod tests { use super::*; /// for TEST purpose. + #[derive(Clone)] pub struct TestProvider { ts: Arc, } @@ -50,9 +63,17 @@ pub mod tests { } } + #[async_trait] impl CausalTsProvider for TestProvider { - fn get_ts(&self) -> Result { + async fn async_get_ts(&self) -> Result { Ok(self.ts.fetch_add(1, Ordering::Relaxed).into()) } + + // This is used for unit test. Add 100 from current. + // Do not modify this value as several test cases depend on it. 
+ async fn async_flush(&self) -> Result { + self.ts.fetch_add(100, Ordering::Relaxed); + self.async_get_ts().await + } } } diff --git a/components/causal_ts/src/metrics.rs b/components/causal_ts/src/metrics.rs index 072f7325dc0..52f352ccfe5 100644 --- a/components/causal_ts/src/metrics.rs +++ b/components/causal_ts/src/metrics.rs @@ -2,6 +2,7 @@ use lazy_static::*; use prometheus::*; +use prometheus_static_metric::*; lazy_static! { pub static ref TS_PROVIDER_TSO_BATCH_SIZE: IntGauge = register_int_gauge!( @@ -20,7 +21,65 @@ lazy_static! { "tikv_causal_ts_provider_tso_batch_renew_duration_seconds", "Histogram of the duration of TSO batch renew", &["result", "reason"], - exponential_buckets(1e-6, 2.0, 20).unwrap() // 1us ~ 1s + exponential_buckets(1e-4, 2.0, 20).unwrap() // 0.1ms ~ 104s ) .unwrap(); + pub static ref TS_PROVIDER_TSO_BATCH_LIST_COUNTING: HistogramVec = register_histogram_vec!( + "tikv_causal_ts_provider_tso_batch_list_counting", + "Histogram of TSO batch list counting", + &["type"], + exponential_buckets(10.0, 2.0, 20).unwrap() // 10 ~ 10,000,000 + ) + .unwrap(); +} + +make_auto_flush_static_metric! { + pub label_enum TsoBatchRenewReason { + init, + background, + used_up, + flush, + } + + pub label_enum TsoBatchCountingKind { + tso_usage, + tso_remain, + new_batch_size, + } + + pub label_enum ResultKind { + ok, + err, + } + + pub struct TsProviderGetTsDurationVec: LocalHistogram { + "result" => ResultKind, + } + + pub struct TsoBatchRenewDurationVec: LocalHistogram { + "result" => ResultKind, + "reason" => TsoBatchRenewReason, + } + + pub struct TsoBatchListCountingVec: LocalHistogram { + "type" => TsoBatchCountingKind, + } +} + +impl From<&std::result::Result> for ResultKind { + #[inline] + fn from(res: &std::result::Result) -> Self { + if res.is_ok() { Self::ok } else { Self::err } + } +} + +lazy_static! 
{ + pub static ref TS_PROVIDER_GET_TS_DURATION_STATIC: TsProviderGetTsDurationVec = + auto_flush_from!(TS_PROVIDER_GET_TS_DURATION, TsProviderGetTsDurationVec); + pub static ref TS_PROVIDER_TSO_BATCH_RENEW_DURATION_STATIC: TsoBatchRenewDurationVec = auto_flush_from!( + TS_PROVIDER_TSO_BATCH_RENEW_DURATION, + TsoBatchRenewDurationVec + ); + pub static ref TS_PROVIDER_TSO_BATCH_LIST_COUNTING_STATIC: TsoBatchListCountingVec = + auto_flush_from!(TS_PROVIDER_TSO_BATCH_LIST_COUNTING, TsoBatchListCountingVec); } diff --git a/components/causal_ts/src/observer.rs b/components/causal_ts/src/observer.rs deleted file mode 100644 index c89d480eddd..00000000000 --- a/components/causal_ts/src/observer.rs +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. - -use std::sync::Arc; - -use api_version::{ApiV2, KeyMode, KvFormat}; -use engine_traits::KvEngine; -use kvproto::{ - metapb::Region, - raft_cmdpb::{CmdType, Request as RaftRequest}, -}; -use raft::StateRole; -use raftstore::{ - coprocessor, - coprocessor::{ - BoxQueryObserver, BoxRegionChangeObserver, BoxRoleObserver, Coprocessor, CoprocessorHost, - ObserverContext, QueryObserver, RegionChangeEvent, RegionChangeObserver, - RegionChangeReason, RoleChange, RoleObserver, - }, -}; - -use crate::CausalTsProvider; - -/// CausalObserver appends timestamp for RawKV V2 data, -/// and invoke causal_ts_provider.flush() on specified event, e.g. leader transfer, snapshot apply. -/// Should be used ONLY when API v2 is enabled. -pub struct CausalObserver { - causal_ts_provider: Arc, -} - -impl Clone for CausalObserver { - fn clone(&self) -> Self { - Self { - causal_ts_provider: self.causal_ts_provider.clone(), - } - } -} - -// Causal observer's priority should be higher than all other observers, to avoid being bypassed. 
-const CAUSAL_OBSERVER_PRIORITY: u32 = 0; - -impl CausalObserver { - pub fn new(causal_ts_provider: Arc) -> Self { - Self { causal_ts_provider } - } - - pub fn register_to(&self, coprocessor_host: &mut CoprocessorHost) { - coprocessor_host.registry.register_query_observer( - CAUSAL_OBSERVER_PRIORITY, - BoxQueryObserver::new(self.clone()), - ); - coprocessor_host - .registry - .register_role_observer(CAUSAL_OBSERVER_PRIORITY, BoxRoleObserver::new(self.clone())); - coprocessor_host.registry.register_region_change_observer( - CAUSAL_OBSERVER_PRIORITY, - BoxRegionChangeObserver::new(self.clone()), - ); - } -} - -const REASON_LEADER_TRANSFER: &str = "leader_transfer"; -const REASON_REGION_MERGE: &str = "region_merge"; - -impl CausalObserver { - fn flush_timestamp(&self, region: &Region, reason: &'static str) { - fail::fail_point!("causal_observer_flush_timestamp", |_| ()); - - if let Err(err) = self.causal_ts_provider.flush() { - warn!("CausalObserver::flush_timestamp error"; "error" => ?err, "region_id" => region.get_id(), "region" => ?region, "reason" => reason); - } else { - debug!("CausalObserver::flush_timestamp succeed"; "region_id" => region.get_id(), "region" => ?region, "reason" => reason); - } - } -} - -impl Coprocessor for CausalObserver {} - -impl QueryObserver for CausalObserver { - fn pre_propose_query( - &self, - ctx: &mut ObserverContext<'_>, - requests: &mut Vec, - ) -> coprocessor::Result<()> { - let region_id = ctx.region().get_id(); - let mut ts = None; - - for req in requests.iter_mut().filter(|r| { - r.get_cmd_type() == CmdType::Put - && ApiV2::parse_key_mode(r.get_put().get_key()) == KeyMode::Raw - }) { - if ts.is_none() { - ts = Some(self.causal_ts_provider.get_ts().map_err(|err| { - coprocessor::Error::Other(box_err!("Get causal timestamp error: {:?}", err)) - })?); - } - - ApiV2::append_ts_on_encoded_bytes(req.mut_put().mut_key(), ts.unwrap()); - trace!("CausalObserver::pre_propose_query, append_ts"; "region_id" => region_id, - "key" => 
&log_wrappers::Value::key(req.get_put().get_key()), "ts" => ?ts.unwrap()); - } - Ok(()) - } -} - -impl RoleObserver for CausalObserver { - /// Observe becoming leader, to flush CausalTsProvider. - fn on_role_change(&self, ctx: &mut ObserverContext<'_>, role_change: &RoleChange) { - // In scenario of frequent leader transfer, the observing of change from - // follower to leader by `on_role_change` would be later than the real role - // change in raft state and adjacent write commands. - // This would lead to the late of flush, and violate causality. See issue #12498. - // So we observe role change to Candidate to fix this issue. - // Also note that when there is only one peer, it would become leader directly. - if role_change.state == StateRole::Candidate - || (ctx.region().peers.len() == 1 && role_change.state == StateRole::Leader) - { - self.flush_timestamp(ctx.region(), REASON_LEADER_TRANSFER); - } - } -} - -impl RegionChangeObserver for CausalObserver { - fn on_region_changed( - &self, - ctx: &mut ObserverContext<'_>, - event: RegionChangeEvent, - role: StateRole, - ) { - if role != StateRole::Leader { - return; - } - - // In the scenario of region merge, the target region would merge some entries from source - // region with larger timestamps (when leader of source region is in another store with - // larger TSO batch than the store of target region's leader). - // So we need a flush after commit merge. See issue #12680. - // TODO: do not need flush if leaders of source & target region are in the same store. 
- if let RegionChangeEvent::Update(RegionChangeReason::CommitMerge) = event { - self.flush_timestamp(ctx.region(), REASON_REGION_MERGE); - } - } -} - -#[cfg(test)] -pub mod tests { - use std::{mem, sync::Arc, time::Duration}; - - use api_version::{ApiV2, KvFormat}; - use futures::executor::block_on; - use kvproto::{ - metapb::Region, - raft_cmdpb::{RaftCmdRequest, Request as RaftRequest}, - }; - use test_raftstore::TestPdClient; - use txn_types::{Key, TimeStamp}; - - use super::*; - use crate::BatchTsoProvider; - - fn init() -> CausalObserver> { - let pd_cli = Arc::new(TestPdClient::new(0, true)); - pd_cli.set_tso(100.into()); - let causal_ts_provider = - Arc::new(block_on(BatchTsoProvider::new_opt(pd_cli, Duration::ZERO, 100)).unwrap()); - CausalObserver::new(causal_ts_provider) - } - - #[test] - fn test_causal_observer() { - let testcases: Vec<&[&[u8]]> = vec![ - &[b"r\0a", b"r\0b"], - &[b"r\0c"], - &[b"r\0d", b"r\0e", b"r\0f"], - ]; - - let ob = init(); - let mut region = Region::default(); - region.set_id(1); - let mut ctx = ObserverContext::new(®ion); - - for (i, keys) in testcases.into_iter().enumerate() { - let mut cmd_req = RaftCmdRequest::default(); - - for key in keys { - let key = ApiV2::encode_raw_key(key, None); - let value = b"value".to_vec(); - let mut req = RaftRequest::default(); - req.set_cmd_type(CmdType::Put); - req.mut_put().set_key(key.into_encoded()); - req.mut_put().set_value(value); - - cmd_req.mut_requests().push(req); - } - - let query = cmd_req.mut_requests(); - let mut vec_query: Vec = mem::take(query).into(); - ob.pre_propose_query(&mut ctx, &mut vec_query).unwrap(); - *query = vec_query.into(); - - for req in cmd_req.get_requests() { - let key = Key::from_encoded_slice(req.get_put().get_key()); - let (_, ts) = ApiV2::decode_raw_key_owned(key, true).unwrap(); - assert_eq!(ts, Some(TimeStamp::from(i as u64 + 101))); - } - } - } -} diff --git a/components/causal_ts/src/tso.rs b/components/causal_ts/src/tso.rs index 
917353222fa..51f1824f7a6 100644 --- a/components/causal_ts/src/tso.rs +++ b/components/causal_ts/src/tso.rs @@ -1,13 +1,37 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +//! ## The algorithm to make the TSO cache tolerate failure of TSO service +//! +//! 1. The expected total size (in duration) of TSO cache is specified by +//! config item `causal-ts.alloc-ahead-buffer`. +//! +//! 2. Count usage of TSO on every renew interval. +//! +//! 3. Calculate `cache_multiplier` by `causal-ts.alloc-ahead-buffer / +//! causal-ts.renew-interval`. +//! +//! 4. Then `tso_usage x cache_multiplier` is the expected number of TSO should +//! be cached. +//! +//! 5. And `tso_usage x cache_multiplier - tso_remain` is the expected number of +//! TSO to be requested from TSO service (if it's not a flush). +//! +//! Others: +//! * `cache_multiplier` is also used as capacity of TSO batch list, as we +//! append an item to the list on every renew. + use std::{ + borrow::Borrow, + collections::BTreeMap, error, result, sync::{ - atomic::{AtomicU64, Ordering}, + atomic::{AtomicI32, AtomicU32, AtomicU64, Ordering}, Arc, }, }; +use async_trait::async_trait; +#[cfg(test)] use futures::executor::block_on; use parking_lot::RwLock; use pd_client::PdClient; @@ -28,93 +52,232 @@ use crate::{ CausalTsProvider, }; -// Renew on every 100ms, to adjust batch size rapidly enough. -pub(crate) const TSO_BATCH_RENEW_INTERVAL_DEFAULT: u64 = 100; -// Batch size on every renew interval. -// One TSO is required for every batch of Raft put messages, so by default 1K tso/s should be enough. -// Benchmark showed that with a 8.6w raw_put per second, the TSO requirement is 600 per second. -pub(crate) const TSO_BATCH_MIN_SIZE_DEFAULT: u32 = 100; -// Max batch size of TSO requests. Space of logical timestamp is 262144, -// exceed this space will cause PD to sleep, waiting for physical clock advance. 
-const TSO_BATCH_MAX_SIZE: u32 = 20_0000; - -const TSO_BATCH_RENEW_ON_INITIALIZE: &str = "init"; -const TSO_BATCH_RENEW_BY_BACKGROUND: &str = "background"; -const TSO_BATCH_RENEW_FOR_USED_UP: &str = "used-up"; -const TSO_BATCH_RENEW_FOR_FLUSH: &str = "flush"; +/// Renew on every 100ms, to adjust batch size rapidly enough. +pub(crate) const DEFAULT_TSO_BATCH_RENEW_INTERVAL_MS: u64 = 100; +/// Minimal batch size of TSO requests. This is an empirical value. +pub(crate) const DEFAULT_TSO_BATCH_MIN_SIZE: u32 = 100; +/// Maximum batch size of TSO requests. +/// As PD provides 262144 TSO per 50ms, conservatively set to 1/16 of 262144. +/// Exceed this space will cause PD to sleep for 50ms, waiting for physical +/// update interval. The 50ms limitation can not be broken through now (see +/// `tso-update-physical-interval`). +pub(crate) const DEFAULT_TSO_BATCH_MAX_SIZE: u32 = 8192; +/// Maximum available interval of TSO cache. +/// It means the duration that TSO we cache would be available despite failure +/// of PD. The longer of the value can provide better "High-Availability" +/// against PD failure, but more overhead of `TsoBatchList` & pressure to TSO +/// service. +pub(crate) const DEFAULT_TSO_BATCH_ALLOC_AHEAD_BUFFER_MS: u64 = 3000; +/// Just a limitation for safety, in case user specify a too big +/// `alloc_ahead_buffer`. +const MAX_TSO_BATCH_LIST_CAPACITY: u32 = 1024; /// TSO range: [(physical, logical_start), (physical, logical_end)) -#[derive(Default, Debug)] +#[derive(Debug)] struct TsoBatch { - size: u32, physical: u64, + logical_start: u64, logical_end: u64, // exclusive - logical_start: AtomicU64, + // current valid logical_tso offset, alloc_offset >= logical_end means + // the batch is exhausted. 
+ alloc_offset: AtomicU64, } impl TsoBatch { - pub fn pop(&self) -> Option { - let mut logical = self.logical_start.load(Ordering::Relaxed); - while logical < self.logical_end { - match self.logical_start.compare_exchange_weak( - logical, - logical + 1, - Ordering::Relaxed, - Ordering::Relaxed, - ) { - Ok(_) => return Some(TimeStamp::compose(self.physical, logical)), - Err(x) => logical = x, - } + pub fn pop(&self) -> Option<(TimeStamp, bool /* is_used_up */)> { + // alloc_offset might be far bigger than logical_end if the concurrency is + // *very* high, but it won't overflow in practice, so no need to do an + // extra load check here. + let ts = self.alloc_offset.fetch_add(1, Ordering::Relaxed); + if ts < self.logical_end { + return Some(( + TimeStamp::compose(self.physical, ts), + ts + 1 == self.logical_end, + )); } None } // `last_ts` is the last timestamp of the new batch. - pub fn renew(&mut self, batch_size: u32, last_ts: TimeStamp) -> Result<()> { - let (physical, logical) = (last_ts.physical(), last_ts.logical() + 1); - let logical_start = logical.checked_sub(batch_size as u64).unwrap(); + pub fn new(batch_size: u32, last_ts: TimeStamp) -> Self { + let (physical, logical_end) = (last_ts.physical(), last_ts.logical() + 1); + let logical_start = logical_end.checked_sub(batch_size as u64).unwrap(); + + Self { + physical, + logical_start, + logical_end, + alloc_offset: AtomicU64::new(logical_start), + } + } + + /// Number of remaining (available) TSO in the batch. + pub fn remain(&self) -> u32 { + self.logical_end + .saturating_sub(self.alloc_offset.load(Ordering::Relaxed)) as u32 + } + + /// The original start timestamp in the batch. + pub fn original_start(&self) -> TimeStamp { + TimeStamp::compose(self.physical, self.logical_start) + } + + /// The excluded end timestamp after the last in batch. + pub fn excluded_end(&self) -> TimeStamp { + TimeStamp::compose(self.physical, self.logical_end) + } +} + +/// `TsoBatchList` is a ordered list of `TsoBatch`. 
It aims to: +/// +/// 1. Cache more number of TSO to improve high availability. See issue #12794. +/// `TsoBatch` can only cache at most 262144 TSO as logical clock is 18 bits. +/// +/// 2. Fully utilize cached TSO when some regions require latest TSO (e.g. in +/// the scenario of leader transfer). Other regions without the requirement can +/// still use older TSO cache. +#[derive(Default, Debug)] +pub struct TsoBatchList { + inner: RwLock, + + /// Number of remaining (available) TSO. + /// Using signed integer for avoiding a wrap around huge value as it's not + /// precisely counted. + tso_remain: AtomicI32, + + /// Statistics of TSO usage. + tso_usage: AtomicU32, + + /// Length of batch list. It is used to limit size for efficiency, and keep + /// batches fresh. + capacity: u32, +} + +/// Inner data structure of batch list. +/// The reasons why `crossbeam_skiplist::SkipMap` is not chosen: +/// +/// 1. In `flush()` procedure, a reader of `SkipMap` can still acquire a batch +/// after the it is removed, which would violate the causality requirement. +/// The `RwLock` avoid this scenario by lock synchronization. +/// +/// 2. It is a scenario with much more reads than writes. The `RwLock` would not +/// be less efficient than lock free implementation. 
+type TsoBatchListInner = BTreeMap; + +impl TsoBatchList { + pub fn new(capacity: u32) -> Self { + Self { + capacity: std::cmp::min(capacity, MAX_TSO_BATCH_LIST_CAPACITY), + ..Default::default() + } + } + + pub fn remain(&self) -> u32 { + std::cmp::max(self.tso_remain.load(Ordering::Relaxed), 0) as u32 + } + + pub fn usage(&self) -> u32 { + self.tso_usage.load(Ordering::Relaxed) + } + + pub fn take_and_report_usage(&self) -> u32 { + let usage = self.tso_usage.swap(0, Ordering::Relaxed); + TS_PROVIDER_TSO_BATCH_LIST_COUNTING_STATIC + .tso_usage + .observe(usage as f64); + usage + } + + fn remove_batch(&self, key: u64) { + if let Some(batch) = self.inner.write().remove(&key) { + self.tso_remain + .fetch_sub(batch.remain() as i32, Ordering::Relaxed); + } + } + + /// Pop timestamp. + /// When `after_ts.is_some()`, it will pop timestamp larger that `after_ts`. + /// It is used for the scenario that some regions have causality + /// requirement (e.g. after transfer, the next timestamp of new leader + /// should be larger than the store where it is transferred from). + /// `after_ts` is included. + pub fn pop(&self, after_ts: Option) -> Option { + let inner = self.inner.read(); + let range = match after_ts { + Some(after_ts) => inner.range(&after_ts.into_inner()..), + None => inner.range(..), + }; + for (key, batch) in range { + if let Some((ts, is_used_up)) = batch.pop() { + let key = *key; + drop(inner); + self.tso_usage.fetch_add(1, Ordering::Relaxed); + self.tso_remain.fetch_sub(1, Ordering::Relaxed); + if is_used_up { + // Note: do NOT try to make it async. + // According to benchmark, `remove_batch` can be done in ~50ns, while async + // implemented by `Worker` costs ~1us. 
+ self.remove_batch(key); + } + return Some(ts); + } + } + None + } + + pub fn push(&self, batch_size: u32, last_ts: TimeStamp, need_flush: bool) -> Result { + let new_batch = TsoBatch::new(batch_size, last_ts); + + if let Some((_, last_batch)) = self.inner.read().iter().next_back() { + if new_batch.original_start() < last_batch.excluded_end() { + error!("timestamp fall back"; "batch_size" => batch_size, "last_ts" => ?last_ts, + "last_batch" => ?last_batch, "new_batch" => ?new_batch); + return Err(box_err!("timestamp fall back")); + } + } - if physical < self.physical - || (physical == self.physical && logical_start < self.logical_end) + let key = new_batch.original_start().into_inner(); { - error!("timestamp fall back"; "last_ts" => ?last_ts, "batch" => ?self, - "physical" => physical, "logical" => logical, "logical_start" => logical_start); - return Err(box_err!("timestamp fall back")); + // Hold the write lock until new batch is inserted. + // Otherwise a `pop()` would acquire the lock, meet no TSO available, and invoke + // renew request. + let mut inner = self.inner.write(); + if need_flush { + self.flush_internal(&mut inner); + } + + inner.insert(key, new_batch); + self.tso_remain + .fetch_add(batch_size as i32, Ordering::Relaxed); } - self.size = batch_size; - self.physical = physical; - self.logical_end = logical; - self.logical_start.store(logical_start, Ordering::Relaxed); - Ok(()) + // Remove items out of capacity limitation. + // Note: do NOT try to make it async. + // According to benchmark, `write().pop_first()` can be done in ~50ns, while + // async implemented by `Worker` costs ~1us. 
+ if self.inner.read().len() > self.capacity as usize { + if let Some((_, batch)) = self.inner.write().pop_first() { + self.tso_remain + .fetch_sub(batch.remain() as i32, Ordering::Relaxed); + } + } + + Ok(key) + } + + fn flush_internal(&self, inner: &mut TsoBatchListInner) { + inner.clear(); + self.tso_remain.store(0, Ordering::Relaxed); } - // Note: batch is "used up" in flush, and batch size will be enlarged in next renew. pub fn flush(&self) { - self.logical_start - .store(self.logical_end, Ordering::Relaxed); - } - - // Return None if TsoBatch is empty. - // Note that `logical_start` will be larger than `logical_end`. See `pop()`. - pub fn used_size(&self) -> Option { - if self.size > 0 { - Some( - self.size - .checked_sub( - self.logical_end - .saturating_sub(self.logical_start.load(Ordering::Relaxed)) - as u32, - ) - .unwrap(), - ) - } else { - None - } + let mut inner = self.inner.write(); + self.flush_internal(&mut inner); } } -/// MAX_RENEW_BATCH_SIZE is the batch size of TSO renew. It is an empirical value. +/// MAX_RENEW_BATCH_SIZE is the batch size of TSO renew. It is an empirical +/// value. const MAX_RENEW_BATCH_SIZE: usize = 64; type RenewError = Arc; @@ -125,53 +288,92 @@ struct RenewRequest { sender: oneshot::Sender, } +#[derive(Clone, Copy, Debug)] +struct RenewParameter { + batch_min_size: u32, + batch_max_size: u32, + // `cache_multiplier` indicates that times on usage of TSO it should cache. + // It is also used as capacity of `TsoBatchList`. 
+ cache_multiplier: u32, +} + pub struct BatchTsoProvider { pd_client: Arc, - batch: Arc>, - batch_min_size: u32, + batch_list: Arc, causal_ts_worker: Worker, renew_interval: Duration, - renew_request_tx: mpsc::Sender, + renew_parameter: RenewParameter, + renew_request_tx: Sender, +} + +impl std::fmt::Debug for BatchTsoProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("BatchTsoProvider") + .field("batch_list", &self.batch_list) + .field("renew_interval", &self.renew_interval) + .field("renew_parameter", &self.renew_parameter) + .finish() + } } impl BatchTsoProvider { pub async fn new(pd_client: Arc) -> Result { Self::new_opt( pd_client, - Duration::from_millis(TSO_BATCH_RENEW_INTERVAL_DEFAULT), - TSO_BATCH_MIN_SIZE_DEFAULT, + Duration::from_millis(DEFAULT_TSO_BATCH_RENEW_INTERVAL_MS), + Duration::from_millis(DEFAULT_TSO_BATCH_ALLOC_AHEAD_BUFFER_MS), + DEFAULT_TSO_BATCH_MIN_SIZE, + DEFAULT_TSO_BATCH_MAX_SIZE, ) .await } + #[allow(unused_mut)] + fn calc_cache_multiplier(mut renew_interval: Duration, alloc_ahead: Duration) -> u32 { + #[cfg(any(test, feature = "testexport"))] + if renew_interval.is_zero() { + // Should happen in test only. 
+ renew_interval = Duration::from_millis(DEFAULT_TSO_BATCH_RENEW_INTERVAL_MS); + } + alloc_ahead.div_duration_f64(renew_interval).ceil() as u32 + } + pub async fn new_opt( pd_client: Arc, renew_interval: Duration, + alloc_ahead: Duration, batch_min_size: u32, + batch_max_size: u32, ) -> Result { + let cache_multiplier = Self::calc_cache_multiplier(renew_interval, alloc_ahead); + let renew_parameter = RenewParameter { + batch_min_size, + batch_max_size, + cache_multiplier, + }; let (renew_request_tx, renew_request_rx) = mpsc::channel(MAX_RENEW_BATCH_SIZE); let s = Self { pd_client: pd_client.clone(), - batch: Arc::new(RwLock::new(TsoBatch::default())), - batch_min_size, - causal_ts_worker: WorkerBuilder::new("causal_ts_batch_tso_worker").create(), + batch_list: Arc::new(TsoBatchList::new(cache_multiplier)), + causal_ts_worker: WorkerBuilder::new("causal-ts-batch-tso-worker").create(), renew_interval, + renew_parameter, renew_request_tx, }; s.init(renew_request_rx).await?; Ok(s) } - async fn renew_tso_batch(&self, need_flush: bool, reason: &str) -> Result<()> { + async fn renew_tso_batch(&self, need_flush: bool, reason: TsoBatchRenewReason) -> Result<()> { Self::renew_tso_batch_internal(self.renew_request_tx.clone(), need_flush, reason).await } async fn renew_tso_batch_internal( renew_request_tx: Sender, need_flush: bool, - reason: &str, + reason: TsoBatchRenewReason, ) -> Result<()> { - let start = Instant::now(); + let start = Instant::now_coarse(); let (request, response) = oneshot::channel(); renew_request_tx .send(RenewRequest { @@ -185,60 +387,70 @@ impl BatchTsoProvider { .map_err(|_| box_err!("renew response channel is dropped")) .and_then(|r| r.map_err(|err| Error::BatchRenew(err))); - let label = if res.is_ok() { "ok" } else { "err" }; - TS_PROVIDER_TSO_BATCH_RENEW_DURATION - .with_label_values(&[label, reason]) + TS_PROVIDER_TSO_BATCH_RENEW_DURATION_STATIC + .get(res.borrow().into()) + .get(reason) .observe(start.saturating_elapsed_secs()); res } async fn 
renew_tso_batch_impl( pd_client: Arc, - tso_batch: Arc>, - batch_min_size: u32, + tso_batch_list: Arc, + renew_parameter: RenewParameter, need_flush: bool, ) -> Result<()> { - let new_batch_size = { - let batch = tso_batch.read(); - match batch.used_size() { - None => batch_min_size, - Some(used_size) => { - debug!("CachedTsoProvider::renew_tso_batch"; "batch before" => ?batch, "need_flush" => need_flush, "used size" => used_size); - Self::calc_new_batch_size(batch.size, used_size, batch_min_size) - } - } - }; - - match pd_client.batch_get_tso(new_batch_size).await { + let tso_remain = tso_batch_list.remain(); + let new_batch_size = + Self::calc_new_batch_size(tso_batch_list.clone(), renew_parameter, need_flush); + + TS_PROVIDER_TSO_BATCH_LIST_COUNTING_STATIC + .tso_remain + .observe(tso_remain as f64); + TS_PROVIDER_TSO_BATCH_LIST_COUNTING_STATIC + .new_batch_size + .observe(new_batch_size as f64); + + let res = match pd_client.batch_get_tso(new_batch_size).await { Err(err) => { - warn!("BatchTsoProvider::renew_tso_batch, pd_client.batch_get_tso error"; "error" => ?err, "need_flash" => need_flush); + warn!("BatchTsoProvider::renew_tso_batch, pd_client.batch_get_tso error"; + "new_batch_size" => new_batch_size, "error" => ?err, "need_flash" => need_flush); if need_flush { - let batch = tso_batch.write(); - batch.flush(); + tso_batch_list.flush(); } Err(err.into()) } Ok(ts) => { - { - let mut batch = tso_batch.write(); - batch.renew(new_batch_size, ts).map_err(|e| { + tso_batch_list + .push(new_batch_size, ts, need_flush) + .map_err(|e| { if need_flush { - batch.flush(); + tso_batch_list.flush(); } e })?; - debug!("BatchTsoProvider::renew_tso_batch"; "batch renew" => ?batch, "ts" => ?ts); - } - TS_PROVIDER_TSO_BATCH_SIZE.set(new_batch_size as i64); + debug!("BatchTsoProvider::renew_tso_batch"; + "tso_batch_list.remain" => tso_batch_list.remain(), "ts" => ?ts); + + // Should only be invoked after successful renew. 
Otherwise the TSO usage will + // be lost, and batch size requirement will be less than expected. Note that + // invoked here is not precise. There would be `get_ts()` before here after + // above `tso_batch_list.push()`, and make `tso_usage` a little bigger. This + // error is acceptable. + tso_batch_list.take_and_report_usage(); + Ok(()) } - } + }; + let total_batch_size = tso_batch_list.remain() + tso_batch_list.usage(); + TS_PROVIDER_TSO_BATCH_SIZE.set(total_batch_size as i64); + res } async fn renew_thread( pd_client: Arc, - tso_batch: Arc>, - batch_min_size: u32, + tso_batch_list: Arc, + renew_parameter: RenewParameter, mut rx: Receiver, ) { loop { @@ -267,8 +479,8 @@ impl BatchTsoProvider { let res = Self::renew_tso_batch_impl( pd_client.clone(), - tso_batch.clone(), - batch_min_size, + tso_batch_list.clone(), + renew_parameter, need_flush, ) .await @@ -283,28 +495,36 @@ impl BatchTsoProvider { } } - fn calc_new_batch_size(batch_size: u32, used_size: u32, batch_min_size: u32) -> u32 { - if used_size > batch_size * 3 / 4 { - // Enlarge to double if used more than 3/4. - std::cmp::min(batch_size << 1, TSO_BATCH_MAX_SIZE) - } else if used_size < batch_size / 4 { - // Shrink to half if used less than 1/4. - std::cmp::max(batch_size >> 1, batch_min_size) - } else { - batch_size + fn calc_new_batch_size( + tso_batch_list: Arc, + renew_parameter: RenewParameter, + need_flush: bool, + ) -> u32 { + // The expected number of TSO is `cache_multiplier` times on latest usage. + // Note: There is a `batch_max_size` limitation, so the request batch size will + // be less than expected, and will be fulfill in next renew. + // TODO: consider schedule TSO requests exceed `batch_max_size` limitation to + // fulfill requirement in time. 
+ let mut new_batch_size = tso_batch_list.usage() * renew_parameter.cache_multiplier; + if !need_flush { + new_batch_size = new_batch_size.saturating_sub(tso_batch_list.remain()) } + std::cmp::min( + std::cmp::max(new_batch_size, renew_parameter.batch_min_size), + renew_parameter.batch_max_size, + ) } async fn init(&self, renew_request_rx: Receiver) -> Result<()> { // Spawn renew thread. let pd_client = self.pd_client.clone(); - let tso_batch = self.batch.clone(); - let batch_min_size = self.batch_min_size; + let tso_batch_list = self.batch_list.clone(); + let renew_parameter = self.renew_parameter; self.causal_ts_worker.remote().spawn(async move { - Self::renew_thread(pd_client, tso_batch, batch_min_size, renew_request_rx).await; + Self::renew_thread(pd_client, tso_batch_list, renew_parameter, renew_request_rx).await; }); - self.renew_tso_batch(true, TSO_BATCH_RENEW_ON_INITIALIZE) + self.renew_tso_batch(true, TsoBatchRenewReason::init) .await?; let request_tx = self.renew_request_tx.clone(); @@ -314,7 +534,7 @@ impl BatchTsoProvider { let _ = Self::renew_tso_batch_internal( request_tx, false, - TSO_BATCH_RENEW_BY_BACKGROUND, + TsoBatchRenewReason::background, ) .await; } @@ -328,33 +548,49 @@ impl BatchTsoProvider { Ok(()) } - // Get current batch_size, for test purpose. - pub fn batch_size(&self) -> u32 { - self.batch.read().size + #[cfg(test)] + pub fn tso_remain(&self) -> u32 { + self.batch_list.remain() + } + + #[cfg(test)] + pub fn tso_usage(&self) -> u32 { + self.batch_list.usage() + } + + #[cfg(test)] + pub fn get_ts(&self) -> Result { + block_on(self.async_get_ts()) + } + + #[cfg(test)] + pub fn flush(&self) -> Result { + block_on(self.async_flush()) } } const GET_TS_MAX_RETRY: u32 = 3; +#[async_trait] impl CausalTsProvider for BatchTsoProvider { - fn get_ts(&self) -> Result { + // TODO: support `after_ts` argument. 
+ async fn async_get_ts(&self) -> Result { let start = Instant::now(); let mut retries = 0; let mut last_batch_size: u32; loop { { - let batch = self.batch.read(); - last_batch_size = batch.size; - match batch.pop() { + last_batch_size = self.batch_list.remain() + self.batch_list.usage(); + match self.batch_list.pop(None) { Some(ts) => { trace!("BatchTsoProvider::get_ts: {:?}", ts); - TS_PROVIDER_GET_TS_DURATION - .with_label_values(&["ok"]) + TS_PROVIDER_GET_TS_DURATION_STATIC + .ok .observe(start.saturating_elapsed_secs()); return Ok(ts); } None => { - warn!("BatchTsoProvider::get_ts, batch used up"; "batch.size" => batch.size, "retries" => retries); + warn!("BatchTsoProvider::get_ts, batch used up"; "last_batch_size" => last_batch_size, "retries" => retries); } } } @@ -362,23 +598,29 @@ impl CausalTsProvider for BatchTsoProvider { if retries >= GET_TS_MAX_RETRY { break; } - if let Err(err) = block_on(self.renew_tso_batch(false, TSO_BATCH_RENEW_FOR_USED_UP)) { - // `renew_tso_batch` failure is likely to be caused by TSO timeout, which would mean that PD is quite busy. - // So do not retry any more. + if let Err(err) = self + .renew_tso_batch(false, TsoBatchRenewReason::used_up) + .await + { + // `renew_tso_batch` failure is likely to be caused by TSO timeout, which would + // mean that PD is quite busy. So do not retry any more. 
error!("BatchTsoProvider::get_ts, renew_tso_batch fail on batch used-up"; "err" => ?err); break; } retries += 1; } - error!("BatchTsoProvider::get_ts, batch used up"; "batch.size" => last_batch_size, "retries" => retries); - TS_PROVIDER_GET_TS_DURATION - .with_label_values(&["err"]) + error!("BatchTsoProvider::get_ts, batch used up"; "last_batch_size" => last_batch_size, "retries" => retries); + TS_PROVIDER_GET_TS_DURATION_STATIC + .err .observe(start.saturating_elapsed_secs()); Err(Error::TsoBatchUsedUp(last_batch_size)) } - fn flush(&self) -> Result<()> { - block_on(self.renew_tso_batch(true, TSO_BATCH_RENEW_FOR_FLUSH)) + async fn async_flush(&self) -> Result { + self.renew_tso_batch(true, TsoBatchRenewReason::flush) + .await?; + // TODO: Return the first tso by renew_tso_batch instead of async_get_ts + self.async_get_ts().await } } @@ -394,73 +636,231 @@ impl SimpleTsoProvider { } } +#[async_trait] impl CausalTsProvider for SimpleTsoProvider { - fn get_ts(&self) -> Result { - let ts = block_on(self.pd_client.get_tso())?; + async fn async_get_ts(&self) -> Result { + let ts = self.pd_client.get_tso().await?; debug!("SimpleTsoProvider::get_ts"; "ts" => ?ts); Ok(ts) } + + async fn async_flush(&self) -> Result { + self.async_get_ts().await + } } #[cfg(test)] pub mod tests { - use test_raftstore::TestPdClient; + use futures::executor::block_on; + use test_pd_client::TestPdClient; use super::*; #[test] fn test_tso_batch() { - let mut batch = TsoBatch::default(); + let batch = TsoBatch::new(10, TimeStamp::compose(1, 100)); - assert_eq!(batch.used_size(), None); - assert_eq!(batch.pop(), None); - batch.flush(); + assert_eq!(batch.original_start(), TimeStamp::compose(1, 91)); + assert_eq!(batch.excluded_end(), TimeStamp::compose(1, 101)); + assert_eq!(batch.remain(), 10); - batch.renew(10, TimeStamp::compose(1, 100)).unwrap(); - for logical in 91..=95 { - assert_eq!(batch.pop(), Some(TimeStamp::compose(1, logical))); + for logical in 91..=93 { + assert_eq!(batch.pop(), 
Some((TimeStamp::compose(1, logical), false))); } - assert_eq!(batch.used_size(), Some(5)); + assert_eq!(batch.remain(), 7); - for logical in 96..=100 { - assert_eq!(batch.pop(), Some(TimeStamp::compose(1, logical))); + for logical in 94..=99 { + assert_eq!(batch.pop(), Some((TimeStamp::compose(1, logical), false))); } - assert_eq!(batch.used_size(), Some(10)); - assert_eq!(batch.pop(), None); - - batch.renew(10, TimeStamp::compose(1, 110)).unwrap(); - // timestamp fall back - assert!(batch.renew(10, TimeStamp::compose(1, 119)).is_err()); + assert_eq!(batch.remain(), 1); - batch.renew(10, TimeStamp::compose(1, 200)).unwrap(); - for logical in 191..=195 { - assert_eq!(batch.pop(), Some(TimeStamp::compose(1, logical))); - } - batch.flush(); - assert_eq!(batch.used_size(), Some(10)); + assert_eq!(batch.pop(), Some((TimeStamp::compose(1, 100), true))); assert_eq!(batch.pop(), None); + assert_eq!(batch.remain(), 0); } #[test] fn test_cals_new_batch_size() { + let cache_multiplier = 30; let cases = vec![ - (100, 0, 100), - (100, 76, 200), - (200, 49, 100), - (200, 50, 200), - (200, 150, 200), - (200, 151, 400), - (200, 200, 400), - (TSO_BATCH_MAX_SIZE, TSO_BATCH_MAX_SIZE, TSO_BATCH_MAX_SIZE), + (0, 0, true, 100), + (50, 0, true, 100), + (1000, 100, true, 3000), + ( + 1000, + DEFAULT_TSO_BATCH_MAX_SIZE, + true, + DEFAULT_TSO_BATCH_MAX_SIZE, + ), + (0, 0, false, 100), + (1000, 0, false, 100), + (1000, 100, false, 2000), + (5000, 100, false, 100), + ( + 1000, + DEFAULT_TSO_BATCH_MAX_SIZE, + false, + DEFAULT_TSO_BATCH_MAX_SIZE, + ), ]; - for (i, (batch_size, used_size, expected)) in cases.into_iter().enumerate() { - let new_size = - BatchTsoProvider::::calc_new_batch_size(batch_size, used_size, 100); + for (i, (remain, usage, need_flush, expected)) in cases.into_iter().enumerate() { + let batch_list = Arc::new(TsoBatchList { + inner: Default::default(), + tso_remain: AtomicI32::new(remain), + tso_usage: AtomicU32::new(usage), + capacity: cache_multiplier, + }); + let 
renew_parameter = RenewParameter { + batch_min_size: DEFAULT_TSO_BATCH_MIN_SIZE, + batch_max_size: DEFAULT_TSO_BATCH_MAX_SIZE, + cache_multiplier, + }; + let new_size = BatchTsoProvider::::calc_new_batch_size( + batch_list, + renew_parameter, + need_flush, + ); assert_eq!(new_size, expected, "case {}", i); } } + #[test] + fn test_tso_batch_list_basic() { + let batch_list = TsoBatchList::new(10); + + assert_eq!(batch_list.remain(), 0); + assert_eq!(batch_list.usage(), 0); + assert_eq!(batch_list.pop(None), None); + + batch_list + .push(10, TimeStamp::compose(1, 100), false) + .unwrap(); + assert_eq!(batch_list.remain(), 10); + assert_eq!(batch_list.usage(), 0); + + for logical in 91..=94 { + assert_eq!(batch_list.pop(None), Some(TimeStamp::compose(1, logical))); + } + assert_eq!(batch_list.remain(), 6); + assert_eq!(batch_list.usage(), 4); + + for logical in 95..=100 { + assert_eq!(batch_list.pop(None), Some(TimeStamp::compose(1, logical))); + } + assert_eq!(batch_list.remain(), 0); + assert_eq!(batch_list.usage(), 10); + assert_eq!(batch_list.pop(None), None); + assert_eq!(batch_list.remain(), 0); + assert_eq!(batch_list.usage(), 10); + + batch_list + .push(10, TimeStamp::compose(1, 110), false) + .unwrap(); + assert_eq!(batch_list.remain(), 10); + assert_eq!(batch_list.usage(), 10); + // timestamp fall back + batch_list + .push(10, TimeStamp::compose(1, 119), false) + .unwrap_err(); + batch_list + .push(10, TimeStamp::compose(1, 200), false) + .unwrap(); + assert_eq!(batch_list.remain(), 20); + assert_eq!(batch_list.usage(), 10); + + for logical in 101..=110 { + assert_eq!(batch_list.pop(None), Some(TimeStamp::compose(1, logical))); + } + for logical in 191..=195 { + assert_eq!(batch_list.pop(None), Some(TimeStamp::compose(1, logical))); + } + assert_eq!(batch_list.remain(), 5); + assert_eq!(batch_list.usage(), 25); + + batch_list.flush(); + assert_eq!(batch_list.pop(None), None); + assert_eq!(batch_list.remain(), 0); + 
assert_eq!(batch_list.take_and_report_usage(), 25); + assert_eq!(batch_list.usage(), 0); + + // need_flush + batch_list + .push(10, TimeStamp::compose(1, 300), false) + .unwrap(); + let key391 = batch_list + .push(10, TimeStamp::compose(1, 400), true) + .unwrap(); + assert_eq!(key391, TimeStamp::compose(1, 391).into_inner()); + assert_eq!(batch_list.remain(), 10); + assert_eq!(batch_list.usage(), 0); + + for logical in 391..=400 { + assert_eq!(batch_list.pop(None), Some(TimeStamp::compose(1, logical))); + } + assert_eq!(batch_list.remain(), 0); + assert_eq!(batch_list.usage(), 10); + } + + #[test] + fn test_tso_batch_list_max_batch_count() { + let batch_list = TsoBatchList::new(3); + + batch_list + .push(10, TimeStamp::compose(1, 100), false) + .unwrap(); // will be remove after the 4th push. + batch_list + .push(10, TimeStamp::compose(1, 200), false) + .unwrap(); + batch_list + .push(10, TimeStamp::compose(1, 300), false) + .unwrap(); + batch_list + .push(10, TimeStamp::compose(1, 400), false) + .unwrap(); + + for logical in 191..=195 { + assert_eq!(batch_list.pop(None), Some(TimeStamp::compose(1, logical))); + } + assert_eq!(batch_list.remain(), 25); + assert_eq!(batch_list.usage(), 5); + } + + #[test] + fn test_tso_batch_list_pop_after_ts() { + let batch_list = TsoBatchList::new(10); + + batch_list + .push(10, TimeStamp::compose(1, 100), false) + .unwrap(); + batch_list + .push(10, TimeStamp::compose(1, 200), false) + .unwrap(); + batch_list + .push(10, TimeStamp::compose(1, 300), false) + .unwrap(); + batch_list + .push(10, TimeStamp::compose(1, 400), false) + .unwrap(); + + let after_ts = TimeStamp::compose(1, 291); + for logical in 291..=300 { + assert_eq!( + batch_list.pop(Some(after_ts)), + Some(TimeStamp::compose(1, logical)) + ); + } + for logical in 391..=400 { + assert_eq!( + batch_list.pop(Some(after_ts)), + Some(TimeStamp::compose(1, logical)) + ); + } + assert_eq!(batch_list.pop(Some(after_ts)), None); + assert_eq!(batch_list.remain(), 20); + 
assert_eq!(batch_list.usage(), 20); + } + #[test] fn test_simple_tso_provider() { let pd_cli = Arc::new(TestPdClient::new(1, false)); @@ -468,7 +868,7 @@ pub mod tests { let provider = SimpleTsoProvider::new(pd_cli.clone()); pd_cli.set_tso(100.into()); - let ts = provider.get_ts().unwrap(); + let ts = block_on(provider.async_get_ts()).unwrap(); assert_eq!(ts, 101.into(), "ts: {:?}", ts); } @@ -477,49 +877,67 @@ pub mod tests { let pd_cli = Arc::new(TestPdClient::new(1, false)); pd_cli.set_tso(1000.into()); - // Set `renew_interval` to 0 to disable background renew. Invoke `flush()` to renew manually. - // allocated: [1001, 1100] + // Set `renew_interval` to 0 to disable background renew. Invoke `flush()` to + // renew manually. allocated: [1001, 1100] let provider = block_on(BatchTsoProvider::new_opt( pd_cli.clone(), Duration::ZERO, + Duration::from_secs(1), // cache_multiplier = 10 100, + 80000, )) .unwrap(); - assert_eq!(provider.batch_size(), 100); + assert_eq!(provider.tso_remain(), 100); + assert_eq!(provider.tso_usage(), 0); + for ts in 1001..=1010u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } + assert_eq!(provider.tso_remain(), 90); + assert_eq!(provider.tso_usage(), 10); - provider.flush().unwrap(); // allocated: [1101, 1200] - assert_eq!(provider.batch_size(), 100); + assert_eq!(provider.flush().unwrap(), TimeStamp::from(1101)); // allocated: [1101, 1200] + assert_eq!(provider.tso_remain(), 99); + assert_eq!(provider.tso_usage(), 1); // used up pd_cli.trigger_tso_failure(); // make renew fail to verify used-up - for ts in 1101..=1200u64 { + for ts in 1102..=1200u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } - assert!(provider.get_ts().is_err()); - - provider.flush().unwrap(); // allocated: [1201, 1400] - assert_eq!(provider.batch_size(), 200); - - // used < 20% - for ts in 1201..=1249u64 { + assert_eq!(provider.tso_remain(), 0); + assert_eq!(provider.tso_usage(), 100); + provider.get_ts().unwrap_err(); + 
assert_eq!(provider.tso_remain(), 0); + assert_eq!(provider.tso_usage(), 100); + + assert_eq!(provider.flush().unwrap(), TimeStamp::from(1201)); // allocated: [1201, 2200] + for ts in 1202..=1260u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } + assert_eq!(provider.tso_remain(), 940); + assert_eq!(provider.tso_usage(), 60); - provider.flush().unwrap(); // allocated: [1401, 1500] - assert_eq!(provider.batch_size(), 100); + // allocated: [2201, 2300] + block_on(provider.renew_tso_batch(false, TsoBatchRenewReason::background)).unwrap(); + assert_eq!(provider.tso_remain(), 1040); // 940 + 100 + assert_eq!(provider.tso_usage(), 0); pd_cli.trigger_tso_failure(); // make renew fail to verify used-up - for ts in 1401..=1500u64 { + for ts in 1261..=2300u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } - assert!(provider.get_ts().is_err()); + provider.get_ts().unwrap_err(); + assert_eq!(provider.tso_remain(), 0); + assert_eq!(provider.tso_usage(), 1040); // renew on used-up - for ts in 1501..=2500u64 { + for ts in 2301..=100_000u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } + // batch size: 10400, 80000, 80000 + // batch boundary: 2301, 12700, 92700, 100_000 + assert_eq!(provider.tso_remain(), 72700); + assert_eq!(provider.tso_usage(), 7300); } #[test] @@ -529,25 +947,27 @@ pub mod tests { { pd_cli.trigger_tso_failure(); - assert!( - block_on(BatchTsoProvider::new_opt( - pd_cli.clone(), - Duration::ZERO, - 100 - )) - .is_err() - ); + block_on(BatchTsoProvider::new_opt( + pd_cli.clone(), + Duration::ZERO, + Duration::from_secs(3), + 100, + 8192, + )) + .unwrap_err(); } - // Set `renew_interval` to 0 to disable background renew. Invoke `flush()` to renew manually. - // allocated: [1001, 1100] + // Set `renew_interval` to 0 to disable background renew. Invoke `flush()` to + // renew manually. 
allocated: [1001, 1100] let provider = block_on(BatchTsoProvider::new_opt( pd_cli.clone(), Duration::ZERO, + Duration::from_secs(1), // cache_multiplier=10 100, + 8192, )) .unwrap(); - assert_eq!(provider.batch_size(), 100); + assert_eq!(provider.tso_remain(), 100); for ts in 1001..=1010u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } @@ -557,23 +977,23 @@ pub mod tests { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } - assert!(provider.flush().is_err()); + provider.flush().unwrap_err(); for ts in 1101..=1300u64 { // renew on used-up, allocated: [1101, 1300] assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } pd_cli.trigger_tso_failure(); - assert!(provider.get_ts().is_err()); // renew fail on used-up + provider.get_ts().unwrap_err(); // renew fail on used-up pd_cli.trigger_tso_failure(); - assert!(provider.flush().is_err()); + provider.flush().unwrap_err(); - provider.flush().unwrap(); // allocated: [1301, 1700] + assert_eq!(provider.flush().unwrap(), TimeStamp::from(1301)); // allocated: [1301, 3300] pd_cli.trigger_tso_failure(); // make renew fail to verify used-up - for ts in 1301..=1700u64 { + for ts in 1302..=3300u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } - assert!(provider.get_ts().is_err()); + provider.get_ts().unwrap_err(); } } diff --git a/components/cdc/Cargo.toml b/components/cdc/Cargo.toml index f2e2dfd57ce..3dfbb402d2e 100644 --- a/components/cdc/Cargo.toml +++ b/components/cdc/Cargo.toml @@ -28,49 +28,51 @@ mem-profiling = ["tikv/mem-profiling"] failpoints = ["tikv/failpoints"] [dependencies] -api_version = { path = "../api_version" } +api_version = { workspace = true } bitflags = "1.0" -collections = { path = "../collections" } -concurrency_manager = { path = "../concurrency_manager", default-features = false } +causal_ts = { workspace = true } +collections = { workspace = true } +concurrency_manager = { workspace = true } crossbeam = "0.8" -engine_rocks = { path = 
"../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } fail = "0.5" futures = "0.3" futures-timer = "3.0" getset = "0.1" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } -keys = { path = "../keys" } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +grpcio = { workspace = true } +keys = { workspace = true } +kvproto = { workspace = true } lazy_static = "1.3" -log_wrappers = { path = "../log_wrappers" } -online_config = { path = "../online_config" } -pd_client = { path = "../pd_client", default-features = false } +log_wrappers = { workspace = true } +online_config = { workspace = true } +pd_client = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raftstore = { path = "../raftstore", default-features = false } -resolved_ts = { path = "../resolved_ts", default-features = false } -security = { path = "../security", default-features = false } +raftstore = { workspace = true } +resolved_ts = { workspace = true } +security = { workspace = true } semver = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } thiserror = "1.0" -tikv = { path = "../..", default-features = false } -tikv_kv = { path = "../tikv_kv", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } +tikv = { workspace = true } +tikv_kv = { workspace = true } +tikv_util = { workspace = true } tokio = 
{ version = "1.5", features = ["rt-multi-thread", "time"] } -txn_types = { path = "../txn_types", default-features = false } +txn_types = { workspace = true } [dev-dependencies] criterion = "0.3" -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } tempfile = "3.0" -test_raftstore = { path = "../test_raftstore", default-features = false } -test_util = { path = "../test_util", default-features = false } +test_pd_client = { workspace = true } +test_raftstore = { workspace = true } +test_util = { workspace = true } [[test]] name = "integrations" diff --git a/components/cdc/src/channel.rs b/components/cdc/src/channel.rs index 94fe0f74c61..595632c306e 100644 --- a/components/cdc/src/channel.rs +++ b/components/cdc/src/channel.rs @@ -44,8 +44,9 @@ const CDC_RESP_MAX_BYTES: u32 = 6 * 1024 * 1024; /// Assume the average size of batched `CdcEvent::Event`s is 32KB and /// the average count of batched `CdcEvent::Event`s is 64. 
-/// +/// ```text /// 2 = (CDC_EVENT_MAX_BYTES * CDC_EVENT_MAX_COUNT / CDC_MAX_RESP_SIZE).ceil() + 1 /* reserve for ResolvedTs */; +/// ``` const CDC_RESP_MAX_BATCH_COUNT: usize = 2; pub enum CdcEvent { @@ -265,7 +266,7 @@ pub fn channel(buffer: usize, memory_quota: MemoryQuota) -> (Sink, Drain) { ) } -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq)] pub enum SendError { Full, Disconnected, diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index dc9f36e92ec..adca54dace0 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -10,7 +10,7 @@ use std::{ }; use api_version::{ApiV2, KeyMode, KvFormat}; -use collections::HashMap; +use collections::{HashMap, HashMapEntry}; use crossbeam::atomic::AtomicCell; use kvproto::{ cdcpb::{ @@ -38,23 +38,24 @@ use crate::{ initializer::KvEntry, metrics::*, old_value::{OldValueCache, OldValueCallback}, - service::ConnID, + service::ConnId, + txn_source::TxnSource, Error, Result, }; static DOWNSTREAM_ID_ALLOC: AtomicUsize = AtomicUsize::new(0); /// A unique identifier of a Downstream. -#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] -pub struct DownstreamID(usize); +#[derive(Clone, Copy, Debug, PartialEq, Hash)] +pub struct DownstreamId(usize); -impl DownstreamID { - pub fn new() -> DownstreamID { - DownstreamID(DOWNSTREAM_ID_ALLOC.fetch_add(1, Ordering::SeqCst)) +impl DownstreamId { + pub fn new() -> DownstreamId { + DownstreamId(DOWNSTREAM_ID_ALLOC.fetch_add(1, Ordering::SeqCst)) } } -impl Default for DownstreamID { +impl Default for DownstreamId { fn default() -> Self { Self::new() } @@ -64,10 +65,11 @@ impl Default for DownstreamID { pub enum DownstreamState { /// It's just created and rejects change events and resolved timestamps. Uninitialized, - /// It has got a snapshot for incremental scan, and change events will be accepted. - /// However it still rejects resolved timestamps. 
+ /// It has got a snapshot for incremental scan, and change events will be + /// accepted. However it still rejects resolved timestamps. Initializing, - /// Incremental scan is finished so that resolved timestamps are acceptable now. + /// Incremental scan is finished so that resolved timestamps are acceptable + /// now. Normal, Stopped, } @@ -78,7 +80,8 @@ impl Default for DownstreamState { } } -/// Shold only be called when it's uninitialized or stopped. Return false if it's stopped. +/// Should only be called when it's uninitialized or stopped. Return false if +/// it's stopped. pub(crate) fn on_init_downstream(s: &AtomicCell) -> bool { s.compare_exchange( DownstreamState::Uninitialized, @@ -87,7 +90,8 @@ pub(crate) fn on_init_downstream(s: &AtomicCell) -> bool { .is_ok() } -/// Shold only be called when it's initializing or stopped. Return false if it's stopped. +/// Should only be called when it's initializing or stopped. Return false if +/// it's stopped. pub(crate) fn post_init_downstream(s: &AtomicCell) -> bool { s.compare_exchange(DownstreamState::Initializing, DownstreamState::Normal) .is_ok() @@ -116,16 +120,18 @@ impl DownstreamState { pub struct Downstream { // TODO: include cdc request. /// A unique identifier of the Downstream. - id: DownstreamID, + id: DownstreamId, // The request ID set by CDC to identify events corresponding different requests. req_id: u64, - conn_id: ConnID, + conn_id: ConnId, // The IP address of downstream. 
peer: String, region_epoch: RegionEpoch, sink: Option, state: Arc>, kv_api: ChangeDataRequestKvApi, + filter_loop: bool, + pub(crate) observed_range: ObservedRange, } impl Downstream { @@ -137,11 +143,13 @@ impl Downstream { peer: String, region_epoch: RegionEpoch, req_id: u64, - conn_id: ConnID, + conn_id: ConnId, kv_api: ChangeDataRequestKvApi, + filter_loop: bool, + observed_range: ObservedRange, ) -> Downstream { Downstream { - id: DownstreamID::new(), + id: DownstreamId::new(), req_id, conn_id, peer, @@ -149,6 +157,8 @@ impl Downstream { sink: None, state: Arc::new(AtomicCell::new(DownstreamState::default())), kv_api, + filter_loop, + observed_range, } } @@ -196,15 +206,19 @@ impl Downstream { self.sink = Some(sink); } - pub fn get_id(&self) -> DownstreamID { + pub fn get_id(&self) -> DownstreamId { self.id } + pub fn get_filter_loop(&self) -> bool { + self.filter_loop + } + pub fn get_state(&self) -> Arc> { self.state.clone() } - pub fn get_conn_id(&self) -> ConnID { + pub fn get_conn_id(&self) -> ConnId { self.conn_id } } @@ -244,7 +258,6 @@ pub struct Delegate { pending: Option, txn_extra_op: Arc>, failed: bool, - has_resolver: bool, } impl Delegate { @@ -259,14 +272,9 @@ impl Delegate { pending: Some(Pending::default()), txn_extra_op, failed: false, - has_resolver: false, } } - pub fn has_resolver(&self) -> bool { - self.has_resolver - } - /// Let downstream subscribe the delegate. /// Return error if subscribe fails and the `Delegate` won't be changed. pub fn subscribe(&mut self, downstream: Downstream) -> Result<()> { @@ -274,14 +282,11 @@ impl Delegate { // Check if the downstream is out dated. 
self.check_epoch_on_ready(&downstream)?; } - if downstream.kv_api == ChangeDataRequestKvApi::TiDb { - self.has_resolver = true; - } self.add_downstream(downstream); Ok(()) } - pub fn downstream(&self, downstream_id: DownstreamID) -> Option<&Downstream> { + pub fn downstream(&self, downstream_id: DownstreamId) -> Option<&Downstream> { self.downstreams().iter().find(|d| d.id == downstream_id) } @@ -301,7 +306,7 @@ impl Delegate { /// Let downstream unsubscribe the delegate. /// Return whether the delegate is empty or not. - pub fn unsubscribe(&mut self, id: DownstreamID, err: Option) -> bool { + pub fn unsubscribe(&mut self, id: DownstreamId, err: Option) -> bool { let error_event = err.map(|err| err.into_error_event(self.region_id)); let region_id = self.region_id; if let Some(d) = self.remove_downstream(id) { @@ -355,9 +360,10 @@ impl Delegate { let _ = self.broadcast(send); } - /// `txn_extra_op` returns a shared flag which is accessed in TiKV's transaction layer to - /// determine whether to capture modifications' old value or not. Unsubsribing all downstreams - /// or calling `Delegate::stop` will store it with `TxnExtraOp::Noop`. + /// `txn_extra_op` returns a shared flag which is accessed in TiKV's + /// transaction layer to determine whether to capture modifications' old + /// value or not. Unsubscribing all downstreams or calling + /// `Delegate::stop` will store it with `TxnExtraOp::Noop`. /// /// NOTE: Dropping a `Delegate` won't update this flag. pub fn txn_extra_op(&self) -> &AtomicCell { @@ -380,7 +386,8 @@ impl Delegate { Ok(()) } - /// Install a resolver. Return downstreams which fail because of the region's internal changes. + /// Install a resolver. Return downstreams which fail because of the + /// region's internal changes. pub fn on_region_ready( &mut self, mut resolver: Resolver, @@ -392,6 +399,11 @@ impl Delegate { self.region_id, ); + // Check observed key range in region. 
+ for downstream in self.downstreams_mut() { + downstream.observed_range.update_region_key_range(®ion); + } + // Mark the delegate as initialized. let mut pending = self.pending.take().unwrap(); self.region = Some(region); @@ -427,8 +439,6 @@ impl Delegate { let resolved_ts = resolver.resolve(min_ts); debug!("cdc resolved ts updated"; "region_id" => self.region_id, "resolved_ts" => resolved_ts); - CDC_RESOLVED_TS_GAP_HISTOGRAM - .observe((min_ts.physical() - resolved_ts.physical()) as f64 / 1000f64); Some(resolved_ts) } @@ -446,6 +456,7 @@ impl Delegate { for cmd in batch.into_iter(self.region_id) { let Cmd { index, + term: _, mut request, mut response, } = cmd; @@ -476,88 +487,82 @@ impl Delegate { region_id: u64, request_id: u64, entries: Vec>, + filter_loop: bool, + observed_range: &ObservedRange, ) -> Result> { let entries_len = entries.len(); let mut rows = vec![Vec::with_capacity(entries_len)]; let mut current_rows_size: usize = 0; for entry in entries { + let (mut row, mut _has_value) = (EventRow::default(), false); + let row_size: usize; match entry { Some(KvEntry::RawKvEntry(kv_pair)) => { - let mut row = EventRow::default(); decode_rawkv(kv_pair.0, kv_pair.1, &mut row)?; - let row_size = row.key.len() + row.value.len(); - if current_rows_size + row_size >= CDC_EVENT_MAX_BYTES { - rows.push(Vec::with_capacity(entries_len)); - current_rows_size = 0; + row_size = row.key.len() + row.value.len(); + } + Some(KvEntry::TxnEntry(TxnEntry::Prewrite { + default, + lock, + old_value, + })) => { + if !observed_range.contains_encoded_key(&lock.0) { + continue; } - current_rows_size += row_size; - rows.last_mut().unwrap().push(row); + let l = Lock::parse(&lock.1).unwrap(); + if decode_lock(lock.0, l, &mut row, &mut _has_value) { + continue; + } + decode_default(default.1, &mut row, &mut _has_value); + row.old_value = old_value.finalized().unwrap_or_default(); + row_size = row.key.len() + row.value.len(); } - Some(KvEntry::TxnEntry(txn_entry)) => { - match txn_entry { - 
TxnEntry::Prewrite { - default, - lock, - old_value, - } => { - let mut row = EventRow::default(); - let skip = decode_lock(lock.0, Lock::parse(&lock.1).unwrap(), &mut row); - if skip { - continue; - } - decode_default(default.1, &mut row); - let row_size = row.key.len() + row.value.len(); - if current_rows_size + row_size >= CDC_EVENT_MAX_BYTES { - rows.push(Vec::with_capacity(entries_len)); - current_rows_size = 0; - } - current_rows_size += row_size; - row.old_value = old_value.finalized().unwrap_or_default(); - rows.last_mut().unwrap().push(row); - } - TxnEntry::Commit { - default, - write, - old_value, - } => { - let mut row = EventRow::default(); - let skip = decode_write(write.0, &write.1, &mut row, false); - if skip { - continue; - } - decode_default(default.1, &mut row); - - // This type means the row is self-contained, it has, - // 1. start_ts - // 2. commit_ts - // 3. key - // 4. value - if row.get_type() == EventLogType::Rollback { - // We dont need to send rollbacks to downstream, - // because downstream does not needs rollback to clean - // prewrite as it drops all previous stashed data. - continue; - } - set_event_row_type(&mut row, EventLogType::Committed); - row.old_value = old_value.finalized().unwrap_or_default(); - let row_size = row.key.len() + row.value.len(); - if current_rows_size + row_size >= CDC_EVENT_MAX_BYTES { - rows.push(Vec::with_capacity(entries_len)); - current_rows_size = 0; - } - current_rows_size += row_size; - rows.last_mut().unwrap().push(row); - } + Some(KvEntry::TxnEntry(TxnEntry::Commit { + default, + write, + old_value, + })) => { + if !observed_range.contains_encoded_key(&write.0) { + continue; + } + if decode_write(write.0, &write.1, &mut row, &mut _has_value, false) { + continue; } + decode_default(default.1, &mut row, &mut _has_value); + + // This type means the row is self-contained, it has, + // 1. start_ts + // 2. commit_ts + // 3. key + // 4. 
value + if row.get_type() == EventLogType::Rollback { + // We dont need to send rollbacks to downstream, + // because downstream does not needs rollback to clean + // prewrite as it drops all previous stashed data. + continue; + } + set_event_row_type(&mut row, EventLogType::Committed); + row.old_value = old_value.finalized().unwrap_or_default(); + row_size = row.key.len() + row.value.len(); } None => { - let mut row = EventRow::default(); - // This type means scan has finished. set_event_row_type(&mut row, EventLogType::Initialized); - rows.last_mut().unwrap().push(row); + row_size = 0; } } + let lossy_ddl_filter = TxnSource::is_lossy_ddl_reorg_source_set(row.txn_source); + let cdc_write_filter = + TxnSource::is_cdc_write_source_set(row.txn_source) && filter_loop; + if lossy_ddl_filter || cdc_write_filter { + continue; + } + if current_rows_size + row_size >= CDC_EVENT_MAX_BYTES { + rows.push(Vec::with_capacity(entries_len)); + current_rows_size = 0; + } + current_rows_size += row_size; + rows.last_mut().unwrap().push(row); } let rows = rows @@ -596,7 +601,8 @@ impl Delegate { Ok(()) }; - let mut txn_rows: HashMap, EventRow> = HashMap::default(); + // map[key] -> (event, has_value). + let mut txn_rows: HashMap, (EventRow, bool)> = HashMap::default(); let mut raw_rows: Vec = Vec::new(); for mut req in requests { match req.get_cmd_type() { @@ -620,19 +626,19 @@ impl Delegate { } } - if !txn_rows.is_empty() { - let mut rows = Vec::with_capacity(txn_rows.len()); - for (_, v) in txn_rows { - rows.push(v); + let mut rows = Vec::with_capacity(txn_rows.len()); + for (_, (v, has_value)) in txn_rows { + if v.r_type == EventLogType::Prewrite && v.op_type == EventRowOpType::Put && !has_value + { + // It's possible that a prewrite command only contains lock but without + // default. It's not documented by classic Percolator but introduced with + // Large-Transaction. Those prewrites are not complete, we must skip them. 
+ continue; } - self.sink_downstream(rows, index, ChangeDataRequestKvApi::TiDb)?; - } - - if !raw_rows.is_empty() { - self.sink_downstream(raw_rows, index, ChangeDataRequestKvApi::RawKv)?; + rows.push(v); } - - Ok(()) + self.sink_downstream(rows, index, ChangeDataRequestKvApi::TiDb)?; + self.sink_downstream(raw_rows, index, ChangeDataRequestKvApi::RawKv) } fn sink_downstream( @@ -641,24 +647,77 @@ impl Delegate { index: u64, kv_api: ChangeDataRequestKvApi, ) -> Result<()> { - let event_entries = EventEntries { - entries: entries.into(), - ..Default::default() - }; - let change_data_event = Event { - region_id: self.region_id, - index, - event: Some(Event_oneof_event::Entries(event_entries)), - ..Default::default() - }; + if entries.is_empty() { + return Ok(()); + } + + // Filter the entries which are lossy DDL events. + // We don't need to send them to downstream. + let entries = entries + .iter() + .filter(|x| !TxnSource::is_lossy_ddl_reorg_source_set(x.txn_source)) + .cloned() + .collect::>(); + + let downstreams = self.downstreams(); + assert!( + !downstreams.is_empty(), + "region {} miss downstream", + self.region_id + ); + + // Collect the change event cause by user write, which cdc write source is not + // set. For changefeed which only need the user write, + // send the `filtered_entries`, or else, send them all. + let mut filtered_entries = None; + for downstream in downstreams { + if downstream.filter_loop { + let filtered = entries + .iter() + .filter(|x| !TxnSource::is_cdc_write_source_set(x.txn_source)) + .cloned() + .collect::>(); + if !filtered.is_empty() { + filtered_entries = Some(filtered); + } + break; + } + } + + let region_id = self.region_id; let send = move |downstream: &Downstream| { - // No ready downstream or a downstream that does not match the kv_api type, will be ignored. - // There will be one region that contains both Txn & Raw entries. + // No ready downstream or a downstream that does not match the kv_api type, will + // be ignored. 
There will be one region that contains both Txn & Raw entries. // The judgement here is for sending entries to downstreams with correct kv_api. if !downstream.state.load().ready_for_change_events() || downstream.kv_api != kv_api { return Ok(()); } - let event = change_data_event.clone(); + if downstream.filter_loop && filtered_entries.is_none() { + return Ok(()); + } + + let entries_clone = if downstream.filter_loop { + downstream + .observed_range + .filter_entries(filtered_entries.clone().unwrap()) + } else { + downstream.observed_range.filter_entries(entries.clone()) + }; + + if entries_clone.is_empty() { + return Ok(()); + } + + let event = Event { + region_id, + index, + event: Some(Event_oneof_event::Entries(EventEntries { + entries: entries_clone.into(), + ..Default::default() + })), + ..Default::default() + }; + // Do not force send for real time change data events. let force_send = false; downstream.sink_event(event, force_send) @@ -676,7 +735,7 @@ impl Delegate { &mut self, put: PutRequest, is_one_pc: bool, - txn_rows: &mut HashMap, EventRow>, + txn_rows: &mut HashMap, (EventRow, bool)>, raw_rows: &mut Vec, read_old_value: impl FnMut(&mut EventRow, TimeStamp) -> Result<()>, ) -> Result<()> { @@ -699,13 +758,13 @@ impl Delegate { &mut self, mut put: PutRequest, is_one_pc: bool, - rows: &mut HashMap, EventRow>, + rows: &mut HashMap, (EventRow, bool)>, mut read_old_value: impl FnMut(&mut EventRow, TimeStamp) -> Result<()>, ) -> Result<()> { match put.cf.as_str() { "write" => { - let mut row = EventRow::default(); - if decode_write(put.take_key(), put.get_value(), &mut row, true) { + let (mut row, mut has_value) = (EventRow::default(), false); + if decode_write(put.take_key(), &put.value, &mut row, &mut has_value, true) { return Ok(()); } @@ -734,36 +793,29 @@ impl Delegate { ); } - match rows.get_mut(&row.key) { - Some(row_with_value) => { - row.value = mem::take(&mut row_with_value.value); - *row_with_value = row; + match rows.entry(row.key.clone()) { + 
HashMapEntry::Occupied(o) => { + let o = o.into_mut(); + mem::swap(&mut o.0.value, &mut row.value); + o.0 = row; } - None => { - rows.insert(row.key.clone(), row); + HashMapEntry::Vacant(v) => { + v.insert((row, has_value)); } } } "lock" => { - let mut row = EventRow::default(); + let (mut row, mut has_value) = (EventRow::default(), false); let lock = Lock::parse(put.get_value()).unwrap(); let for_update_ts = lock.for_update_ts; - if decode_lock(put.take_key(), lock, &mut row) { + if decode_lock(put.take_key(), lock, &mut row, &mut has_value) { return Ok(()); } let read_old_ts = std::cmp::max(for_update_ts, row.start_ts.into()); read_old_value(&mut row, read_old_ts)?; - let occupied = rows.entry(row.key.clone()).or_default(); - if !occupied.value.is_empty() { - assert!(row.value.is_empty()); - let mut value = vec![]; - mem::swap(&mut occupied.value, &mut value); - row.value = value; - } - // In order to compute resolved ts, - // we must track inflight txns. + // In order to compute resolved ts, we must track inflight txns. 
match self.resolver { Some(ref mut resolver) => { resolver.track_lock(row.start_ts.into(), row.key.clone(), None) @@ -780,16 +832,20 @@ impl Delegate { } } - *occupied = row; + let occupied = rows.entry(row.key.clone()).or_default(); + if occupied.1 { + assert!(!has_value); + has_value = true; + mem::swap(&mut occupied.0.value, &mut row.value); + } + *occupied = (row, has_value); } "" | "default" => { let key = Key::from_encoded(put.take_key()).truncate_ts().unwrap(); let row = rows.entry(key.into_raw().unwrap()).or_default(); - decode_default(put.take_value(), row); - } - other => { - panic!("invalid cf {}", other); + decode_default(put.take_value(), &mut row.0, &mut row.1); } + other => panic!("invalid cf {}", other), } Ok(()) } @@ -846,7 +902,7 @@ impl Delegate { self.txn_extra_op.store(TxnExtraOp::ReadOldValue); } - fn remove_downstream(&mut self, id: DownstreamID) -> Option { + fn remove_downstream(&mut self, id: DownstreamId) -> Option { let downstreams = self.downstreams_mut(); if let Some(index) = downstreams.iter().position(|x| x.id == id) { let downstream = downstreams.swap_remove(index); @@ -865,9 +921,9 @@ impl Delegate { if let Err(e) = compare_region_epoch( &downstream.region_epoch, region, - false, /* check_conf_ver */ - true, /* check_ver */ - true, /* include_region */ + false, // check_conf_ver + true, // check_ver + true, // include_region ) { info!( "cdc fail to subscribe downstream"; @@ -906,16 +962,23 @@ fn make_overlapped_rollback(key: Key, row: &mut EventRow) { set_event_row_type(row, EventLogType::Rollback); } -/// Decodes the write record and store its information in `row`. This may be called both when -/// doing incremental scan of observing apply events. There's different behavior for the two -/// case, distinguished by the `is_apply` parameter. -fn decode_write(key: Vec, value: &[u8], row: &mut EventRow, is_apply: bool) -> bool { +/// Decodes the write record and store its information in `row`. 
This may be +/// called both when doing incremental scan of observing apply events. There's +/// different behavior for the two case, distinguished by the `is_apply` +/// parameter. +fn decode_write( + key: Vec, + value: &[u8], + row: &mut EventRow, + has_value: &mut bool, + is_apply: bool, +) -> bool { let key = Key::from_encoded(key); let write = WriteRef::parse(value).unwrap().to_owned(); // For scanning, ignore the GC fence and read the old data; - // For observed apply, drop the record it self but keep only the overlapped rollback information - // if gc_fence exists. + // For observed apply, drop the record it self but keep only the overlapped + // rollback information if gc_fence exists. if is_apply && write.gc_fence.is_some() { // `gc_fence` is set means the write record has been rewritten. // Currently the only case is writing overlapped_rollback. And in this case @@ -935,6 +998,7 @@ fn decode_write(key: Vec, value: &[u8], row: &mut EventRow, is_apply: bool) } }; let commit_ts = if write.write_type == WriteType::Rollback { + assert_eq!(write.txn_source, 0); 0 } else { key.decode_ts().unwrap().into_inner() @@ -943,15 +1007,18 @@ fn decode_write(key: Vec, value: &[u8], row: &mut EventRow, is_apply: bool) row.commit_ts = commit_ts; row.key = key.truncate_ts().unwrap().into_raw().unwrap(); row.op_type = op_type as _; + // used for filter out the event. see `txn_source` field for more detail. 
+ row.txn_source = write.txn_source; set_event_row_type(row, r_type); if let Some(value) = write.short_value { row.value = value; + *has_value = true; } false } -fn decode_lock(key: Vec, lock: Lock, row: &mut EventRow) -> bool { +fn decode_lock(key: Vec, lock: Lock, row: &mut EventRow, has_value: &mut bool) -> bool { let op_type = match lock.lock_type { LockType::Put => EventRowOpType::Put, LockType::Delete => EventRowOpType::Delete, @@ -968,9 +1035,12 @@ fn decode_lock(key: Vec, lock: Lock, row: &mut EventRow) -> bool { row.start_ts = lock.ts.into_inner(); row.key = key.into_raw().unwrap(); row.op_type = op_type as _; + // used for filter out the event. see `txn_source` field for more detail. + row.txn_source = lock.txn_source; set_event_row_type(row, EventLogType::Prewrite); if let Some(value) = lock.short_value { row.value = value; + *has_value = true; } false @@ -998,10 +1068,76 @@ fn decode_rawkv(key: Vec, value: Vec, row: &mut EventRow) -> Result<()> Ok(()) } -fn decode_default(value: Vec, row: &mut EventRow) { +fn decode_default(value: Vec, row: &mut EventRow, has_value: &mut bool) { if !value.is_empty() { row.value = value.to_vec(); } + // If default CF is given in a command it means the command always has a value. + *has_value = true; +} + +/// Observed key range. 
+#[derive(Clone, Default)] +pub struct ObservedRange { + start_key_encoded: Vec, + end_key_encoded: Vec, + start_key_raw: Vec, + end_key_raw: Vec, + pub(crate) all_key_covered: bool, +} + +impl ObservedRange { + pub fn new(start_key_encoded: Vec, end_key_encoded: Vec) -> Result { + let start_key_raw = Key::from_encoded(start_key_encoded.clone()) + .into_raw() + .map_err(|e| Error::Other(e.into()))?; + let end_key_raw = Key::from_encoded(end_key_encoded.clone()) + .into_raw() + .map_err(|e| Error::Other(e.into()))?; + Ok(ObservedRange { + start_key_encoded, + end_key_encoded, + start_key_raw, + end_key_raw, + all_key_covered: false, + }) + } + + #[allow(clippy::collapsible_if)] + pub fn update_region_key_range(&mut self, region: &Region) { + // Check observed key range in region. + if self.start_key_encoded <= region.start_key { + if self.end_key_encoded.is_empty() + || (region.end_key <= self.end_key_encoded && !region.end_key.is_empty()) + { + // Observed range covers the region. + self.all_key_covered = true; + } + } + } + + fn is_key_in_range(&self, start_key: &[u8], end_key: &[u8], key: &[u8]) -> bool { + if self.all_key_covered { + return true; + } + if start_key <= key && (key < end_key || end_key.is_empty()) { + return true; + } + false + } + + pub fn contains_encoded_key(&self, key: &[u8]) -> bool { + self.is_key_in_range(&self.start_key_encoded, &self.end_key_encoded, key) + } + + pub fn filter_entries(&self, mut entries: Vec) -> Vec { + if self.all_key_covered { + return entries; + } + // Entry's key is in raw key format. 
+ entries.retain(|e| self.is_key_in_range(&self.start_key_raw, &self.end_key_raw, &e.key)); + entries + } } #[cfg(test)] @@ -1013,6 +1149,7 @@ mod tests { use kvproto::{errorpb::Error as ErrorHeader, metapb::Region}; use super::*; + use crate::channel::{channel, recv_timeout, MemoryQuota}; #[test] fn test_error() { @@ -1032,8 +1169,10 @@ mod tests { String::new(), region_epoch, request_id, - ConnID::new(), + ConnId::new(), ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), ); downstream.set_sink(sink); let mut delegate = Delegate::new(region_id, Default::default()); @@ -1041,6 +1180,7 @@ mod tests { assert!(delegate.handle.is_observing()); let resolver = Resolver::new(region_id); assert!(delegate.on_region_ready(resolver, region).is_empty()); + assert!(delegate.downstreams()[0].observed_range.all_key_covered); let rx_wrap = Cell::new(Some(rx)); let receive_error = || { @@ -1151,7 +1291,15 @@ mod tests { let mut epoch = RegionEpoch::default(); epoch.set_conf_ver(region_version); epoch.set_version(region_version); - Downstream::new(peer, epoch, id, ConnID::new(), ChangeDataRequestKvApi::TiDb) + Downstream::new( + peer, + epoch, + id, + ConnId::new(), + ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), + ) }; // Create a new delegate. @@ -1191,7 +1339,7 @@ mod tests { assert!(delegate.handle.is_observing()); // Subscribe with an invalid epoch. - assert!(delegate.subscribe(new_downstream(1, 2)).is_err()); + delegate.subscribe(new_downstream(1, 2)).unwrap_err(); assert_eq!(delegate.downstreams().len(), 1); // Unsubscribe all downstreams. 
@@ -1201,6 +1349,239 @@ mod tests { assert!(!delegate.handle.is_observing()); } + #[test] + fn test_observed_range() { + for case in vec![ + (b"".as_slice(), b"".as_slice(), false), + (b"a", b"", false), + (b"", b"b", false), + (b"a", b"b", true), + (b"a", b"bb", false), + (b"a", b"aa", true), + (b"aa", b"aaa", true), + ] { + let start_key = if !case.0.is_empty() { + Key::from_raw(case.0).into_encoded() + } else { + case.0.to_owned() + }; + let end_key = if !case.1.is_empty() { + Key::from_raw(case.1).into_encoded() + } else { + case.1.to_owned() + }; + let mut region = Region::default(); + region.start_key = start_key.to_owned(); + region.end_key = end_key.to_owned(); + + for k in 0..=0xff { + let mut observed_range = ObservedRange::default(); + observed_range.update_region_key_range(®ion); + assert!(observed_range.contains_encoded_key(&Key::from_raw(&[k]).into_encoded())); + } + let mut observed_range = ObservedRange::new( + Key::from_raw(b"a").into_encoded(), + Key::from_raw(b"b").into_encoded(), + ) + .unwrap(); + observed_range.update_region_key_range(®ion); + assert_eq!(observed_range.all_key_covered, case.2, "{:?}", case); + assert!( + observed_range.contains_encoded_key(&Key::from_raw(b"a").into_encoded()), + "{:?}", + case + ); + assert!( + observed_range.contains_encoded_key(&Key::from_raw(b"ab").into_encoded()), + "{:?}", + case + ); + if observed_range.all_key_covered { + assert!( + observed_range.contains_encoded_key(&Key::from_raw(b"b").into_encoded()), + "{:?}", + case + ); + } else { + assert!( + !observed_range.contains_encoded_key(&Key::from_raw(b"b").into_encoded()), + "{:?}", + case + ); + } + } + } + + #[test] + fn test_downstream_filter_entires() { + // Create a new delegate that observes [b, d). 
+ let observed_range = ObservedRange::new( + Key::from_raw(b"b").into_encoded(), + Key::from_raw(b"d").into_encoded(), + ) + .unwrap(); + let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); + let mut delegate = Delegate::new(1, txn_extra_op); + assert!(delegate.handle.is_observing()); + + let mut map = HashMap::default(); + for k in b'a'..=b'e' { + let mut put = PutRequest::default(); + put.key = Key::from_raw(&[k]).into_encoded(); + put.cf = "lock".to_owned(); + put.value = Lock::new( + LockType::Put, + put.key.clone(), + 1.into(), + 10, + None, + TimeStamp::zero(), + 0, + TimeStamp::zero(), + ) + .to_bytes(); + delegate + .sink_txn_put( + put, + false, + &mut map, + |_: &mut EventRow, _: TimeStamp| Ok(()), + ) + .unwrap(); + } + assert_eq!(map.len(), 5); + + let (sink, mut drain) = channel(1, MemoryQuota::new(1024)); + let downstream = Downstream { + id: DownstreamId::new(), + req_id: 1, + conn_id: ConnId::new(), + peer: String::new(), + region_epoch: RegionEpoch::default(), + sink: Some(sink), + state: Arc::new(AtomicCell::new(DownstreamState::Normal)), + kv_api: ChangeDataRequestKvApi::TiDb, + filter_loop: false, + observed_range, + }; + delegate.add_downstream(downstream); + let entries = map.values().map(|(r, _)| r).cloned().collect(); + delegate + .sink_downstream(entries, 1, ChangeDataRequestKvApi::TiDb) + .unwrap(); + + let (mut tx, mut rx) = futures::channel::mpsc::unbounded(); + let runtime = tokio::runtime::Runtime::new().unwrap(); + runtime.spawn(async move { + drain.forward(&mut tx).await.unwrap(); + }); + let (e, _) = recv_timeout(&mut rx, std::time::Duration::from_secs(5)) + .unwrap() + .unwrap(); + assert_eq!(e.events[0].get_entries().get_entries().len(), 2, "{:?}", e); + } + + fn test_downstream_txn_source_filter(txn_source: TxnSource, filter_loop: bool) { + // Create a new delegate that observes [a, f). 
+ let observed_range = ObservedRange::new( + Key::from_raw(b"a").into_encoded(), + Key::from_raw(b"f").into_encoded(), + ) + .unwrap(); + let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); + let mut delegate = Delegate::new(1, txn_extra_op); + assert!(delegate.handle.is_observing()); + + let mut map = HashMap::default(); + for k in b'a'..=b'e' { + let mut put = PutRequest::default(); + put.key = Key::from_raw(&[k]).into_encoded(); + put.cf = "lock".to_owned(); + let mut lock = Lock::new( + LockType::Put, + put.key.clone(), + 1.into(), + 10, + None, + TimeStamp::zero(), + 0, + TimeStamp::zero(), + ); + // Only the key `a` is a normal write. + if k != b'a' { + lock = lock.set_txn_source(txn_source.into()); + } + put.value = lock.to_bytes(); + delegate + .sink_txn_put( + put, + false, + &mut map, + |_: &mut EventRow, _: TimeStamp| Ok(()), + ) + .unwrap(); + } + assert_eq!(map.len(), 5); + + let (sink, mut drain) = channel(1, MemoryQuota::new(1024)); + let downstream = Downstream { + id: DownstreamId::new(), + req_id: 1, + conn_id: ConnId::new(), + peer: String::new(), + region_epoch: RegionEpoch::default(), + sink: Some(sink), + state: Arc::new(AtomicCell::new(DownstreamState::Normal)), + kv_api: ChangeDataRequestKvApi::TiDb, + filter_loop, + observed_range, + }; + delegate.add_downstream(downstream); + let entries = map.values().map(|(r, _)| r).cloned().collect(); + delegate + .sink_downstream(entries, 1, ChangeDataRequestKvApi::TiDb) + .unwrap(); + + let (mut tx, mut rx) = futures::channel::mpsc::unbounded(); + let runtime = tokio::runtime::Runtime::new().unwrap(); + runtime.spawn(async move { + drain.forward(&mut tx).await.unwrap(); + }); + let (e, _) = recv_timeout(&mut rx, std::time::Duration::from_secs(5)) + .unwrap() + .unwrap(); + assert_eq!(e.events[0].get_entries().get_entries().len(), 1, "{:?}", e); + } + + #[test] + fn test_downstream_filter_cdc_write_entires() { + let mut txn_source = TxnSource::default(); + 
txn_source.set_cdc_write_source(1); + + test_downstream_txn_source_filter(txn_source, true); + } + + #[test] + fn test_downstream_filter_lossy_ddl_entires() { + let mut txn_source = TxnSource::default(); + txn_source.set_lossy_ddl_reorg_source(1); + test_downstream_txn_source_filter(txn_source, false); + + // With cdr write source and filter loop is false, we should still ignore lossy + // ddl changes. + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + txn_source.set_lossy_ddl_reorg_source(1); + test_downstream_txn_source_filter(txn_source, false); + + // With cdr write source and filter loop is true, we should still ignore some + // events. + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + txn_source.set_lossy_ddl_reorg_source(1); + test_downstream_txn_source_filter(txn_source, true); + } + #[test] fn test_decode_rawkv() { let cases = vec![ diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 0a0a7d9fcd5..fd4580d4aea 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -1,6 +1,7 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ + cell::RefCell, cmp::{Ord, Ordering as CmpOrdering, PartialOrd, Reverse}, collections::BinaryHeap, fmt, @@ -8,6 +9,7 @@ use std::{ time::Duration, }; +use causal_ts::{CausalTsProvider, CausalTsProviderImpl}; use collections::{HashMap, HashMapEntry, HashSet}; use concurrency_manager::ConcurrencyManager; use crossbeam::atomic::AtomicCell; @@ -17,53 +19,54 @@ use futures::compat::Future01CompatExt; use grpcio::Environment; use kvproto::{ cdcpb::{ - ChangeDataRequest, ChangeDataRequestKvApi, ClusterIdMismatch as ErrorClusterIdMismatch, + ChangeDataRequest, ClusterIdMismatch as ErrorClusterIdMismatch, Compatibility as ErrorCompatibility, DuplicateRequest as ErrorDuplicateRequest, Error as EventError, Event, Event_oneof_event, ResolvedTs, }, kvrpcpb::ApiVersion, metapb::Region, - tikvpb::TikvClient, }; use online_config::{ConfigChange, OnlineConfig}; use pd_client::{Feature, PdClient}; use raftstore::{ - coprocessor::{CmdBatch, ObserveID}, - router::RaftStoreRouter, - store::{ - fsm::{ChangeObserver, StoreMeta}, - msg::{Callback, SignificantMsg}, - RegionReadProgressRegistry, - }, + coprocessor::{CmdBatch, ObserveId}, + router::CdcHandle, + store::fsm::{store::StoreRegionMeta, ChangeObserver}, }; -use resolved_ts::Resolver; +use resolved_ts::{resolve_by_raft, LeadershipResolver, Resolver}; use security::SecurityManager; -use tikv::{config::CdcConfig, storage::Statistics}; +use tikv::{ + config::CdcConfig, + storage::{kv::LocalTablets, Statistics}, +}; use tikv_util::{ - debug, error, impl_display_as_debug, info, - time::Limiter, + debug, defer, error, impl_display_as_debug, info, + mpsc::bounded, + slow_log, + sys::thread::ThreadBuildWrapper, + time::{Instant, Limiter, SlowTimer}, timer::SteadyTimer, warn, worker::{Runnable, RunnableWithTimer, ScheduleError, Scheduler}, }; use tokio::{ runtime::{Builder, Runtime}, - sync::{Mutex, Semaphore}, + sync::Semaphore, }; use txn_types::{TimeStamp, TxnExtra, TxnExtraScheduler}; use crate::{ channel::{CdcEvent, 
MemoryQuota, SendError}, - delegate::{on_init_downstream, Delegate, Downstream, DownstreamID, DownstreamState}, + delegate::{on_init_downstream, Delegate, Downstream, DownstreamId, DownstreamState}, initializer::Initializer, metrics::*, old_value::{OldValueCache, OldValueCallback}, - service::{Conn, ConnID, FeatureGate}, + service::{Conn, ConnId, FeatureGate}, CdcObserver, Error, }; const FEATURE_RESOLVED_TS_STORE: Feature = Feature::require(5, 0, 0); -const METRICS_FLUSH_INTERVAL: u64 = 10_000; // 10s +const METRICS_FLUSH_INTERVAL: u64 = 1_000; // 1s // 10 minutes, it's the default gc life time of TiDB // and is long enough for most transactions. const WARN_RESOLVED_TS_LAG_THRESHOLD: Duration = Duration::from_secs(600); @@ -73,16 +76,16 @@ const WARN_RESOLVED_TS_COUNT_THRESHOLD: usize = 10; pub enum Deregister { Downstream { region_id: u64, - downstream_id: DownstreamID, - conn_id: ConnID, + downstream_id: DownstreamId, + conn_id: ConnId, err: Option, }, Delegate { region_id: u64, - observe_id: ObserveID, + observe_id: ObserveId, err: Error, }, - Conn(ConnID), + Conn(ConnId), } impl_display_as_debug!(Deregister); @@ -132,7 +135,7 @@ pub enum Task { Register { request: ChangeDataRequest, downstream: Downstream, - conn_id: ConnID, + conn_id: ConnId, version: semver::Version, }, Deregister(Deregister), @@ -143,21 +146,26 @@ pub enum Task { multi: Vec, old_value_cb: OldValueCallback, }, - MinTS { + MinTs { regions: Vec, min_ts: TimeStamp, + current_ts: TimeStamp, }, ResolverReady { - observe_id: ObserveID, + observe_id: ObserveId, region: Region, resolver: Resolver, }, - RegisterMinTsEvent, + RegisterMinTsEvent { + leader_resolver: LeadershipResolver, + // The time at which the event actually occurred. + event_time: Instant, + }, // The result of ChangeCmd should be returned from CDC Endpoint to ensure // the downstream switches to Normal after the previous commands was sunk. 
InitDownstream { region_id: u64, - downstream_id: DownstreamID, + downstream_id: DownstreamId, downstream_state: Arc>, // `incremental_scan_barrier` will be sent into `sink` to ensure all delta changes // are delivered to the downstream. And then incremental scan can start. @@ -202,9 +210,15 @@ impl fmt::Debug for Task { .field("type", &"multi_batch") .field("multi_batch", &multi.len()) .finish(), - Task::MinTS { ref min_ts, .. } => { - de.field("type", &"mit_ts").field("min_ts", min_ts).finish() - } + Task::MinTs { + ref min_ts, + ref current_ts, + .. + } => de + .field("type", &"mit_ts") + .field("current_ts", current_ts) + .field("min_ts", min_ts) + .finish(), Task::ResolverReady { ref observe_id, ref region, @@ -214,7 +228,9 @@ impl fmt::Debug for Task { .field("observe_id", &observe_id) .field("region_id", ®ion.get_id()) .finish(), - Task::RegisterMinTsEvent => de.field("type", &"register_min_ts").finish(), + Task::RegisterMinTsEvent { ref event_time, .. } => { + de.field("event_time", &event_time).finish() + } Task::InitDownstream { ref region_id, ref downstream_id, @@ -237,7 +253,7 @@ impl fmt::Debug for Task { } } -#[derive(PartialEq, Eq)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] struct ResolvedRegion { region_id: u64, resolved_ts: TimeStamp, @@ -285,16 +301,8 @@ impl ResolvedRegionHeap { (min_resolved_ts, outliers) } - fn to_hash_set(&self) -> (TimeStamp, HashSet) { - let mut min_resolved_ts = TimeStamp::max(); - let mut regions = HashSet::with_capacity_and_hasher(self.heap.len(), Default::default()); - for resolved_region in &self.heap { - regions.insert(resolved_region.0.region_id); - if min_resolved_ts > resolved_region.0.resolved_ts { - min_resolved_ts = resolved_region.0.resolved_ts; - } - } - (min_resolved_ts, regions) + fn is_empty(&self) -> bool { + self.heap.is_empty() } fn clear(&mut self) { @@ -307,24 +315,25 @@ impl ResolvedRegionHeap { } } -pub struct Endpoint { +pub struct Endpoint { cluster_id: u64, capture_regions: HashMap, - 
connections: HashMap, + connections: HashMap, scheduler: Scheduler, - raft_router: T, - engine: E, + cdc_handle: T, + tablets: LocalTablets, observer: CdcObserver, pd_client: Arc, timer: SteadyTimer, tso_worker: Runtime, - store_meta: Arc>, - /// The concurrency manager for transactions. It's needed for CDC to check locks when - /// calculating resolved_ts. + store_meta: Arc>, + /// The concurrency manager for transactions. It's needed for CDC to check + /// locks when calculating resolved_ts. concurrency_manager: ConcurrencyManager, + raftstore_v2: bool, config: CdcConfig, api_version: ApiVersion, @@ -337,16 +346,12 @@ pub struct Endpoint { sink_memory_quota: MemoryQuota, old_value_cache: OldValueCache, - resolved_region_heap: ResolvedRegionHeap, + resolved_region_heap: RefCell, - // Check leader - // store_id -> client - tikv_clients: Arc>>, - env: Arc, - security_mgr: Arc, - region_read_progress: RegionReadProgressRegistry, + causal_ts_provider: Option>, // Metrics and logging. + current_ts: TimeStamp, min_resolved_ts: TimeStamp, min_ts_region_id: u64, resolved_region_count: usize, @@ -354,35 +359,42 @@ pub struct Endpoint { warn_resolved_ts_repeat_count: usize, } -impl, E: KvEngine> Endpoint { +impl, E: KvEngine, S: StoreRegionMeta> Endpoint { pub fn new( cluster_id: u64, config: &CdcConfig, + raftstore_v2: bool, api_version: ApiVersion, pd_client: Arc, scheduler: Scheduler, - raft_router: T, - engine: E, + cdc_handle: T, + tablets: LocalTablets, observer: CdcObserver, - store_meta: Arc>, + store_meta: Arc>, concurrency_manager: ConcurrencyManager, env: Arc, security_mgr: Arc, sink_memory_quota: MemoryQuota, - ) -> Endpoint { + causal_ts_provider: Option>, + ) -> Endpoint { let workers = Builder::new_multi_thread() .thread_name("cdcwkr") .worker_threads(config.incremental_scan_threads) + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) .build() .unwrap(); let tso_worker = Builder::new_multi_thread() .thread_name("tso") - .worker_threads(1) + 
.worker_threads(config.tso_worker_threads) .enable_time() + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) .build() .unwrap(); - // Initialized for the first time, subsequent adjustments will be made based on configuration updates. + // Initialized for the first time, subsequent adjustments will be made based on + // configuration updates. let scan_concurrency_semaphore = Arc::new(Semaphore::new(config.incremental_scan_concurrency)); let old_value_cache = OldValueCache::new(config.old_value_cache_memory_quota); @@ -398,11 +410,18 @@ impl, E: KvEngine> Endpoint { // Assume 1KB per entry. let max_scan_batch_size = 1024; - let region_read_progress = store_meta.lock().unwrap().region_read_progress.clone(); - let ep = Endpoint { - cluster_id, + let region_read_progress = store_meta.lock().unwrap().region_read_progress().clone(); + let store_resolver_gc_interval = Duration::from_secs(60); + let leader_resolver = LeadershipResolver::new( + store_meta.lock().unwrap().store_id(), + pd_client.clone(), env, security_mgr, + region_read_progress, + store_resolver_gc_interval, + ); + let ep = Endpoint { + cluster_id, capture_regions: HashMap::default(), connections: HashMap::default(), scheduler, @@ -413,48 +432,53 @@ impl, E: KvEngine> Endpoint { max_scan_batch_bytes, max_scan_batch_size, config: config.clone(), + raftstore_v2, api_version, workers, scan_concurrency_semaphore, - raft_router, - engine, + cdc_handle, + tablets, observer, store_meta, concurrency_manager, min_resolved_ts: TimeStamp::max(), min_ts_region_id: 0, - resolved_region_heap: ResolvedRegionHeap { + resolved_region_heap: RefCell::new(ResolvedRegionHeap { heap: BinaryHeap::new(), - }, + }), old_value_cache, resolved_region_count: 0, unresolved_region_count: 0, sink_memory_quota, - tikv_clients: Arc::new(Mutex::new(HashMap::default())), - region_read_progress, // Log the first resolved ts warning. 
warn_resolved_ts_repeat_count: WARN_RESOLVED_TS_COUNT_THRESHOLD, + current_ts: TimeStamp::zero(), + causal_ts_provider, }; - ep.register_min_ts_event(); + ep.register_min_ts_event(leader_resolver, Instant::now()); ep } fn on_change_cfg(&mut self, change: ConfigChange) { // Validate first. let mut validate_cfg = self.config.clone(); - validate_cfg.update(change.clone()); - if let Err(e) = validate_cfg.validate() { + if let Err(e) = validate_cfg.update(change) { warn!("cdc config update failed"; "error" => ?e); return; } - + if let Err(e) = validate_cfg.validate(self.raftstore_v2) { + warn!("cdc config update failed"; "error" => ?e); + return; + } + let change = self.config.diff(&validate_cfg); info!( "cdc config updated"; "current config" => ?self.config, "change" => ?change ); - // Update the config here. The following adjustments will all use the new values. - self.config.update(change.clone()); + // Update the config here. The following adjustments will all use the new + // values. + self.config.update(change.clone()).unwrap(); // Maybe the cache will be lost due to smaller capacity, // but it is acceptable. @@ -463,8 +487,8 @@ impl, E: KvEngine> Endpoint { .resize(self.config.old_value_cache_memory_quota); } - // Maybe the limit will be exceeded for a while after the concurrency becomes smaller, - // but it is acceptable. + // Maybe the limit will be exceeded for a while after the concurrency becomes + // smaller, but it is acceptable. if change.get("incremental_scan_concurrency").is_some() { self.scan_concurrency_semaphore = Arc::new(Semaphore::new(self.config.incremental_scan_concurrency)) @@ -520,7 +544,7 @@ impl, E: KvEngine> Endpoint { let oid = self.observer.unsubscribe_region(region_id, id); assert!( oid.is_some(), - "unsubscribe region {} failed, ObserveID {:?}", + "unsubscribe region {} failed, ObserveId {:?}", region_id, id ); @@ -533,7 +557,7 @@ impl, E: KvEngine> Endpoint { } => { // Something went wrong, deregister all downstreams of the region. 
- // To avoid ABA problem, we must check the unique ObserveID. + // To avoid ABA problem, we must check the unique ObserveId. let need_remove = self .capture_regions .get(®ion_id) @@ -551,7 +575,7 @@ impl, E: KvEngine> Endpoint { assert_eq!( need_remove, oid.is_some(), - "unsubscribe region {} failed, ObserveID {:?}", + "unsubscribe region {} failed, ObserveId {:?}", region_id, observe_id ); @@ -570,7 +594,7 @@ impl, E: KvEngine> Endpoint { let oid = self.observer.unsubscribe_region(region_id, id); assert!( oid.is_some(), - "unsubscribe region {} failed, ObserveID {:?}", + "unsubscribe region {} failed, ObserveId {:?}", region_id, id ); @@ -587,7 +611,7 @@ impl, E: KvEngine> Endpoint { &mut self, mut request: ChangeDataRequest, mut downstream: Downstream, - conn_id: ConnID, + conn_id: ConnId, version: semver::Version, ) { let region_id = request.region_id; @@ -595,6 +619,7 @@ impl, E: KvEngine> Endpoint { let api_version = self.api_version; let downstream_id = downstream.get_id(); let downstream_state = downstream.get_state(); + let filter_loop = downstream.get_filter_loop(); // Register must follow OpenConn, so the connection must be available. let conn = self.connections.get_mut(&conn_id).unwrap(); @@ -624,7 +649,7 @@ impl, E: KvEngine> Endpoint { return; } - let txn_extra_op = match self.store_meta.lock().unwrap().readers.get(®ion_id) { + let txn_extra_op = match self.store_meta.lock().unwrap().reader(region_id) { Some(reader) => reader.txn_extra_op.clone(), None => { error!("cdc register for a not found region"; "region_id" => region_id); @@ -677,10 +702,6 @@ impl, E: KvEngine> Endpoint { let checkpoint_ts = request.checkpoint_ts; let sched = self.scheduler.clone(); - // Now resolver is only used by tidb downstream. - // Resolver is created when the first tidb cdc request arrive. 
- let is_build_resolver = kv_api == ChangeDataRequestKvApi::TiDb && !delegate.has_resolver(); - let downstream_ = downstream.clone(); if let Err(err) = delegate.subscribe(downstream) { let error_event = err.into_error_event(region_id); @@ -697,7 +718,7 @@ impl, E: KvEngine> Endpoint { let old_observe_id = self.observer.subscribe_region(region_id, observe_id); assert!( old_observe_id.is_none(), - "region {} must not be observed twice, old ObserveID {:?}, new ObserveID {:?}", + "region {} must not be observed twice, old ObserveId {:?}, new ObserveId {:?}", region_id, old_observe_id, observe_id @@ -705,11 +726,12 @@ impl, E: KvEngine> Endpoint { }; let change_cmd = ChangeObserver::from_cdc(region_id, delegate.handle.clone()); - + let observed_range = downstream_.observed_range; let region_epoch = request.take_region_epoch(); let mut init = Initializer { - engine: self.engine.clone(), + tablet: self.tablets.get(region_id).map(|t| t.into_owned()), sched, + observed_range, region_id, region_epoch, conn_id, @@ -722,17 +744,18 @@ impl, E: KvEngine> Endpoint { max_scan_batch_size: self.max_scan_batch_size, observe_id, checkpoint_ts: checkpoint_ts.into(), - build_resolver: is_build_resolver, + build_resolver: is_new_delegate, ts_filter_ratio: self.config.incremental_scan_ts_filter_ratio, kv_api, + filter_loop, }; - let raft_router = self.raft_router.clone(); + let cdc_handle = self.cdc_handle.clone(); let concurrency_semaphore = self.scan_concurrency_semaphore.clone(); self.workers.spawn(async move { CDC_SCAN_TASKS.with_label_values(&["total"]).inc(); match init - .initialize(change_cmd, raft_router, concurrency_semaphore) + .initialize(change_cmd, cdc_handle, concurrency_semaphore) .await { Ok(()) => { @@ -780,7 +803,7 @@ impl, E: KvEngine> Endpoint { flush_oldvalue_stats(&statistics, TAG_DELTA_CHANGE); } - fn on_region_ready(&mut self, observe_id: ObserveID, resolver: Resolver, region: Region) { + fn on_region_ready(&mut self, observe_id: ObserveId, resolver: Resolver, 
region: Region) { let region_id = region.get_id(); let mut failed_downstreams = Vec::new(); if let Some(delegate) = self.capture_regions.get_mut(®ion_id) { @@ -811,9 +834,9 @@ impl, E: KvEngine> Endpoint { } } - fn on_min_ts(&mut self, regions: Vec, min_ts: TimeStamp) { + fn on_min_ts(&mut self, regions: Vec, min_ts: TimeStamp, current_ts: TimeStamp) { // Reset resolved_regions to empty. - let resolved_regions = &mut self.resolved_region_heap; + let mut resolved_regions = self.resolved_region_heap.borrow_mut(); resolved_regions.clear(); let total_region_count = regions.len(); @@ -837,7 +860,6 @@ impl, E: KvEngine> Endpoint { self.min_ts_region_id = region_id; } resolved_regions.push(region_id, resolved_ts); - if resolved_ts == old_resolved_ts { advance_failed_same += 1; } else { @@ -848,6 +870,7 @@ impl, E: KvEngine> Endpoint { } } } + self.current_ts = current_ts; let lag_millis = min_ts .physical() .saturating_sub(self.min_resolved_ts.physical()); @@ -859,6 +882,7 @@ impl, E: KvEngine> Endpoint { "min_resolved_ts" => self.min_resolved_ts, "min_ts_region_id" => self.min_ts_region_id, "min_ts" => min_ts, + "lag" => ?Duration::from_millis(lag_millis), "ok" => advance_ok, "none" => advance_failed_none, "stale" => advance_failed_stale, @@ -872,13 +896,14 @@ impl, E: KvEngine> Endpoint { // so 1) downstreams know where they should send resolve lock requests, // and 2) resolved ts of normal regions does not fallback. // - // Max number of outliers, in most cases, only a few regions are outliers. - // TODO: figure out how to avoid create hashset every time, saving some CPU. 
- let max_outlier_count = 32; - let (outlier_min_resolved_ts, outlier_regions) = resolved_regions.pop(max_outlier_count); - let (normal_min_resolved_ts, normal_regions) = resolved_regions.to_hash_set(); - self.broadcast_resolved_ts(outlier_min_resolved_ts, outlier_regions); - self.broadcast_resolved_ts(normal_min_resolved_ts, normal_regions); + // Regions are separated exponentially to reduce resolved ts events and + // save CPU for both TiKV and TiCDC. + let mut batch_count = 8; + while !resolved_regions.is_empty() { + let (outlier_min_resolved_ts, outlier_regions) = resolved_regions.pop(batch_count); + self.broadcast_resolved_ts(outlier_min_resolved_ts, outlier_regions); + batch_count *= 4; + } } fn broadcast_resolved_ts(&self, min_resolved_ts: TimeStamp, regions: HashSet) { @@ -978,35 +1003,45 @@ impl, E: KvEngine> Endpoint { let _ = downstream.sink_event(resolved_ts_event, force_send); } - fn register_min_ts_event(&self) { - let timeout = self.timer.delay(self.config.min_ts_interval.0); + fn register_min_ts_event(&self, mut leader_resolver: LeadershipResolver, event_time: Instant) { + // Try to keep advance resolved ts every `min_ts_interval`, thus + // the actual wait interval = `min_ts_interval` - the last register min_ts event + // time. 
+ let interval = self + .config + .min_ts_interval + .0 + .checked_sub(event_time.saturating_elapsed()); + let timeout = self.timer.delay(interval.unwrap_or_default()); let pd_client = self.pd_client.clone(); let scheduler = self.scheduler.clone(); - let raft_router = self.raft_router.clone(); - let regions: Vec<(u64, ObserveID)> = self - .capture_regions - .iter() - .map(|(region_id, delegate)| (*region_id, delegate.handle.id)) - .collect(); + let cdc_handle = self.cdc_handle.clone(); + let regions: Vec = self.capture_regions.keys().copied().collect(); let cm: ConcurrencyManager = self.concurrency_manager.clone(); - let env = self.env.clone(); - let security_mgr = self.security_mgr.clone(); - let store_meta = self.store_meta.clone(); - let tikv_clients = self.tikv_clients.clone(); let hibernate_regions_compatible = self.config.hibernate_regions_compatible; - let region_read_progress = self.region_read_progress.clone(); + let causal_ts_provider = self.causal_ts_provider.clone(); + // We use channel to deliver leader_resolver in async block. + let (leader_resolver_tx, leader_resolver_rx) = bounded(1); let fut = async move { let _ = timeout.compat().await; // Ignore get tso errors since we will retry every `min_ts_interval`. - let min_ts_pd = pd_client.get_tso().await.unwrap_or_default(); + let min_ts_pd = match causal_ts_provider { + // TiKV API v2 is enabled when causal_ts_provider is Some. + // In this scenario, get TSO from causal_ts_provider to make sure that + // RawKV write requests will get larger TSO after this point. + // RawKV CDC's resolved_ts is guaranteed by ConcurrencyManager::global_min_lock_ts, + // which lock flying keys's ts in raw put and delete interfaces in `Storage`. 
+ Some(provider) => provider.async_get_ts().await.unwrap_or_default(), + None => pd_client.get_tso().await.unwrap_or_default(), + }; let mut min_ts = min_ts_pd; let mut min_ts_min_lock = min_ts_pd; - // Sync with concurrency manager so that it can work correctly when optimizations - // like async commit is enabled. - // Note: This step must be done before scheduling `Task::MinTS` task, and the - // resolver must be checked in or after `Task::MinTS`' execution. + // Sync with concurrency manager so that it can work correctly when + // optimizations like async commit is enabled. + // Note: This step must be done before scheduling `Task::MinTs` task, and the + // resolver must be checked in or after `Task::MinTs`' execution. cm.update_max_ts(min_ts); if let Some(min_mem_lock_ts) = cm.global_min_lock_ts() { if min_mem_lock_ts < min_ts { @@ -1015,42 +1050,47 @@ impl, E: KvEngine> Endpoint { min_ts_min_lock = min_mem_lock_ts; } - match scheduler.schedule(Task::RegisterMinTsEvent) { - Ok(_) | Err(ScheduleError::Stopped(_)) => (), - // Must schedule `RegisterMinTsEvent` event otherwise resolved ts can not - // advance normally. - Err(err) => panic!("failed to regiester min ts event, error: {:?}", err), - } + let slow_timer = SlowTimer::default(); + defer!({ + slow_log!(T slow_timer, "cdc resolve region leadership"); + if let Ok(leader_resolver) = leader_resolver_rx.try_recv() { + match scheduler.schedule(Task::RegisterMinTsEvent { + leader_resolver, + event_time: Instant::now(), + }) { + Ok(_) | Err(ScheduleError::Stopped(_)) => (), + // Must schedule `RegisterMinTsEvent` event otherwise resolved ts can not + // advance normally. + Err(err) => panic!("failed to regiester min ts event, error: {:?}", err), + } + } else { + // During shutdown, tso runtime drops future immediately, + // leader_resolver may be lost when this future drops before + // delivering leader_resolver. 
+ warn!("cdc leader resolver is lost, are we shutdown?"); + } + }); + // Check region peer leadership, make sure they are leaders. let gate = pd_client.feature_gate(); - let regions = if hibernate_regions_compatible && gate.can_enable(FEATURE_RESOLVED_TS_STORE) { CDC_RESOLVED_TS_ADVANCE_METHOD.set(1); - let regions = regions - .into_iter() - .map(|(region_id, _)| region_id) - .collect(); - resolved_ts::region_resolved_ts_store( - regions, - store_meta, - region_read_progress, - pd_client, - security_mgr, - env, - tikv_clients, - min_ts, - ) - .await + leader_resolver.resolve(regions, min_ts).await } else { CDC_RESOLVED_TS_ADVANCE_METHOD.set(0); - Self::region_resolved_ts_raft(regions, &scheduler, raft_router, min_ts).await + resolve_by_raft(regions, min_ts, cdc_handle).await }; + leader_resolver_tx.send(leader_resolver).unwrap(); if !regions.is_empty() { - match scheduler.schedule(Task::MinTS { regions, min_ts }) { + match scheduler.schedule(Task::MinTs { + regions, + min_ts, + current_ts: min_ts_pd, + }) { Ok(_) | Err(ScheduleError::Stopped(_)) => (), - // Must schedule `RegisterMinTsEvent` event otherwise resolved ts can not + // Must schedule `MinTS` event otherwise resolved ts can not // advance normally. Err(err) => panic!("failed to schedule min ts event, error: {:?}", err), } @@ -1066,67 +1106,25 @@ impl, E: KvEngine> Endpoint { self.tso_worker.spawn(fut); } - async fn region_resolved_ts_raft( - regions: Vec<(u64, ObserveID)>, - scheduler: &Scheduler, - raft_router: T, - min_ts: TimeStamp, - ) -> Vec { - // TODO: send a message to raftstore would consume too much cpu time, - // try to handle it outside raftstore. 
- let regions: Vec<_> = regions - .iter() - .copied() - .map(|(region_id, observe_id)| { - let scheduler_clone = scheduler.clone(); - let raft_router_clone = raft_router.clone(); - async move { - let (tx, rx) = tokio::sync::oneshot::channel(); - if let Err(e) = raft_router_clone.significant_send( - region_id, - SignificantMsg::LeaderCallback(Callback::Read(Box::new(move |resp| { - let resp = if resp.response.get_header().has_error() { - None - } else { - Some(region_id) - }; - if tx.send(resp).is_err() { - error!("cdc send tso response failed"; "region_id" => region_id); - } - }))), - ) { - warn!("cdc send LeaderCallback failed"; "err" => ?e, "min_ts" => min_ts); - let deregister = Deregister::Delegate { - observe_id, - region_id, - err: Error::request(e.into()), - }; - if let Err(e) = scheduler_clone.schedule(Task::Deregister(deregister)) { - error!("cdc schedule cdc task failed"; "error" => ?e); - } - return None; - } - rx.await.unwrap_or(None) - } - }) - .collect(); - let resps = futures::future::join_all(regions).await; - resps.into_iter().flatten().collect::>() - } - fn on_open_conn(&mut self, conn: Conn) { self.connections.insert(conn.get_id(), conn); } } -impl, E: KvEngine> Runnable for Endpoint { +impl, E: KvEngine, S: StoreRegionMeta + Send> Runnable + for Endpoint +{ type Task = Task; fn run(&mut self, task: Task) { debug!("cdc run task"; "task" => %task); match task { - Task::MinTS { regions, min_ts } => self.on_min_ts(regions, min_ts), + Task::MinTs { + regions, + min_ts, + current_ts, + } => self.on_min_ts(regions, min_ts, current_ts), Task::Register { request, downstream, @@ -1144,7 +1142,10 @@ impl, E: KvEngine> Runnable for Endpoint { old_value_cb, } => self.on_multi_batch(multi, old_value_cb), Task::OpenConn { conn } => self.on_open_conn(conn), - Task::RegisterMinTsEvent => self.register_min_ts_event(), + Task::RegisterMinTsEvent { + leader_resolver, + event_time, + } => self.register_min_ts_event(leader_resolver, event_time), Task::InitDownstream 
{ region_id, downstream_id, @@ -1188,12 +1189,15 @@ impl, E: KvEngine> Runnable for Endpoint { } } -impl, E: KvEngine> RunnableWithTimer for Endpoint { +impl, E: KvEngine, S: StoreRegionMeta + Send> RunnableWithTimer + for Endpoint +{ fn on_timeout(&mut self) { CDC_ENDPOINT_PENDING_TASKS.set(self.scheduler.pending_tasks() as _); // Reclaim resolved_region_heap memory. self.resolved_region_heap + .borrow_mut() .reset_and_shrink_to(self.capture_regions.len()); CDC_CAPTURED_REGION_COUNT.set(self.capture_regions.len() as i64); @@ -1206,8 +1210,20 @@ impl, E: KvEngine> RunnableWithTimer for Endpoin if self.min_resolved_ts != TimeStamp::max() { CDC_MIN_RESOLVED_TS_REGION.set(self.min_ts_region_id as i64); CDC_MIN_RESOLVED_TS.set(self.min_resolved_ts.physical() as i64); + CDC_MIN_RESOLVED_TS_LAG.set( + self.current_ts + .physical() + .saturating_sub(self.min_resolved_ts.physical()) as i64, + ); + CDC_RESOLVED_TS_GAP_HISTOGRAM.observe( + self.current_ts + .physical() + .saturating_sub(self.min_resolved_ts.physical()) as f64 + / 1000f64, + ); } self.min_resolved_ts = TimeStamp::max(); + self.current_ts = TimeStamp::max(); self.min_ts_region_id = 0; self.old_value_cache.flush_metrics(); @@ -1243,15 +1259,18 @@ mod tests { use std::ops::{Deref, DerefMut}; use engine_rocks::RocksEngine; + use futures::executor::block_on; use kvproto::{ cdcpb::{ChangeDataRequestKvApi, Header}, errorpb::Error as ErrorHeader, }; use raftstore::{ errors::{DiscardReason, Error as RaftStoreError}, - store::{msg::CasualMessage, PeerMsg, ReadDelegate}, + router::{CdcRaftRouter, RaftStoreRouter}, + store::{fsm::StoreMeta, msg::CasualMessage, PeerMsg, ReadDelegate}, }; - use test_raftstore::{MockRaftStoreRouter, TestPdClient}; + use test_pd_client::TestPdClient; + use test_raftstore::MockRaftStoreRouter; use tikv::{ server::DEFAULT_CLUSTER_ID, storage::{kv::Engine, TestEngineBuilder}, @@ -1262,21 +1281,26 @@ mod tests { }; use super::*; - use crate::{channel, recv_timeout}; + use crate::{ + channel, + 
delegate::{post_init_downstream, ObservedRange}, + recv_timeout, + }; struct TestEndpointSuite { // The order must ensure `endpoint` be dropped before other fields. - endpoint: Endpoint, - raft_router: MockRaftStoreRouter, + endpoint: Endpoint, RocksEngine, StoreMeta>, + cdc_handle: CdcRaftRouter, task_rx: ReceiverWrapper, raft_rxs: HashMap>>, + leader_resolver: Option, } impl TestEndpointSuite { // It's important to matain raft receivers in `raft_rxs`, otherwise all cases // need to drop `endpoint` and `rx` in order manually. fn add_region(&mut self, region_id: u64, cap: usize) { - let rx = self.raft_router.add_region(region_id, cap); + let rx = self.cdc_handle.add_region(region_id, cap); self.raft_rxs.insert(region_id, rx); self.add_local_reader(region_id); } @@ -1290,7 +1314,7 @@ mod tests { } fn fill_raft_rx(&self, region_id: u64) { - let router = &self.raft_router; + let router = &self.cdc_handle; loop { match router.send_casual_msg(region_id, CasualMessage::ClearRegionSize) { Ok(_) => continue, @@ -1306,7 +1330,7 @@ mod tests { } impl Deref for TestEndpointSuite { - type Target = Endpoint; + type Target = Endpoint, RocksEngine, StoreMeta>; fn deref(&self) -> &Self::Target { &self.endpoint } @@ -1322,41 +1346,72 @@ mod tests { cfg: &CdcConfig, engine: Option, api_version: ApiVersion, + ) -> TestEndpointSuite { + mock_endpoint_with_ts_provider(cfg, engine, api_version, None) + } + + fn mock_endpoint_with_ts_provider( + cfg: &CdcConfig, + engine: Option, + api_version: ApiVersion, + causal_ts_provider: Option>, ) -> TestEndpointSuite { let (task_sched, task_rx) = dummy_scheduler(); - let raft_router = MockRaftStoreRouter::new(); + let cdc_handle = CdcRaftRouter(MockRaftStoreRouter::new()); + let mut store_meta = StoreMeta::new(0); + store_meta.store_id = Some(1); + let region_read_progress = store_meta.region_read_progress.clone(); + let pd_client = Arc::new(TestPdClient::new(0, true)); + let env = Arc::new(Environment::new(1)); + let security_mgr = 
Arc::new(SecurityManager::default()); + let store_resolver_gc_interval = Duration::from_secs(60); + let leader_resolver = LeadershipResolver::new( + 1, + pd_client.clone(), + env.clone(), + security_mgr.clone(), + region_read_progress, + store_resolver_gc_interval, + ); let ep = Endpoint::new( DEFAULT_CLUSTER_ID, cfg, + false, api_version, - Arc::new(TestPdClient::new(0, true)), + pd_client, task_sched.clone(), - raft_router.clone(), - engine.unwrap_or_else(|| { + cdc_handle.clone(), + LocalTablets::Singleton(engine.unwrap_or_else(|| { TestEngineBuilder::new() .build_without_cache() .unwrap() .kv_engine() - }), + .unwrap() + })), CdcObserver::new(task_sched), - Arc::new(StdMutex::new(StoreMeta::new(0))), + Arc::new(StdMutex::new(store_meta)), ConcurrencyManager::new(1.into()), - Arc::new(Environment::new(1)), - Arc::new(SecurityManager::default()), + env, + security_mgr, MemoryQuota::new(usize::MAX), + causal_ts_provider, ); TestEndpointSuite { endpoint: ep, - raft_router, + cdc_handle, task_rx, raft_rxs: HashMap::default(), + leader_resolver: Some(leader_resolver), } } #[test] fn test_api_version_check() { - let cfg = CdcConfig::default(); + let mut cfg = CdcConfig::default(); + // To make the case more stable. 
+ cfg.min_ts_interval = ReadableDuration(Duration::from_secs(1)); + let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); let quota = crate::channel::MemoryQuota::new(usize::MAX); @@ -1381,6 +1436,8 @@ mod tests { 1, conn_id, ChangeDataRequestKvApi::RawKv, + false, + ObservedRange::default(), ); req.set_kv_api(ChangeDataRequestKvApi::RawKv); suite.run(Task::Register { @@ -1416,6 +1473,8 @@ mod tests { 2, conn_id, ChangeDataRequestKvApi::TxnKv, + false, + ObservedRange::default(), ); req.set_kv_api(ChangeDataRequestKvApi::TxnKv); suite.run(Task::Register { @@ -1452,6 +1511,8 @@ mod tests { 3, conn_id, ChangeDataRequestKvApi::TxnKv, + false, + ObservedRange::default(), ); req.set_kv_api(ChangeDataRequestKvApi::TxnKv); suite.run(Task::Register { @@ -1542,13 +1603,14 @@ mod tests { let mut updated_cfg = cfg.clone(); { // Update it to be smaller than incremental_scan_threads, - // which will be an invalid change and will be lost. + // which will be an invalid change and will modified to + // incremental_scan_threads. updated_cfg.incremental_scan_concurrency = 2; } let diff = cfg.diff(&updated_cfg); ep.run(Task::ChangeConfig(diff)); - assert_eq!(ep.config.incremental_scan_concurrency, 6); - assert_eq!(ep.scan_concurrency_semaphore.available_permits(), 6); + assert_eq!(ep.config.incremental_scan_concurrency, 4); + assert_eq!(ep.scan_concurrency_semaphore.available_permits(), 4); { // Correct update. @@ -1629,6 +1691,8 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), ); suite.run(Task::Register { request: req, @@ -1675,6 +1739,8 @@ mod tests { 1, conn_id, ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), ); // Enable batch resolved ts in the test. 
let version = FeatureGate::batch_resolved_ts(); @@ -1697,6 +1763,8 @@ mod tests { 2, conn_id, ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), ); suite.run(Task::Register { request: req.clone(), @@ -1733,6 +1801,8 @@ mod tests { 3, conn_id, ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), ); suite.run(Task::Register { request: req, @@ -1777,6 +1847,8 @@ mod tests { 1, conn_id, ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), ); suite.add_local_reader(100); suite.run(Task::Register { @@ -1808,6 +1880,8 @@ mod tests { 1, conn_id, ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), ); suite.run(Task::Register { request: req, @@ -1829,6 +1903,32 @@ mod tests { } } + #[test] + fn test_raw_causal_min_ts() { + let sleep_interval = Duration::from_secs(1); + let cfg = CdcConfig { + min_ts_interval: ReadableDuration(sleep_interval), + ..Default::default() + }; + let ts_provider: Arc = + Arc::new(causal_ts::tests::TestProvider::default().into()); + let start_ts = block_on(ts_provider.async_get_ts()).unwrap(); + let mut suite = + mock_endpoint_with_ts_provider(&cfg, None, ApiVersion::V2, Some(ts_provider.clone())); + let leader_resolver = suite.leader_resolver.take().unwrap(); + suite.run(Task::RegisterMinTsEvent { + leader_resolver, + event_time: Instant::now(), + }); + suite + .task_rx + .recv_timeout(Duration::from_millis(1500)) + .unwrap() + .unwrap(); + let end_ts = block_on(ts_provider.async_get_ts()).unwrap(); + assert!(end_ts.into_inner() > start_ts.next().into_inner()); // may trigger more than once. + } + #[test] fn test_feature_gate() { let cfg = CdcConfig { @@ -1857,6 +1957,8 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), ); downstream.get_state().store(DownstreamState::Normal); // Enable batch resolved ts in the test. 
@@ -1870,9 +1972,10 @@ mod tests { let resolver = Resolver::new(1); let observe_id = suite.endpoint.capture_regions[&1].handle.id; suite.on_region_ready(observe_id, resolver, region.clone()); - suite.run(Task::MinTS { + suite.run(Task::MinTs { regions: vec![1], min_ts: TimeStamp::from(1), + current_ts: TimeStamp::zero(), }); let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) .unwrap() @@ -1892,6 +1995,8 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), ); downstream.get_state().store(DownstreamState::Normal); suite.add_region(2, 100); @@ -1905,9 +2010,10 @@ mod tests { region.set_id(2); let observe_id = suite.endpoint.capture_regions[&2].handle.id; suite.on_region_ready(observe_id, resolver, region); - suite.run(Task::MinTS { + suite.run(Task::MinTs { regions: vec![1, 2], min_ts: TimeStamp::from(2), + current_ts: TimeStamp::zero(), }); let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) .unwrap() @@ -1936,6 +2042,8 @@ mod tests { 3, conn_id, ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), ); downstream.get_state().store(DownstreamState::Normal); suite.add_region(3, 100); @@ -1949,9 +2057,10 @@ mod tests { region.set_id(3); let observe_id = suite.endpoint.capture_regions[&3].handle.id; suite.on_region_ready(observe_id, resolver, region); - suite.run(Task::MinTS { + suite.run(Task::MinTs { regions: vec![1, 2, 3], min_ts: TimeStamp::from(3), + current_ts: TimeStamp::zero(), }); let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) .unwrap() @@ -2005,6 +2114,8 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), ); let downstream_id = downstream.get_id(); suite.run(Task::Register { @@ -2047,6 +2158,8 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), ); let new_downstream_id = downstream.get_id(); suite.run(Task::Register { @@ -2064,7 +2177,7 @@ mod tests { 
err: Some(Error::request(err_header.clone())), }; suite.run(Task::Deregister(deregister)); - assert!(channel::recv_timeout(&mut rx, Duration::from_millis(200)).is_err()); + channel::recv_timeout(&mut rx, Duration::from_millis(200)).unwrap_err(); assert_eq!(suite.endpoint.capture_regions.len(), 1); let deregister = Deregister::Downstream { @@ -2098,6 +2211,8 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), ); suite.run(Task::Register { request: req, @@ -2108,8 +2223,8 @@ mod tests { assert_eq!(suite.endpoint.capture_regions.len(), 1); let deregister = Deregister::Delegate { region_id: 1, - // A stale ObserveID (different from the actual one). - observe_id: ObserveID::new(), + // A stale ObserveId (different from the actual one). + observe_id: ObserveId::new(), err: Error::request(err_header), }; suite.run(Task::Deregister(deregister)); @@ -2152,6 +2267,8 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), ); downstream.get_state().store(DownstreamState::Normal); suite.run(Task::Register { @@ -2182,9 +2299,10 @@ mod tests { } }; - suite.run(Task::MinTS { + suite.run(Task::MinTs { regions: vec![1], min_ts: TimeStamp::from(1), + current_ts: TimeStamp::zero(), }); // conn a must receive a resolved ts that only contains region 1. assert_batch_resolved_ts(conn_rxs.get_mut(0).unwrap(), vec![1], 1); @@ -2195,9 +2313,10 @@ mod tests { ) .unwrap_err(); - suite.run(Task::MinTS { + suite.run(Task::MinTs { regions: vec![1, 2], min_ts: TimeStamp::from(2), + current_ts: TimeStamp::zero(), }); // conn a must receive a resolved ts that contains region 1 and region 2. 
assert_batch_resolved_ts(conn_rxs.get_mut(0).unwrap(), vec![1, 2], 2); @@ -2208,18 +2327,20 @@ mod tests { ) .unwrap_err(); - suite.run(Task::MinTS { + suite.run(Task::MinTs { regions: vec![1, 2, 3], min_ts: TimeStamp::from(3), + current_ts: TimeStamp::zero(), }); // conn a must receive a resolved ts that contains region 1 and region 2. assert_batch_resolved_ts(conn_rxs.get_mut(0).unwrap(), vec![1, 2], 3); // conn b must receive a resolved ts that contains region 3. assert_batch_resolved_ts(conn_rxs.get_mut(1).unwrap(), vec![3], 3); - suite.run(Task::MinTS { + suite.run(Task::MinTs { regions: vec![1, 3], min_ts: TimeStamp::from(4), + current_ts: TimeStamp::zero(), }); // conn a must receive a resolved ts that only contains region 1. assert_batch_resolved_ts(conn_rxs.get_mut(0).unwrap(), vec![1], 4); @@ -2265,6 +2386,8 @@ mod tests { 0, conn_id_a, ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), ); suite.run(Task::Register { request: req.clone(), @@ -2288,6 +2411,8 @@ mod tests { 0, conn_id_b, ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), ); suite.run(Task::Register { request: req.clone(), @@ -2362,11 +2487,6 @@ mod tests { assert!(regions.contains(&5)); assert!(regions.contains(&6)); - // Empty regions - let (ts, regions) = heap.to_hash_set(); - assert_eq!(ts, TimeStamp::max()); - assert!(regions.is_empty()); - let mut heap1 = ResolvedRegionHeap { heap: BinaryHeap::new(), }; @@ -2380,13 +2500,6 @@ mod tests { assert_eq!(regions.len(), 1); assert!(regions.contains(&3)); - let (ts, regions) = heap1.to_hash_set(); - assert_eq!(ts, 4.into()); - assert_eq!(regions.len(), 3); - assert!(regions.contains(&4)); - assert!(regions.contains(&5)); - assert!(regions.contains(&6)); - heap1.reset_and_shrink_to(3); assert_eq!(3, heap1.heap.capacity()); assert!(heap1.heap.is_empty()); @@ -2395,4 +2508,92 @@ mod tests { heap1.clear(); assert!(heap1.heap.is_empty()); } + + #[test] + fn test_on_min_ts() { + let cfg = CdcConfig { + // Disable 
automatic advance resolved ts during test. + min_ts_interval: ReadableDuration(Duration::from_secs(1000)), + ..Default::default() + }; + let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); + let quota = crate::channel::MemoryQuota::new(usize::MAX); + let (tx, mut rx) = channel::channel(1, quota); + let mut rx = rx.drain(); + + let conn = Conn::new(tx, String::new()); + let conn_id = conn.get_id(); + suite.run(Task::OpenConn { conn }); + let mut req_header = Header::default(); + req_header.set_cluster_id(0); + + let mut regions = vec![]; + for id in 1..4097 { + regions.push(id); + suite.add_region(id, 100); + + let mut req = ChangeDataRequest::default(); + req.set_region_id(id); + let region_epoch = req.get_region_epoch().clone(); + let downstream = Downstream::new( + "".to_string(), + region_epoch.clone(), + id, + conn_id, + ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), + ); + on_init_downstream(&downstream.get_state()); + post_init_downstream(&downstream.get_state()); + // Enable batch resolved ts in the test. + let version = FeatureGate::batch_resolved_ts(); + suite.run(Task::Register { + request: req.clone(), + downstream, + conn_id, + version: version.clone(), + }); + + let mut resolver = Resolver::new(id); + resolver.track_lock(TimeStamp::compose(0, id), vec![], None); + let mut region = Region::default(); + region.id = id; + region.set_region_epoch(region_epoch); + let failed = suite + .capture_regions + .get_mut(&id) + .unwrap() + .on_region_ready(resolver, region); + assert!(failed.is_empty()); + } + suite + .task_rx + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + + suite.run(Task::MinTs { + regions, + min_ts: TimeStamp::compose(0, 4096), + current_ts: TimeStamp::compose(0, 4096), + }); + + // There should be at least 3 resolved ts events. 
+ let mut last_resolved_ts = 0; + let mut last_batch_count = 0; + for _ in 0..3 { + let event = recv_timeout(&mut rx, Duration::from_millis(100)) + .unwrap() + .unwrap() + .0; + assert!(last_resolved_ts < event.resolved_ts().ts, "{:?}", event); + assert!( + last_batch_count < event.resolved_ts().regions.len(), + "{:?}", + event + ); + last_resolved_ts = event.resolved_ts().ts; + last_batch_count = event.resolved_ts().regions.len(); + } + } } diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 9a06448afba..c06b13424ba 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -16,11 +16,11 @@ use kvproto::{ metapb::{Region, RegionEpoch}, }; use raftstore::{ - coprocessor::ObserveID, - router::RaftStoreRouter, + coprocessor::ObserveId, + router::CdcHandle, store::{ fsm::ChangeObserver, - msg::{Callback, ReadResponse, SignificantMsg}, + msg::{Callback, ReadResponse}, }, }; use resolved_ts::Resolver; @@ -47,11 +47,11 @@ use txn_types::{Key, KvPair, Lock, LockType, OldValue, TimeStamp}; use crate::{ channel::CdcEvent, - delegate::{post_init_downstream, Delegate, DownstreamID, DownstreamState}, + delegate::{post_init_downstream, Delegate, DownstreamId, DownstreamState, ObservedRange}, endpoint::Deregister, metrics::*, old_value::{near_seek_old_value, new_old_value_cursor, OldValueCursors}, - service::ConnID, + service::ConnId, Error, Result, Task, }; @@ -75,16 +75,17 @@ pub(crate) enum Scanner { } pub(crate) struct Initializer { - pub(crate) engine: E, + pub(crate) tablet: Option, pub(crate) sched: Scheduler, pub(crate) sink: crate::channel::Sink, + pub(crate) observed_range: ObservedRange, pub(crate) region_id: u64, pub(crate) region_epoch: RegionEpoch, - pub(crate) observe_id: ObserveID, - pub(crate) downstream_id: DownstreamID, + pub(crate) observe_id: ObserveId, + pub(crate) downstream_id: DownstreamId, pub(crate) downstream_state: Arc>, - pub(crate) conn_id: ConnID, + pub(crate) conn_id: ConnId, 
pub(crate) request_id: u64, pub(crate) checkpoint_ts: TimeStamp, @@ -96,13 +97,15 @@ pub(crate) struct Initializer { pub(crate) ts_filter_ratio: f64, pub(crate) kv_api: ChangeDataRequestKvApi, + + pub(crate) filter_loop: bool, } impl Initializer { - pub(crate) async fn initialize>( + pub(crate) async fn initialize>( &mut self, - change_cmd: ChangeObserver, - raft_router: T, + change_observer: ChangeObserver, + cdc_handle: T, concurrency_semaphore: Arc, ) -> Result<()> { fail_point!("cdc_before_initialize"); @@ -139,24 +142,22 @@ impl Initializer { let (incremental_scan_barrier_cb, incremental_scan_barrier_fut) = tikv_util::future::paired_future_callback(); let barrier = CdcEvent::Barrier(Some(incremental_scan_barrier_cb)); - if let Err(e) = raft_router.significant_send( + if let Err(e) = cdc_handle.capture_change( self.region_id, - SignificantMsg::CaptureChange { - cmd: change_cmd, - region_epoch, - callback: Callback::Read(Box::new(move |resp| { - if let Err(e) = sched.schedule(Task::InitDownstream { - region_id, - downstream_id, - downstream_state, - sink, - incremental_scan_barrier: barrier, - cb: Box::new(move || cb(resp)), - }) { - error!("cdc schedule cdc task failed"; "error" => ?e); - } - })), - }, + region_epoch, + change_observer, + Callback::read(Box::new(move |resp| { + if let Err(e) = sched.schedule(Task::InitDownstream { + region_id, + downstream_id, + downstream_state, + sink, + incremental_scan_barrier: barrier, + cb: Box::new(move || cb(resp)), + }) { + error!("cdc schedule cdc task failed"; "error" => ?e); + } + })), ) { warn!("cdc send capture change cmd failed"; "region_id" => self.region_id, "error" => ?e); @@ -204,10 +205,12 @@ impl Initializer { let region_id = region.get_id(); let observe_id = self.observe_id; let kv_api = self.kv_api; + self.observed_range.update_region_key_range(®ion); debug!("cdc async incremental scan"; "region_id" => region_id, "downstream_id" => ?downstream_id, "observe_id" => ?self.observe_id, + "all_key_covered" => 
?self.observed_range.all_key_covered, "start_key" => log_wrappers::Value::key(snap.lower_bound().unwrap_or_default()), "end_key" => log_wrappers::Value::key(snap.upper_bound().unwrap_or_default())); @@ -237,10 +240,13 @@ impl Initializer { Scanner::TxnKvScanner(txnkv_scanner) } else { let mut iter_opt = IterOptions::default(); + iter_opt.set_fill_cache(false); let (raw_key_prefix, raw_key_prefix_end) = ApiV2::get_rawkv_range(); iter_opt.set_lower_bound(&[raw_key_prefix], DATA_KEY_PREFIX_LEN); iter_opt.set_upper_bound(&[raw_key_prefix_end], DATA_KEY_PREFIX_LEN); - let mut iter = RawMvccSnapshot::from_snapshot(snap).iter(iter_opt).unwrap(); + let mut iter = RawMvccSnapshot::from_snapshot(snap) + .iter(CF_DEFAULT, iter_opt) + .unwrap(); iter.seek_to_first()?; Scanner::RawKvScanner(iter) @@ -303,8 +309,9 @@ impl Initializer { Ok(()) } - // It's extracted from `Initializer::scan_batch` to avoid becoming an asynchronous block, - // so that we can limit scan speed based on the thread disk I/O or RocksDB block read bytes. + // It's extracted from `Initializer::scan_batch` to avoid becoming an + // asynchronous block, so that we can limit scan speed based on the thread + // disk I/O or RocksDB block read bytes. 
fn do_scan( &self, scanner: &mut Scanner, @@ -421,8 +428,13 @@ impl Initializer { async fn sink_scan_events(&mut self, entries: Vec>, done: bool) -> Result<()> { let mut barrier = None; - let mut events = - Delegate::convert_to_grpc_events(self.region_id, self.request_id, entries)?; + let mut events = Delegate::convert_to_grpc_events( + self.region_id, + self.request_id, + entries, + self.filter_loop, + &self.observed_range, + )?; if done { let (cb, fut) = tikv_util::future::paired_future_callback(); events.push(CdcEvent::Barrier(Some(cb))); @@ -470,10 +482,10 @@ impl Initializer { pub(crate) fn deregister_downstream(&self, err: Error) { let deregister = if self.build_resolver || err.has_region_error() { // Deregister delegate on the conditions, - // * It fails to build a resolver. A delegate requires a resolver - // to advance resolved ts. - // * A region error. It usually mean a peer is not leader or - // a leader meets an error and can not serve. + // * It fails to build a resolver. A delegate requires a resolver to advance + // resolved ts. + // * A region error. It usually mean a peer is not leader or a leader meets an + // error and can not serve. 
Deregister::Delegate { region_id: self.region_id, observe_id: self.observe_id, @@ -501,7 +513,11 @@ impl Initializer { let start_key = data_key(snap.lower_bound().unwrap_or_default()); let end_key = data_end_key(snap.upper_bound().unwrap_or_default()); let range = Range::new(&start_key, &end_key); - let collection = match self.engine.table_properties_collection(CF_WRITE, &[range]) { + let tablet = match self.tablet.as_ref() { + Some(t) => t, + None => return false, + }; + let collection = match tablet.table_properties_collection(CF_WRITE, &[range]) { Ok(collection) => collection, Err(_) => return false, }; @@ -522,7 +538,7 @@ impl Initializer { }); let valid_count = total_count - filtered_count; - let use_ts_filter = valid_count as f64 / total_count as f64 <= self.ts_filter_ratio; + let use_ts_filter = valid_count as f64 <= total_count as f64 * self.ts_filter_ratio; info!("cdc incremental scan uses ts filter: {}", use_ts_filter; "region_id" => self.region_id, "hint_min_ts" => hint_min_ts, @@ -554,20 +570,28 @@ mod tests { use engine_rocks::RocksEngine; use engine_traits::{MiscExt, CF_WRITE}; use futures::{executor::block_on, StreamExt}; - use kvproto::{cdcpb::Event_oneof_event, errorpb::Error as ErrorHeader}; - use raftstore::{coprocessor::ObserveHandle, store::RegionSnapshot}; + use kvproto::{ + cdcpb::{EventLogType, Event_oneof_event}, + errorpb::Error as ErrorHeader, + }; + use raftstore::{coprocessor::ObserveHandle, router::CdcRaftRouter, store::RegionSnapshot}; use test_raftstore::MockRaftStoreRouter; use tikv::storage::{ kv::Engine, txn::tests::{ must_acquire_pessimistic_lock, must_commit, must_prewrite_delete, must_prewrite_put, + must_prewrite_put_with_txn_soucre, }, TestEngineBuilder, }; - use tikv_util::worker::{LazyWorker, Runnable}; + use tikv_util::{ + sys::thread::ThreadBuildWrapper, + worker::{LazyWorker, Runnable}, + }; use tokio::runtime::{Builder, Runtime}; use super::*; + use crate::txn_source::TxnSource; struct ReceiverRunnable { tx: Sender, @@ 
-594,6 +618,7 @@ mod tests { buffer: usize, engine: Option, kv_api: ChangeDataRequestKvApi, + filter_loop: bool, ) -> ( LazyWorker, Runtime, @@ -608,11 +633,13 @@ mod tests { let pool = Builder::new_multi_thread() .thread_name("test-initializer-worker") .worker_threads(4) + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) .build() .unwrap(); let downstream_state = Arc::new(AtomicCell::new(DownstreamState::Initializing)); let initializer = Initializer { - engine: engine.unwrap_or_else(|| { + tablet: engine.or_else(|| { TestEngineBuilder::new() .build_without_cache() .unwrap() @@ -620,13 +647,13 @@ mod tests { }), sched: receiver_worker.scheduler(), sink, - + observed_range: ObservedRange::default(), region_id: 1, region_epoch: RegionEpoch::default(), - observe_id: ObserveID::new(), - downstream_id: DownstreamID::new(), + observe_id: ObserveId::new(), + downstream_id: DownstreamId::new(), downstream_state, - conn_id: ConnID::new(), + conn_id: ConnId::new(), request_id: 0, checkpoint_ts: 1.into(), speed_limiter: Limiter::new(speed_limit as _), @@ -635,6 +662,7 @@ mod tests { build_resolver: true, ts_filter_ratio: 1.0, // always enable it. 
kv_api, + filter_loop, }; (receiver_worker, pool, initializer, rx, drain) @@ -642,17 +670,23 @@ mod tests { #[test] fn test_initializer_build_resolver() { - let engine = TestEngineBuilder::new().build_without_cache().unwrap(); + let mut engine = TestEngineBuilder::new().build_without_cache().unwrap(); let mut expected_locks = BTreeMap::>>::new(); + // Only observe ["", "b\0x90"] + let observed_range = ObservedRange::new( + Key::from_raw(&[]).into_encoded(), + Key::from_raw(&[b'k', 90]).into_encoded(), + ) + .unwrap(); let mut total_bytes = 0; // Pessimistic locks should not be tracked for i in 0..10 { let k = &[b'k', i]; total_bytes += k.len(); let ts = TimeStamp::new(i as _); - must_acquire_pessimistic_lock(&engine, k, k, ts, ts); + must_acquire_pessimistic_lock(&mut engine, k, k, ts, ts); } for i in 10..100 { @@ -660,7 +694,7 @@ mod tests { total_bytes += k.len(); total_bytes += v.len(); let ts = TimeStamp::new(i as _); - must_prewrite_put(&engine, k, v, k, ts); + must_prewrite_put(&mut engine, k, v, k, ts); expected_locks .entry(ts) .or_default() @@ -674,9 +708,11 @@ mod tests { let (mut worker, pool, mut initializer, rx, mut drain) = mock_initializer( total_bytes, buffer, - Some(engine.kv_engine()), + engine.kv_engine(), ChangeDataRequestKvApi::TiDb, + false, ); + initializer.observed_range = observed_range.clone(); let check_result = || loop { let task = rx.recv().unwrap(); match task { @@ -690,7 +726,14 @@ mod tests { // To not block test by barrier. 
pool.spawn(async move { let mut d = drain.drain(); - while d.next().await.is_some() {} + while let Some((e, _)) = d.next().await { + if let CdcEvent::Event(e) = e { + for e in e.get_entries().get_entries() { + let key = Key::from_raw(&e.key).into_encoded(); + assert!(observed_range.contains_encoded_key(&key), "{:?}", e); + } + } + } }); block_on(initializer.async_incremental_scan(snap.clone(), region.clone())).unwrap(); @@ -744,13 +787,86 @@ mod tests { worker.stop(); } + fn test_initializer_txn_source_filter(txn_source: TxnSource, filter_loop: bool) { + let mut engine = TestEngineBuilder::new().build_without_cache().unwrap(); + + let mut total_bytes = 0; + for i in 10..100 { + let (k, v) = (&[b'k', i], &[b'v', i]); + total_bytes += k.len(); + total_bytes += v.len(); + let ts = TimeStamp::new(i as _); + must_prewrite_put_with_txn_soucre(&mut engine, k, v, k, ts, txn_source.into()); + } + + let snap = engine.snapshot(Default::default()).unwrap(); + // Buffer must be large enough to unblock async incremental scan. 
+ let buffer = 1000; + let (mut worker, pool, mut initializer, _rx, mut drain) = mock_initializer( + total_bytes, + buffer, + engine.kv_engine(), + ChangeDataRequestKvApi::TiDb, + filter_loop, + ); + let th = pool.spawn(async move { + initializer + .async_incremental_scan(snap, Region::default()) + .await + .unwrap(); + }); + let mut drain = drain.drain(); + while let Some((event, _)) = block_on(drain.next()) { + let event = match event { + CdcEvent::Event(x) if x.event.is_some() => x.event.unwrap(), + _ => continue, + }; + let entries = match event { + Event_oneof_event::Entries(mut x) => x.take_entries().into_vec(), + _ => continue, + }; + assert_eq!(entries.len(), 1); + assert_eq!(entries[0].get_type(), EventLogType::Initialized); + } + block_on(th).unwrap(); + worker.stop(); + } + + #[test] + fn test_initializer_cdc_write_filter() { + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + test_initializer_txn_source_filter(txn_source, true); + } + + #[test] + fn test_initializer_lossy_ddl_filter() { + let mut txn_source = TxnSource::default(); + txn_source.set_lossy_ddl_reorg_source(1); + test_initializer_txn_source_filter(txn_source, false); + + // With cdr write source and filter loop is false, we should still ignore lossy + // ddl changes. + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + txn_source.set_lossy_ddl_reorg_source(1); + test_initializer_txn_source_filter(txn_source, false); + + // With cdr write source and filter loop is true, we should still ignore all + // events. + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + txn_source.set_lossy_ddl_reorg_source(1); + test_initializer_txn_source_filter(txn_source, true); + } + // Test `hint_min_ts` works fine with `ExtraOp::ReadOldValue`. 
// Whether `DeltaScanner` emits correct old values or not is already tested by // another case `test_old_value_with_hint_min_ts`, so here we only care about // handling `OldValue::SeekWrite` with `OldValueReader`. #[test] fn test_incremental_scanner_with_hint_min_ts() { - let engine = TestEngineBuilder::new().build_without_cache().unwrap(); + let mut engine = TestEngineBuilder::new().build_without_cache().unwrap(); let v_suffix = |suffix: usize| -> Vec { let suffix = suffix.to_string().into_bytes(); @@ -760,14 +876,19 @@ mod tests { v }; - let check_handling_old_value_seek_write = || { + fn check_handling_old_value_seek_write(engine: &mut E, v_suffix: F) + where + E: Engine, + F: Fn(usize) -> Vec, + { // Do incremental scan with different `hint_min_ts` values. for checkpoint_ts in [200, 100, 150] { let (mut worker, pool, mut initializer, _rx, mut drain) = mock_initializer( usize::MAX, 1000, - Some(engine.kv_engine()), + engine.kv_engine(), ChangeDataRequestKvApi::TiDb, + false, ); initializer.checkpoint_ts = checkpoint_ts.into(); let mut drain = drain.drain(); @@ -797,29 +918,42 @@ mod tests { block_on(th).unwrap(); worker.stop(); } - }; + } // Create the initial data with CF_WRITE L0: |zkey_110, zkey1_160| - must_prewrite_put(&engine, b"zkey", &v_suffix(100), b"zkey", 100); - must_commit(&engine, b"zkey", 100, 110); - must_prewrite_put(&engine, b"zzzz", &v_suffix(150), b"zzzz", 150); - must_commit(&engine, b"zzzz", 150, 160); - engine.kv_engine().flush_cf(CF_WRITE, true).unwrap(); - must_prewrite_delete(&engine, b"zkey", b"zkey", 200); - check_handling_old_value_seek_write(); // For TxnEntry::Prewrite. 
+ must_prewrite_put(&mut engine, b"zkey", &v_suffix(100), b"zkey", 100); + must_commit(&mut engine, b"zkey", 100, 110); + must_prewrite_put(&mut engine, b"zzzz", &v_suffix(150), b"zzzz", 150); + must_commit(&mut engine, b"zzzz", 150, 160); + engine + .kv_engine() + .unwrap() + .flush_cf(CF_WRITE, true) + .unwrap(); + must_prewrite_delete(&mut engine, b"zkey", b"zkey", 200); + check_handling_old_value_seek_write(&mut engine, v_suffix); // For TxnEntry::Prewrite. // CF_WRITE L0: |zkey_110, zkey1_160|, |zkey_210| - must_commit(&engine, b"zkey", 200, 210); - engine.kv_engine().flush_cf(CF_WRITE, false).unwrap(); - check_handling_old_value_seek_write(); // For TxnEntry::Commit. + must_commit(&mut engine, b"zkey", 200, 210); + engine + .kv_engine() + .unwrap() + .flush_cf(CF_WRITE, false) + .unwrap(); + check_handling_old_value_seek_write(&mut engine, v_suffix); // For TxnEntry::Commit. } #[test] fn test_initializer_deregister_downstream() { let total_bytes = 1; let buffer = 1; - let (mut worker, _pool, mut initializer, rx, _drain) = - mock_initializer(total_bytes, buffer, None, ChangeDataRequestKvApi::TiDb); + let (mut worker, _pool, mut initializer, rx, _drain) = mock_initializer( + total_bytes, + buffer, + None, + ChangeDataRequestKvApi::TiDb, + false, + ); // Errors reported by region should deregister region. 
initializer.build_resolver = false; @@ -869,10 +1003,10 @@ mod tests { let total_bytes = 1; let buffer = 1; let (mut worker, pool, mut initializer, _rx, _drain) = - mock_initializer(total_bytes, buffer, None, kv_api); + mock_initializer(total_bytes, buffer, None, kv_api, false); let change_cmd = ChangeObserver::from_cdc(1, ObserveHandle::new()); - let raft_router = MockRaftStoreRouter::new(); + let raft_router = CdcRaftRouter(MockRaftStoreRouter::new()); let concurrency_semaphore = Arc::new(Semaphore::new(1)); initializer.downstream_state.store(DownstreamState::Stopped); diff --git a/components/cdc/src/lib.rs b/components/cdc/src/lib.rs index 7d63bf5c115..c913cefb92e 100644 --- a/components/cdc/src/lib.rs +++ b/components/cdc/src/lib.rs @@ -13,6 +13,7 @@ pub mod metrics; mod observer; mod old_value; mod service; +mod txn_source; pub use channel::{recv_timeout, CdcEvent, MemoryQuota}; pub use config::CdcConfigManager; diff --git a/components/cdc/src/metrics.rs b/components/cdc/src/metrics.rs index 55a0124e567..5db91572112 100644 --- a/components/cdc/src/metrics.rs +++ b/components/cdc/src/metrics.rs @@ -8,9 +8,9 @@ use prometheus::*; use prometheus_static_metric::*; use tikv::storage::Statistics; -/// Installing a new capture contains 2 phases, one for incremental scanning and one for -/// fetching delta changes from raftstore. They can share some similar metrics, in which -/// case we can use this tag to distinct them. +/// Installing a new capture contains 2 phases, one for incremental scanning and +/// one for fetching delta changes from raftstore. They can share some similar +/// metrics, in which case we can use this tag to distinct them. pub const TAG_DELTA_CHANGE: &str = "delta_change"; pub const TAG_INCREMENTAL_SCAN: &str = "incremental_scan"; @@ -108,6 +108,10 @@ lazy_static! 
{ "The region which has minimal resolved ts" ) .unwrap(); + pub static ref CDC_MIN_RESOLVED_TS_LAG: IntGauge = register_int_gauge!( + "tikv_cdc_min_resolved_ts_lag", + "The lag between the minimal resolved ts and the current ts" + ).unwrap(); pub static ref CDC_MIN_RESOLVED_TS: IntGauge = register_int_gauge!( "tikv_cdc_min_resolved_ts", "The minimal resolved ts for current regions" @@ -201,6 +205,13 @@ lazy_static! { ) .unwrap(); + pub static ref CDC_RAW_OUTLIER_RESOLVED_TS_GAP: Histogram = register_histogram!( + "tikv_cdc_raw_outlier_resolved_ts_gap_seconds", + "Bucketed histogram of the gap between cdc raw outlier resolver_ts and current tso", + exponential_buckets(1.0, 2.0, 15).unwrap() // outlier threshold is 60s by default. + ) + .unwrap(); + pub static ref CDC_ROCKSDB_PERF_COUNTER_STATIC: PerfCounter = auto_flush_from!(CDC_ROCKSDB_PERF_COUNTER, PerfCounter); } diff --git a/components/cdc/src/observer.rs b/components/cdc/src/observer.rs index cf8503450c5..aac2842e404 100644 --- a/components/cdc/src/observer.rs +++ b/components/cdc/src/observer.rs @@ -27,7 +27,7 @@ pub struct CdcObserver { sched: Scheduler, // A shared registry for managing observed regions. // TODO: it may become a bottleneck, find a better way to manage the registry. - observe_regions: Arc>>, + observe_regions: Arc>>, } impl CdcObserver { @@ -43,8 +43,8 @@ impl CdcObserver { } pub fn register_to(&self, coprocessor_host: &mut CoprocessorHost) { - // use 0 as the priority of the cmd observer. CDC should have a higher priority than - // the `resolved-ts`'s cmd observer + // use 0 as the priority of the cmd observer. CDC should have a higher priority + // than the `resolved-ts`'s cmd observer coprocessor_host .registry .register_cmd_observer(0, BoxCmdObserver::new(self.clone())); @@ -59,8 +59,8 @@ impl CdcObserver { /// Subscribe an region, the observer will sink events of the region into /// its scheduler. /// - /// Return previous ObserveID if there is one. 
- pub fn subscribe_region(&self, region_id: u64, observe_id: ObserveID) -> Option { + /// Return previous ObserveId if there is one. + pub fn subscribe_region(&self, region_id: u64, observe_id: ObserveId) -> Option { self.observe_regions .write() .unwrap() @@ -70,9 +70,9 @@ impl CdcObserver { /// Stops observe the region. /// /// Return ObserverID if unsubscribe successfully. - pub fn unsubscribe_region(&self, region_id: u64, observe_id: ObserveID) -> Option { + pub fn unsubscribe_region(&self, region_id: u64, observe_id: ObserveId) -> Option { let mut regions = self.observe_regions.write().unwrap(); - // To avoid ABA problem, we must check the unique ObserveID. + // To avoid ABA problem, we must check the unique ObserveId. if let Some(oid) = regions.get(®ion_id) { if *oid == observe_id { return regions.remove(®ion_id); @@ -82,7 +82,7 @@ impl CdcObserver { } /// Check whether the region is subscribed or not. - pub fn is_subscribed(&self, region_id: u64) -> Option { + pub fn is_subscribed(&self, region_id: u64) -> Option { self.observe_regions .read() .unwrap() @@ -94,7 +94,8 @@ impl CdcObserver { impl Coprocessor for CdcObserver {} impl CmdObserver for CdcObserver { - // `CdcObserver::on_flush_applied_cmd_batch` should only invoke if `cmd_batches` is not empty + // `CdcObserver::on_flush_applied_cmd_batch` should only invoke if `cmd_batches` + // is not empty fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, @@ -103,6 +104,7 @@ impl CmdObserver for CdcObserver { ) { assert!(!cmd_batches.is_empty()); fail_point!("before_cdc_flush_apply"); + if max_level < ObserveLevel::All { return; } @@ -117,7 +119,8 @@ impl CmdObserver for CdcObserver { let mut region = Region::default(); region.mut_peers().push(Peer::default()); // Create a snapshot here for preventing the old value was GC-ed. - // TODO: only need it after enabling old value, may add a flag to indicate whether to get it. 
+ // TODO: only need it after enabling old value, may add a flag to indicate + // whether to get it. let snapshot = RegionSnapshot::from_snapshot(Arc::new(engine.snapshot()), Arc::new(region)); let get_old_value = move |key, query_ts, @@ -198,8 +201,9 @@ mod tests { use engine_rocks::RocksEngine; use kvproto::metapb::Region; - use raftstore::{coprocessor::RoleChange, store::util::new_peer}; + use raftstore::coprocessor::RoleChange; use tikv::storage::kv::TestEngineBuilder; + use tikv_util::store::new_peer; use super::*; @@ -256,7 +260,7 @@ mod tests { observer.on_role_change(&mut ctx, &RoleChange::new(StateRole::Follower)); rx.recv_timeout(Duration::from_millis(10)).unwrap_err(); - let oid = ObserveID::new(); + let oid = ObserveId::new(); observer.subscribe_region(1, oid); let mut ctx = ObserverContext::new(®ion); @@ -268,6 +272,8 @@ mod tests { leader_id: 2, prev_lead_transferee: raft::INVALID_ID, vote: raft::INVALID_ID, + initialized: true, + peer_id: raft::INVALID_ID, }, ); match rx.recv_timeout(Duration::from_millis(10)).unwrap().unwrap() { @@ -295,6 +301,8 @@ mod tests { leader_id: raft::INVALID_ID, prev_lead_transferee: 3, vote: 3, + initialized: true, + peer_id: raft::INVALID_ID, }, ); match rx.recv_timeout(Duration::from_millis(10)).unwrap().unwrap() { @@ -319,7 +327,7 @@ mod tests { rx.recv_timeout(Duration::from_millis(10)).unwrap_err(); // unsubscribed fail if observer id is different. - assert_eq!(observer.unsubscribe_region(1, ObserveID::new()), None); + assert_eq!(observer.unsubscribe_region(1, ObserveId::new()), None); // No event if it is unsubscribed. let oid_ = observer.unsubscribe_region(1, oid).unwrap(); diff --git a/components/cdc/src/old_value.rs b/components/cdc/src/old_value.rs index caf3060591e..d91266c92c2 100644 --- a/components/cdc/src/old_value.rs +++ b/components/cdc/src/old_value.rs @@ -1,6 +1,6 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::ops::Deref; +use std::ops::{Bound, Deref}; use engine_traits::{ReadOptions, CF_DEFAULT, CF_WRITE}; use getset::CopyGetters; @@ -104,8 +104,8 @@ impl OldValueCache { } } -/// Fetch old value for `key`. If it can't be found in `old_value_cache`, seek and retrieve it with -/// `query_ts` from `snapshot`. +/// Fetch old value for `key`. If it can't be found in `old_value_cache`, seek +/// and retrieve it with `query_ts` from `snapshot`. pub fn get_old_value( snapshot: &S, key: Key, @@ -171,9 +171,10 @@ pub fn new_old_value_cursor(snapshot: &S, cf: &'static str) - /// Gets the latest value to the key with an older or equal version. /// -/// The key passed in should be a key with a timestamp. This function will returns -/// the latest value of the entry if the user key is the same to the given key and -/// the timestamp is older than or equal to the timestamp in the given key. +/// The key passed in should be a key with a timestamp. This function will +/// returns the latest value of the entry if the user key is the same to the +/// given key and the timestamp is older than or equal to the timestamp in the +/// given key. /// /// `load_from_cf_data` indicates how to get value from `CF_DEFAULT`. pub fn near_seek_old_value( @@ -260,7 +261,7 @@ fn new_write_cursor_on_key(snapshot: &S, key: &Key) -> Cursor .range(Some(key.clone()), upper) // Use bloom filter to speed up seeking on a given prefix. 
.prefix_seek(true) - .hint_max_ts(Some(ts)) + .hint_max_ts(Some(Bound::Included(ts))) .build() .unwrap() } @@ -292,6 +293,7 @@ mod tests { use engine_rocks::{ReadPerfInstant, RocksEngine}; use engine_traits::{KvEngine, MiscExt}; + use kvproto::kvrpcpb::PrewriteRequestPessimisticAction::*; use tikv::{ config::DbConfig, storage::{kv::TestEngineBuilder, txn::tests::*}, @@ -339,8 +341,8 @@ mod tests { old_value_cache.cache.insert(key, value.clone()); } - assert_eq!(old_value_cache.cache.size(), size * cases as usize); - assert_eq!(old_value_cache.cache.len(), cases as usize); + assert_eq!(old_value_cache.cache.size(), size * cases); + assert_eq!(old_value_cache.cache.len(), cases); assert_eq!(old_value_cache.capacity(), capacity as usize); // Reduces capacity. @@ -358,7 +360,7 @@ mod tests { assert_eq!(old_value_cache.cache.size(), size * remaining_count); assert_eq!(old_value_cache.cache.len(), remaining_count); - assert_eq!(old_value_cache.capacity(), new_capacity as usize); + assert_eq!(old_value_cache.capacity(), new_capacity); for i in dropped_count..cases { let key = Key::from_raw(&i.to_be_bytes()); assert_eq!(old_value_cache.cache.get(&key).is_some(), true); @@ -379,120 +381,120 @@ mod tests { #[test] fn test_old_value_reader() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let kv_engine = engine.get_rocksdb(); let k = b"k"; let key = Key::from_raw(k); - must_prewrite_put(&engine, k, b"v1", k, 1); + must_prewrite_put(&mut engine, k, b"v1", k, 1); must_get_eq(&kv_engine, &key, 2, None); must_get_eq(&kv_engine, &key, 1, None); - must_commit(&engine, k, 1, 1); + must_commit(&mut engine, k, 1, 1); must_get_eq(&kv_engine, &key, 1, Some(b"v1".to_vec())); - must_prewrite_put(&engine, k, b"v2", k, 2); + must_prewrite_put(&mut engine, k, b"v2", k, 2); must_get_eq(&kv_engine, &key, 2, Some(b"v1".to_vec())); - must_rollback(&engine, k, 2, false); + must_rollback(&mut engine, k, 2, false); - 
must_prewrite_put(&engine, k, b"v3", k, 3); + must_prewrite_put(&mut engine, k, b"v3", k, 3); must_get_eq(&kv_engine, &key, 3, Some(b"v1".to_vec())); - must_commit(&engine, k, 3, 3); + must_commit(&mut engine, k, 3, 3); - must_prewrite_delete(&engine, k, k, 4); + must_prewrite_delete(&mut engine, k, k, 4); must_get_eq(&kv_engine, &key, 4, Some(b"v3".to_vec())); - must_commit(&engine, k, 4, 4); + must_commit(&mut engine, k, 4, 4); - must_prewrite_put(&engine, k, vec![b'v'; 5120].as_slice(), k, 5); + must_prewrite_put(&mut engine, k, vec![b'v'; 5120].as_slice(), k, 5); must_get_eq(&kv_engine, &key, 5, None); - must_commit(&engine, k, 5, 5); + must_commit(&mut engine, k, 5, 5); - must_prewrite_delete(&engine, k, k, 6); + must_prewrite_delete(&mut engine, k, k, 6); must_get_eq(&kv_engine, &key, 6, Some(vec![b'v'; 5120])); - must_rollback(&engine, k, 6, false); + must_rollback(&mut engine, k, 6, false); - must_prewrite_put(&engine, k, b"v4", k, 7); - must_commit(&engine, k, 7, 9); + must_prewrite_put(&mut engine, k, b"v4", k, 7); + must_commit(&mut engine, k, 7, 9); - must_acquire_pessimistic_lock(&engine, k, k, 8, 10); - must_pessimistic_prewrite_put(&engine, k, b"v5", k, 8, 10, true); + must_acquire_pessimistic_lock(&mut engine, k, k, 8, 10); + must_pessimistic_prewrite_put(&mut engine, k, b"v5", k, 8, 10, DoPessimisticCheck); must_get_eq(&kv_engine, &key, 10, Some(b"v4".to_vec())); - must_commit(&engine, k, 8, 11); + must_commit(&mut engine, k, 8, 11); } #[test] fn test_old_value_reader_check_gc_fence() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let kv_engine = engine.get_rocksdb(); // PUT, Read // `--------------^ - must_prewrite_put(&engine, b"k1", b"v1", b"k1", 10); - must_commit(&engine, b"k1", 10, 20); - must_cleanup_with_gc_fence(&engine, b"k1", 20, 0, 50, true); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 10); + must_commit(&mut engine, b"k1", 10, 20); + 
must_cleanup_with_gc_fence(&mut engine, b"k1", 20, 0, 50, true); // PUT, Read // `---------^ - must_prewrite_put(&engine, b"k2", b"v2", b"k2", 11); - must_commit(&engine, b"k2", 11, 20); - must_cleanup_with_gc_fence(&engine, b"k2", 20, 0, 40, true); + must_prewrite_put(&mut engine, b"k2", b"v2", b"k2", 11); + must_commit(&mut engine, b"k2", 11, 20); + must_cleanup_with_gc_fence(&mut engine, b"k2", 20, 0, 40, true); // PUT, Read // `-----^ - must_prewrite_put(&engine, b"k3", b"v3", b"k3", 12); - must_commit(&engine, b"k3", 12, 20); - must_cleanup_with_gc_fence(&engine, b"k3", 20, 0, 30, true); + must_prewrite_put(&mut engine, b"k3", b"v3", b"k3", 12); + must_commit(&mut engine, b"k3", 12, 20); + must_cleanup_with_gc_fence(&mut engine, b"k3", 20, 0, 30, true); // PUT, PUT, Read // `-----^ `----^ - must_prewrite_put(&engine, b"k4", b"v4", b"k4", 13); - must_commit(&engine, b"k4", 13, 14); - must_prewrite_put(&engine, b"k4", b"v4x", b"k4", 15); - must_commit(&engine, b"k4", 15, 20); - must_cleanup_with_gc_fence(&engine, b"k4", 14, 0, 20, false); - must_cleanup_with_gc_fence(&engine, b"k4", 20, 0, 30, true); + must_prewrite_put(&mut engine, b"k4", b"v4", b"k4", 13); + must_commit(&mut engine, b"k4", 13, 14); + must_prewrite_put(&mut engine, b"k4", b"v4x", b"k4", 15); + must_commit(&mut engine, b"k4", 15, 20); + must_cleanup_with_gc_fence(&mut engine, b"k4", 14, 0, 20, false); + must_cleanup_with_gc_fence(&mut engine, b"k4", 20, 0, 30, true); // PUT, DEL, Read // `-----^ `----^ - must_prewrite_put(&engine, b"k5", b"v5", b"k5", 13); - must_commit(&engine, b"k5", 13, 14); - must_prewrite_delete(&engine, b"k5", b"v5", 15); - must_commit(&engine, b"k5", 15, 20); - must_cleanup_with_gc_fence(&engine, b"k5", 14, 0, 20, false); - must_cleanup_with_gc_fence(&engine, b"k5", 20, 0, 30, true); + must_prewrite_put(&mut engine, b"k5", b"v5", b"k5", 13); + must_commit(&mut engine, b"k5", 13, 14); + must_prewrite_delete(&mut engine, b"k5", b"v5", 15); + must_commit(&mut engine, b"k5", 
15, 20); + must_cleanup_with_gc_fence(&mut engine, b"k5", 14, 0, 20, false); + must_cleanup_with_gc_fence(&mut engine, b"k5", 20, 0, 30, true); // PUT, LOCK, LOCK, Read // `------------------------^ - must_prewrite_put(&engine, b"k6", b"v6", b"k6", 16); - must_commit(&engine, b"k6", 16, 20); - must_prewrite_lock(&engine, b"k6", b"k6", 25); - must_commit(&engine, b"k6", 25, 26); - must_prewrite_lock(&engine, b"k6", b"k6", 28); - must_commit(&engine, b"k6", 28, 29); - must_cleanup_with_gc_fence(&engine, b"k6", 20, 0, 50, true); + must_prewrite_put(&mut engine, b"k6", b"v6", b"k6", 16); + must_commit(&mut engine, b"k6", 16, 20); + must_prewrite_lock(&mut engine, b"k6", b"k6", 25); + must_commit(&mut engine, b"k6", 25, 26); + must_prewrite_lock(&mut engine, b"k6", b"k6", 28); + must_commit(&mut engine, b"k6", 28, 29); + must_cleanup_with_gc_fence(&mut engine, b"k6", 20, 0, 50, true); // PUT, LOCK, LOCK, Read // `---------^ - must_prewrite_put(&engine, b"k7", b"v7", b"k7", 16); - must_commit(&engine, b"k7", 16, 20); - must_prewrite_lock(&engine, b"k7", b"k7", 25); - must_commit(&engine, b"k7", 25, 26); - must_cleanup_with_gc_fence(&engine, b"k7", 20, 0, 27, true); - must_prewrite_lock(&engine, b"k7", b"k7", 28); - must_commit(&engine, b"k7", 28, 29); + must_prewrite_put(&mut engine, b"k7", b"v7", b"k7", 16); + must_commit(&mut engine, b"k7", 16, 20); + must_prewrite_lock(&mut engine, b"k7", b"k7", 25); + must_commit(&mut engine, b"k7", 25, 26); + must_cleanup_with_gc_fence(&mut engine, b"k7", 20, 0, 27, true); + must_prewrite_lock(&mut engine, b"k7", b"k7", 28); + must_commit(&mut engine, b"k7", 28, 29); // PUT, Read // * (GC fence ts is 0) - must_prewrite_put(&engine, b"k8", b"v8", b"k8", 17); - must_commit(&engine, b"k8", 17, 30); - must_cleanup_with_gc_fence(&engine, b"k8", 30, 0, 0, true); + must_prewrite_put(&mut engine, b"k8", b"v8", b"k8", 17); + must_commit(&mut engine, b"k8", 17, 30); + must_cleanup_with_gc_fence(&mut engine, b"k8", 30, 0, 0, true); // PUT, 
LOCK, Read // `-----------^ - must_prewrite_put(&engine, b"k9", b"v9", b"k9", 18); - must_commit(&engine, b"k9", 18, 20); - must_prewrite_lock(&engine, b"k9", b"k9", 25); - must_commit(&engine, b"k9", 25, 26); - must_cleanup_with_gc_fence(&engine, b"k9", 20, 0, 27, true); + must_prewrite_put(&mut engine, b"k9", b"v9", b"k9", 18); + must_commit(&mut engine, b"k9", 18, 20); + must_prewrite_lock(&mut engine, b"k9", b"k9", 25); + must_commit(&mut engine, b"k9", 25, 26); + must_cleanup_with_gc_fence(&mut engine, b"k9", 20, 0, 27, true); let expected_results = vec![ (b"k1", Some(b"v1")), @@ -513,16 +515,16 @@ mod tests { #[test] fn test_old_value_reuse_cursor() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let kv_engine = engine.get_rocksdb(); let value = || vec![b'v'; 1024]; for i in 0..100 { let key = format!("key-{:0>3}", i).into_bytes(); - must_prewrite_put(&engine, &key, &value(), &key, 100); - must_commit(&engine, &key, 100, 101); - must_prewrite_put(&engine, &key, &value(), &key, 200); - must_commit(&engine, &key, 200, 201); + must_prewrite_put(&mut engine, &key, &value(), &key, 100); + must_commit(&mut engine, &key, 100, 101); + must_prewrite_put(&mut engine, &key, &value(), &key, 200); + must_commit(&mut engine, &key, 200, 201); } let snapshot = Arc::new(kv_engine.snapshot()); @@ -584,14 +586,14 @@ mod tests { let mut cfg = DbConfig::default(); cfg.writecf.disable_auto_compactions = true; cfg.writecf.pin_l0_filter_and_index_blocks = false; - let engine = TestEngineBuilder::new().build_with_cfg(&cfg).unwrap(); + let mut engine = TestEngineBuilder::new().build_with_cfg(&cfg).unwrap(); let kv_engine = engine.get_rocksdb(); // Key must start with `z` to pass `TsFilter`'s check. 
for i in 0..4 { let key = format!("zkey-{:0>3}", i).into_bytes(); - must_prewrite_put(&engine, &key, b"value", &key, 100); - must_commit(&engine, &key, 100, 101); + must_prewrite_put(&mut engine, &key, b"value", &key, 100); + must_commit(&mut engine, &key, 100, 101); kv_engine.flush_cf(CF_WRITE, true).unwrap(); } diff --git a/components/cdc/src/service.rs b/components/cdc/src/service.rs index 80d0f8c47a4..215f2cdebca 100644 --- a/components/cdc/src/service.rs +++ b/components/cdc/src/service.rs @@ -26,7 +26,7 @@ use tikv_util::{error, info, warn, worker::*}; use crate::{ channel::{channel, MemoryQuota, Sink, CDC_CHANNLE_CAPACITY}, - delegate::{Downstream, DownstreamID, DownstreamState}, + delegate::{Downstream, DownstreamId, DownstreamState, ObservedRange}, endpoint::{Deregister, Task}, }; @@ -34,15 +34,15 @@ static CONNECTION_ID_ALLOC: AtomicUsize = AtomicUsize::new(0); /// A unique identifier of a Connection. #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] -pub struct ConnID(usize); +pub struct ConnId(usize); -impl ConnID { - pub fn new() -> ConnID { - ConnID(CONNECTION_ID_ALLOC.fetch_add(1, Ordering::SeqCst)) +impl ConnId { + pub fn new() -> ConnId { + ConnId(CONNECTION_ID_ALLOC.fetch_add(1, Ordering::SeqCst)) } } -impl Default for ConnID { +impl Default for ConnId { fn default() -> Self { Self::new() } @@ -74,10 +74,10 @@ impl FeatureGate { } pub struct Conn { - id: ConnID, + id: ConnId, sink: Sink, - // region id -> DownstreamID - downstreams: HashMap>)>, + // region id -> DownstreamId + downstreams: HashMap>)>, peer: String, version: Option<(semver::Version, FeatureGate)>, } @@ -85,7 +85,7 @@ pub struct Conn { impl Conn { pub fn new(sink: Sink, peer: String) -> Conn { Conn { - id: ConnID::new(), + id: ConnId::new(), sink, downstreams: HashMap::default(), version: None, @@ -132,19 +132,19 @@ impl Conn { &self.peer } - pub fn get_id(&self) -> ConnID { + pub fn get_id(&self) -> ConnId { self.id } pub fn get_downstreams( &self, - ) -> &HashMap>)> { + ) -> 
&HashMap>)> { &self.downstreams } pub fn take_downstreams( self, - ) -> HashMap>)> { + ) -> HashMap>)> { self.downstreams } @@ -155,7 +155,7 @@ impl Conn { pub fn subscribe( &mut self, region_id: u64, - downstream_id: DownstreamID, + downstream_id: DownstreamId, downstream_state: Arc>, ) -> bool { match self.downstreams.entry(region_id) { @@ -171,7 +171,7 @@ impl Conn { self.downstreams.remove(®ion_id); } - pub fn downstream_id(&self, region_id: u64) -> Option { + pub fn downstream_id(&self, region_id: u64) -> Option { self.downstreams.get(®ion_id).map(|x| x.0) } } @@ -207,7 +207,7 @@ impl ChangeData for Service { let (event_sink, mut event_drain) = channel(CDC_CHANNLE_CAPACITY, self.memory_quota.clone()); let peer = ctx.peer(); - let conn = Conn::new(event_sink, peer); + let conn = Conn::new(event_sink, peer.clone()); let conn_id = conn.get_id(); if let Err(status) = self @@ -217,11 +217,12 @@ impl ChangeData for Service { RpcStatus::with_message(RpcStatusCode::INVALID_ARGUMENT, format!("{:?}", e)) }) { - error!("cdc connection initiate failed"; "error" => ?status); - ctx.spawn( - sink.fail(status) - .unwrap_or_else(|e| error!("cdc failed to send error"; "error" => ?e)), - ); + error!("cdc connection initiate failed"; + "downstream" => ?peer, "error" => ?status); + ctx.spawn(sink.fail(status).unwrap_or_else(move |e| { + error!("cdc failed to send error"; + "downstream" => ?peer, "error" => ?e) + })); return; } @@ -236,12 +237,29 @@ impl ChangeData for Service { Err(e) => { warn!("empty or invalid TiCDC version, please upgrading TiCDC"; "version" => request.get_header().get_ticdc_version(), + "downstream" => ?peer, "error" => ?e); semver::Version::new(0, 0, 0) } }; - let downstream = - Downstream::new(peer.clone(), region_epoch, req_id, conn_id, req_kvapi); + let observed_range = + match ObservedRange::new(request.start_key.clone(), request.end_key.clone()) { + Ok(observed_range) => observed_range, + Err(e) => { + warn!("cdc invalid observed start key or end key 
version"; + "downstream" => ?peer, "error" => ?e); + ObservedRange::default() + } + }; + let downstream = Downstream::new( + peer.clone(), + region_epoch, + req_id, + conn_id, + req_kvapi, + request.filter_loop, + observed_range, + ); let ret = scheduler .schedule(Task::Register { request, diff --git a/components/cdc/src/txn_source.rs b/components/cdc/src/txn_source.rs new file mode 100644 index 00000000000..81dc9f95096 --- /dev/null +++ b/components/cdc/src/txn_source.rs @@ -0,0 +1,116 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +// The bitmap: +// |RESERVED|LOSSY_DDL_REORG_SOURCE_BITS|CDC_WRITE_SOURCE_BITS| +// | 48 | 8 | 4(RESERVED) | 4 | +// +// TiCDC uses 1 - 255 to indicate the source of TiDB. +// For now, 1 - 15 are reserved for TiCDC to implement BDR synchronization. +// 16 - 255 are reserved for extendability. +const CDC_WRITE_SOURCE_BITS: u64 = 8; +const CDC_WRITE_SOURCE_MAX: u64 = (1 << CDC_WRITE_SOURCE_BITS) - 1; + +// TiCDC uses 1-255 to indicate the change from a lossy DDL reorg Backfill job. +// For now, we only use 1 for column reorg backfill job. +#[cfg(test)] +const LOSSY_DDL_REORG_SOURCE_BITS: u64 = 8; +#[cfg(test)] +const LOSSY_DDL_COLUMN_REORG_SOURCE: u64 = 1; +#[cfg(test)] +const LOSSY_DDL_REORG_SOURCE_MAX: u64 = (1 << LOSSY_DDL_REORG_SOURCE_BITS) - 1; +const LOSSY_DDL_REORG_SOURCE_SHIFT: u64 = CDC_WRITE_SOURCE_BITS; + +/// For kv.TxnSource +/// We use an uint64 to represent the source of a transaction. +/// The first 8 bits are reserved for TiCDC, and the next 8 bits are reserved +/// for Lossy DDL reorg Backfill job. The remaining 48 bits are reserved for +/// extendability. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] +pub(crate) struct TxnSource(u64); + +impl TxnSource { + #[cfg(test)] + pub(crate) fn set_cdc_write_source(&mut self, value: u64) { + if value > CDC_WRITE_SOURCE_MAX { + unreachable!("Only use it in tests") + } + self.0 |= value; + } + + #[cfg(test)] + pub(crate) fn get_cdc_write_source(&self) -> u64 { + self.0 & CDC_WRITE_SOURCE_MAX + } + + pub(crate) fn is_cdc_write_source_set(txn_source: u64) -> bool { + (txn_source & CDC_WRITE_SOURCE_MAX) != 0 + } + + #[cfg(test)] + pub(crate) fn set_lossy_ddl_reorg_source(&mut self, value: u64) { + if value > LOSSY_DDL_REORG_SOURCE_MAX { + unreachable!("Only use it in tests") + } + self.0 |= value << LOSSY_DDL_REORG_SOURCE_SHIFT; + } + + #[cfg(test)] + pub(crate) fn get_lossy_ddl_reorg_source(&self) -> u64 { + (self.0 >> LOSSY_DDL_REORG_SOURCE_SHIFT) & LOSSY_DDL_REORG_SOURCE_MAX + } + + pub(crate) fn is_lossy_ddl_reorg_source_set(txn_source: u64) -> bool { + (txn_source >> LOSSY_DDL_REORG_SOURCE_SHIFT) != 0 + } +} + +impl From for u64 { + fn from(val: TxnSource) -> Self { + val.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_cdc_write_source() { + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + assert_eq!(txn_source.get_cdc_write_source(), 1); + } + + #[test] + fn test_is_cdc_write_source_set() { + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + assert_eq!(TxnSource::is_cdc_write_source_set(txn_source.0), true); + + let txn_source = TxnSource::default(); + assert_eq!(TxnSource::is_cdc_write_source_set(txn_source.0), false); + } + + #[test] + fn test_get_lossy_ddl_reorg_source() { + let mut txn_source = TxnSource::default(); + txn_source.set_lossy_ddl_reorg_source(LOSSY_DDL_COLUMN_REORG_SOURCE); + assert_eq!( + txn_source.get_lossy_ddl_reorg_source(), + LOSSY_DDL_COLUMN_REORG_SOURCE + ); + } + + #[test] + fn test_is_lossy_ddl_reorg_source_set() { + let mut txn_source = 
TxnSource::default(); + txn_source.set_lossy_ddl_reorg_source(LOSSY_DDL_COLUMN_REORG_SOURCE); + assert_eq!(TxnSource::is_lossy_ddl_reorg_source_set(txn_source.0), true); + + let txn_source = TxnSource::default(); + assert_eq!( + TxnSource::is_lossy_ddl_reorg_source_set(txn_source.0), + false + ); + } +} diff --git a/components/cdc/tests/failpoints/test_endpoint.rs b/components/cdc/tests/failpoints/test_endpoint.rs index a38c3988bcc..3fdd6048971 100644 --- a/components/cdc/tests/failpoints/test_endpoint.rs +++ b/components/cdc/tests/failpoints/test_endpoint.rs @@ -1,15 +1,21 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use std::{sync::mpsc, thread, time::Duration}; +use std::{ + sync::{mpsc, Arc}, + thread, + time::Duration, +}; use api_version::{test_kv_format_impl, KvFormat}; -use cdc::{recv_timeout, OldValueCache, Task, Validate}; +use causal_ts::CausalTsProvider; +use cdc::{recv_timeout, Delegate, OldValueCache, Task, Validate}; use futures::{executor::block_on, sink::SinkExt}; -use grpcio::WriteFlags; -use kvproto::{cdcpb::*, kvrpcpb::*}; +use grpcio::{ChannelBuilder, Environment, WriteFlags}; +use kvproto::{cdcpb::*, kvrpcpb::*, tikvpb_grpc::TikvClient}; use pd_client::PdClient; use test_raftstore::*; -use tikv_util::{debug, worker::Scheduler}; +use tikv_util::{debug, worker::Scheduler, HandyRwLock}; +use txn_types::TimeStamp; use crate::{new_event_feed, ClientReceiver, TestSuite, TestSuiteBuilder}; @@ -52,6 +58,12 @@ fn test_cdc_double_scan_deregister_impl() { new_event_feed(suite.get_region_cdc_client(1)); block_on(req_tx_1.send((req, WriteFlags::default()))).unwrap(); + // wait for the second connection register to the delegate. 
+ suite.must_wait_delegate_condition( + 1, + Arc::new(|d: Option<&Delegate>| d.unwrap().downstreams().len() == 2), + ); + // close connection block_on(req_tx.close()).unwrap(); event_feed_wrap.replace(None); @@ -306,11 +318,12 @@ fn do_test_no_resolved_ts_before_downstream_initialized(version: &str) { } let th = thread::spawn(move || { - // The first downstream can receive timestamps but the second should receive nothing. + // The first downstream can receive timestamps but the second should receive + // nothing. let mut rx = event_feeds[0].replace(None).unwrap(); - assert!(recv_timeout(&mut rx, Duration::from_secs(1)).is_ok()); + recv_timeout(&mut rx, Duration::from_secs(1)).unwrap(); let mut rx = event_feeds[1].replace(None).unwrap(); - assert!(recv_timeout(&mut rx, Duration::from_secs(3)).is_err()); + recv_timeout(&mut rx, Duration::from_secs(3)).unwrap_err(); }); th.join().unwrap(); @@ -318,11 +331,11 @@ fn do_test_no_resolved_ts_before_downstream_initialized(version: &str) { suite.stop(); } -// When a new CDC downstream is installed, delta changes for other downstreams on the same -// region should be flushed so that the new downstream can gets a fresh snapshot to performs -// a incremental scan. CDC can ensure that those delta changes are sent to CDC's `Endpoint` -// before the incremental scan, but `Sink` may break this rule. This case tests it won't -// happen any more. +// When a new CDC downstream is installed, delta changes for other downstreams +// on the same region should be flushed so that the new downstream can gets a +// fresh snapshot to performs a incremental scan. CDC can ensure that those +// delta changes are sent to CDC's `Endpoint` before the incremental scan, but +// `Sink` may break this rule. This case tests it won't happen any more. 
#[test] fn test_cdc_observed_before_incremental_scan_snapshot() { let cluster = new_server_cluster(0, 1); @@ -331,7 +344,8 @@ fn test_cdc_observed_before_incremental_scan_snapshot() { let region = suite.cluster.get_region(b""); let lead_client = PeerClient::new(&suite.cluster, region.id, new_peer(1, 1)); - // So that the second changefeed can get some delta changes elder than its snapshot. + // So that the second changefeed can get some delta changes elder than its + // snapshot. let (mut req_tx_0, event_feed_0, _) = new_event_feed(suite.get_region_cdc_client(region.id)); let req_0 = suite.new_changedata_request(region.id); block_on(req_tx_0.send((req_0, WriteFlags::default()))).unwrap(); @@ -435,3 +449,79 @@ fn test_old_value_cache_without_downstreams() { fail::remove("cdc_flush_old_value_metrics"); } + +#[test] +fn test_cdc_rawkv_resolved_ts() { + let mut suite = TestSuite::new(1, ApiVersion::V2); + let cluster = &suite.cluster; + + let region = cluster.get_region(b""); + let region_id = region.get_id(); + let leader = region.get_peers()[0].clone(); + let node_id = leader.get_id(); + let ts_provider = cluster.sim.rl().get_causal_ts_provider(node_id).unwrap(); + + let env = Arc::new(Environment::new(1)); + let channel = + ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader.get_store_id())); + let client = TikvClient::new(channel); + + let mut req = suite.new_changedata_request(region_id); + req.set_kv_api(ChangeDataRequestKvApi::RawKv); + let (mut req_tx, _event_feed_wrap, receive_event) = + new_event_feed(suite.get_region_cdc_client(region_id)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + + let event = receive_event(false); + event + .events + .into_iter() + .for_each(|e| match e.event.unwrap() { + Event_oneof_event::Entries(es) => { + assert!(es.entries.len() == 1, "{:?}", es); + let e = &es.entries[0]; + assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es); + } + other => panic!("unknown event {:?}", other), + }); 
+ // Sleep a while to make sure the stream is registered. + sleep_ms(1000); + + let mut ctx = Context::default(); + ctx.set_region_id(region.get_id()); + ctx.set_region_epoch(region.get_region_epoch().clone()); + ctx.set_peer(leader); + ctx.set_api_version(ApiVersion::V2); + let mut put_req = RawPutRequest::default(); + put_req.set_context(ctx); + put_req.key = b"rk3".to_vec(); + put_req.value = b"v3".to_vec(); + + let pause_write_fp = "raftkv_async_write"; + fail::cfg(pause_write_fp, "pause").unwrap(); + let ts = block_on(ts_provider.async_get_ts()).unwrap(); + let handle = thread::spawn(move || { + let _ = client.raw_put(&put_req).unwrap(); + }); + + sleep_ms(100); + + let event = receive_event(true).resolved_ts.unwrap(); + assert!( + ts.next() >= TimeStamp::from(event.ts), + "{} {}", + ts, + TimeStamp::from(event.ts) + ); + // Receive again to make sure resolved ts <= ongoing request's ts. + let event = receive_event(true).resolved_ts.unwrap(); + assert!( + ts.next() >= TimeStamp::from(event.ts), + "{} {}", + ts, + TimeStamp::from(event.ts) + ); + + fail::remove(pause_write_fp); + handle.join().unwrap(); +} diff --git a/components/cdc/tests/failpoints/test_observe.rs b/components/cdc/tests/failpoints/test_observe.rs index 8c418558dcc..480fcc4582f 100644 --- a/components/cdc/tests/failpoints/test_observe.rs +++ b/components/cdc/tests/failpoints/test_observe.rs @@ -130,7 +130,7 @@ fn test_observe_duplicate_cmd_impl() { #[allow(dead_code)] fn test_delayed_change_cmd() { let mut cluster = new_server_cluster(1, 3); - configure_for_lease_read(&mut cluster, Some(50), Some(20)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(20)); cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(100); cluster.pd_client.disable_default_operator(); let mut suite = TestSuiteBuilder::new().cluster(cluster).build(); diff --git a/components/cdc/tests/failpoints/test_resolve.rs b/components/cdc/tests/failpoints/test_resolve.rs index 
75326ac0fb5..560eb68ba44 100644 --- a/components/cdc/tests/failpoints/test_resolve.rs +++ b/components/cdc/tests/failpoints/test_resolve.rs @@ -260,7 +260,7 @@ fn test_joint_confchange() { receive_resolved_ts(&receive_event); tx.send(()).unwrap(); }); - assert!(rx.recv_timeout(Duration::from_secs(2)).is_err()); + rx.recv_timeout(Duration::from_secs(2)).unwrap_err(); fail::remove(update_region_fp); fail::remove(deregister_fp); diff --git a/components/cdc/tests/integrations/mod.rs b/components/cdc/tests/integrations/mod.rs index 821e4ad186e..c60a1fe8cb9 100644 --- a/components/cdc/tests/integrations/mod.rs +++ b/components/cdc/tests/integrations/mod.rs @@ -1,5 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. +#![feature(assert_matches)] + mod test_cdc; mod test_flow_control; diff --git a/components/cdc/tests/integrations/test_cdc.rs b/components/cdc/tests/integrations/test_cdc.rs index 06b16de1f20..3e5345e51f8 100644 --- a/components/cdc/tests/integrations/test_cdc.rs +++ b/components/cdc/tests/integrations/test_cdc.rs @@ -12,7 +12,7 @@ use pd_client::PdClient; use raft::eraftpb::MessageType; use test_raftstore::*; use tikv::server::DEFAULT_CLUSTER_ID; -use tikv_util::HandyRwLock; +use tikv_util::{config::ReadableDuration, HandyRwLock}; use txn_types::{Key, Lock, LockType}; use crate::{new_event_feed, TestSuite, TestSuiteBuilder}; @@ -613,16 +613,15 @@ fn test_cdc_scan_impl() { fn test_cdc_rawkv_scan() { let mut suite = TestSuite::new(3, ApiVersion::V2); - suite.set_tso(10); - suite.flush_causal_timestamp_for_region(1); let (k1, v1) = (b"rkey1".to_vec(), b"value1".to_vec()); suite.must_kv_put(1, k1, v1); let (k2, v2) = (b"rkey2".to_vec(), b"value2".to_vec()); suite.must_kv_put(1, k2, v2); - suite.set_tso(1000); + let ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); suite.flush_causal_timestamp_for_region(1); + let (k3, v3) = (b"rkey3".to_vec(), b"value3".to_vec()); suite.must_kv_put(1, k3.clone(), v3.clone()); @@ -631,7 +630,7 @@ 
fn test_cdc_rawkv_scan() { let mut req = suite.new_changedata_request(1); req.set_kv_api(ChangeDataRequestKvApi::RawKv); - req.set_checkpoint_ts(999); + req.set_checkpoint_ts(ts.into_inner()); let (mut req_tx, event_feed_wrap, receive_event) = new_event_feed(suite.get_region_cdc_client(1)); block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); @@ -935,7 +934,7 @@ fn test_cdc_batch_size_limit_impl() { assert_eq!(events.len(), 1, "{:?}", events); match events.pop().unwrap().event.unwrap() { Event_oneof_event::Entries(es) => { - assert!(es.entries.len() == 2); + assert_eq!(es.entries.len(), 2); let e = &es.entries[0]; assert_eq!(e.get_type(), EventLogType::Prewrite, "{:?}", e.get_type()); assert_eq!(e.key, b"xk3", "{:?}", e.key); @@ -1177,7 +1176,8 @@ fn test_old_value_multi_changefeeds_impl() { } } - // The downstream 2 can also get old values because `req`.`extra_op` field is ignored now. + // The downstream 2 can also get old values because `req`.`extra_op` field is + // ignored now. event_count = 0; loop { let events = receive_event_2(false).events.to_vec(); @@ -1285,9 +1285,9 @@ fn test_cdc_resolve_ts_checking_concurrency_manager_impl() { } let _guard = lock_key(b"xa", 90); - // The resolved_ts should be blocked by the mem lock but it's already greater than 90. - // Retry until receiving an unchanged resolved_ts because the first several resolved ts received - // might be updated before acquiring the lock. + // The resolved_ts should be blocked by the mem lock but it's already greater + // than 90. Retry until receiving an unchanged resolved_ts because the first + // several resolved ts received might be updated before acquiring the lock. 
let mut last_resolved_ts = 0; let mut success = false; for _ in 0..5 { @@ -1840,9 +1840,10 @@ fn test_cdc_scan_ignore_gc_fence_impl() { let commit_ts2 = block_on(suite.cluster.pd_client.get_tso()).unwrap(); suite.must_kv_commit(1, vec![key.to_vec()], start_ts2, commit_ts2); - // Assume the first version above is written by async commit and it's commit_ts is not unique. - // Use it's commit_ts as another transaction's start_ts. - // Run check_txn_status on commit_ts1 so that gc_fence will be set on the first version. + // Assume the first version above is written by async commit and it's commit_ts + // is not unique. Use it's commit_ts as another transaction's start_ts. + // Run check_txn_status on commit_ts1 so that gc_fence will be set on the first + // version. let caller_start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); let action = suite.must_check_txn_status( 1, @@ -1940,9 +1941,10 @@ fn test_cdc_extract_rollback_if_gc_fence_set_impl() { let commit_ts2 = block_on(suite.cluster.pd_client.get_tso()).unwrap(); suite.must_kv_commit(1, vec![key.to_vec()], start_ts2, commit_ts2); - // We don't care about the events caused by the previous writings in this test case, and it's - // too complicated to check them. Just skip them here, and wait for resolved_ts to be pushed to - // a greater value than the two versions' commit_ts-es. + // We don't care about the events caused by the previous writings in this test + // case, and it's too complicated to check them. Just skip them here, and + // wait for resolved_ts to be pushed to a greater value than the two + // versions' commit_ts-es. let skip_to_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); loop { let e = receive_event(true); @@ -1953,9 +1955,10 @@ fn test_cdc_extract_rollback_if_gc_fence_set_impl() { } } - // Assume the two versions of the key are written by async commit transactions, and their - // commit_ts-es are also other transaction's start_ts-es. 
Run check_txn_status on the - // commit_ts-es of the two versions to cause overlapping rollback. + // Assume the two versions of the key are written by async commit transactions, + // and their commit_ts-es are also other transaction's start_ts-es. Run + // check_txn_status on the commit_ts-es of the two versions to cause + // overlapping rollback. let caller_start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); suite.must_check_txn_status( 1, @@ -2007,9 +2010,9 @@ fn test_cdc_extract_rollback_if_gc_fence_set_impl() { other => panic!("unknown event {:?}", other), }); - // In some special cases, a newly committed record may carry an overlapped rollback initially. - // In this case, gc_fence shouldn't be set, and CDC ignores the rollback and handles the - // committing normally. + // In some special cases, a newly committed record may carry an overlapped + // rollback initially. In this case, gc_fence shouldn't be set, and CDC + // ignores the rollback and handles the committing normally. let start_ts3 = block_on(suite.cluster.pd_client.get_tso()).unwrap(); let mut mutation = Mutation::default(); mutation.set_op(Op::Put); @@ -2031,11 +2034,11 @@ fn test_cdc_extract_rollback_if_gc_fence_set_impl() { other => panic!("unknown event {:?}", other), }); - // Again, assume the transaction is committed with async commit protocol, and the commit_ts is - // also another transaction's start_ts. + // Again, assume the transaction is committed with async commit protocol, and + // the commit_ts is also another transaction's start_ts. let commit_ts3 = block_on(suite.cluster.pd_client.get_tso()).unwrap(); - // Rollback another transaction before committing, then the rolling back information will be - // recorded in the lock. + // Rollback another transaction before committing, then the rolling back + // information will be recorded in the lock. 
let caller_start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); suite.must_check_txn_status( 1, @@ -2082,10 +2085,11 @@ fn test_cdc_extract_rollback_if_gc_fence_set_impl() { suite.stop(); } -// This test is created for covering the case that term was increased without leader change. -// Ideally leader id and term in StoreMeta should be updated together with a yielded SoftState, -// but sometimes the leader was transferred to another store and then changed back, -// a follower would not get a new SoftState. +// This test is created for covering the case that term was increased without +// leader change. Ideally leader id and term in StoreMeta should be updated +// together with a yielded SoftState, but sometimes the leader was transferred +// to another store and then changed back, a follower would not get a new +// SoftState. #[test] fn test_term_change() { let cluster = new_server_cluster(0, 3); @@ -2318,3 +2322,411 @@ fn test_resolved_ts_with_learners() { } panic!("resolved timestamp should be advanced correctly"); } + +#[test] +fn test_prewrite_without_value() { + let cluster = new_server_cluster(0, 2); + cluster.pd_client.disable_default_operator(); + let mut suite = TestSuiteBuilder::new().cluster(cluster).build(); + let rid = suite.cluster.get_region(&[]).id; + let ctx = suite.get_context(rid); + let client = suite.get_tikv_client(rid).clone(); + let large_value = vec![b'x'; 2 * txn_types::SHORT_VALUE_MAX_LEN]; + + // Perform a pessimistic prewrite with a large value. + let mut muts = vec![Mutation::default()]; + muts[0].set_op(Op::Put); + muts[0].key = b"key".to_vec(); + muts[0].value = large_value.clone(); + try_kv_prewrite_pessimistic(&client, ctx.clone(), muts, b"key".to_vec(), 10); + + let req = suite.new_changedata_request(rid); + let (mut req_tx, _, receive_event) = new_event_feed(suite.get_region_cdc_client(rid)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + + // The prewrite can be retrieved from incremental scan. 
+ let event = receive_event(false); + assert_eq!( + event.get_events()[0].get_entries().entries[0].value, + large_value + ); + + // check_txn_status will put the lock again, but without value. + must_check_txn_status(&client, ctx.clone(), b"key", 10, 12, 12); + must_kv_commit(&client, ctx, vec![b"key".to_vec()], 10, 14, 14); + // The lock without value shouldn't be retrieved. + let event = receive_event(false); + assert_eq!(event.get_events()[0].get_entries().entries[0].commit_ts, 14); +} + +#[test] +fn test_filter_loop() { + test_kv_format_impl!(test_filter_loop_impl); +} + +fn test_filter_loop_impl() { + let mut suite = TestSuite::new(1, F::TAG); + let mut req = suite.new_changedata_request(1); + req.set_extra_op(ExtraOp::ReadOldValue); + req.set_filter_loop(true); + let (mut req_tx, event_feed_wrap, receive_event) = + new_event_feed(suite.get_region_cdc_client(1)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + let mut events = receive_event(false).events.to_vec(); + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let row = &es.take_entries().to_vec()[0]; + assert_eq!(row.get_type(), EventLogType::Initialized); + } + other => panic!("unknown event {:?}", other), + } + + // Insert value, simulate INSERT INTO. 
+ let mut m1 = Mutation::default(); + let k1 = b"xk1".to_vec(); + m1.set_op(Op::Insert); + m1.key = k1.clone(); + m1.value = b"v1".to_vec(); + suite.must_kv_prewrite_with_source(1, vec![m1], k1.clone(), 10.into(), 1); + let mut m2 = Mutation::default(); + let k2 = b"xk2".to_vec(); + m2.set_op(Op::Insert); + m2.key = k2.clone(); + m2.value = b"v2".to_vec(); + suite.must_kv_prewrite_with_source(1, vec![m2], k2.clone(), 12.into(), 0); + let mut events = receive_event(false).events.to_vec(); + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let events = es.take_entries().to_vec(); + assert_eq!(events.len(), 1); + let row = &events[0]; + assert_eq!(row.get_value(), b"v2"); + assert_eq!(row.get_old_value(), b""); + assert_eq!(row.get_type(), EventLogType::Prewrite); + assert_eq!(row.get_start_ts(), 12); + } + other => panic!("unknown event {:?}", other), + } + suite.must_kv_commit_with_source(1, vec![k1], 10.into(), 15.into(), 1); + suite.must_kv_commit_with_source(1, vec![k2], 12.into(), 17.into(), 0); + let mut events = receive_event(false).events.to_vec(); + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let events = es.take_entries().to_vec(); + assert_eq!(events.len(), 1); + let row = &events[0]; + assert_eq!(row.get_type(), EventLogType::Commit); + assert_eq!(row.get_commit_ts(), 17); + } + other => panic!("unknown event {:?}", other), + } + + // Rollback + let mut m3 = Mutation::default(); + let k3 = b"xk3".to_vec(); + m3.set_op(Op::Put); + m3.key = k3.clone(); + m3.value = b"v3".to_vec(); + suite.must_kv_prewrite_with_source(1, vec![m3], k3.clone(), 30.into(), 1); + suite.must_kv_rollback(1, vec![k3], 30.into()); + let mut events = receive_event(false).events.to_vec(); + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let events = es.take_entries().to_vec(); + assert_eq!(events.len(), 1); + let row = &events[0]; + assert_eq!(row.get_type(), 
EventLogType::Rollback); + assert_eq!(row.get_commit_ts(), 0); + } + other => panic!("unknown event {:?}", other), + } + + // Update value + let k1 = b"xk1".to_vec(); + let mut m4 = Mutation::default(); + m4.set_op(Op::Put); + m4.key = k1.clone(); + m4.value = vec![b'3'; 5120]; + suite.must_kv_prewrite_with_source(1, vec![m4], k1.clone(), 40.into(), 1); + suite.must_kv_commit_with_source(1, vec![k1], 40.into(), 42.into(), 1); + let k2 = b"xk2".to_vec(); + let mut m5 = Mutation::default(); + m5.set_op(Op::Put); + m5.key = k2.clone(); + m5.value = vec![b'4'; 5121]; + suite.must_kv_prewrite(1, vec![m5], k2.clone(), 44.into()); + suite.must_kv_commit(1, vec![k2.clone()], 44.into(), 46.into()); + let mut events = receive_event(false).events.to_vec(); + if events.len() == 1 { + events.extend(receive_event(false).events.into_iter()); + } + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let events = es.take_entries().to_vec(); + assert_eq!(events.len(), 1); + assert_eq!(events[0].get_type(), EventLogType::Prewrite); + assert_eq!(events[0].get_start_ts(), 44); + assert_eq!(events[0].get_key(), k2.as_slice()); + } + other => panic!("unknown event {:?}", other), + } + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let events = es.take_entries().to_vec(); + assert_eq!(events.len(), 1); + assert_eq!(events[0].get_type(), EventLogType::Commit); + assert_eq!(events[0].get_commit_ts(), 46); + assert_eq!(events[0].get_key(), k2.as_slice()); + } + other => panic!("unknown event {:?}", other), + } + + event_feed_wrap.replace(None); + suite.stop(); +} + +#[test] +fn test_flashback() { + let mut cluster = new_server_cluster(0, 1); + cluster.cfg.resolved_ts.advance_ts_interval = ReadableDuration::millis(50); + let mut suite = TestSuiteBuilder::new().cluster(cluster).build(); + + let key = Key::from_raw(b"a"); + let region = suite.cluster.get_region(key.as_encoded()); + let region_id = region.get_id(); + let req = 
suite.new_changedata_request(region_id); + let (mut req_tx, _, receive_event) = new_event_feed(suite.get_region_cdc_client(region_id)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + let event = receive_event(false); + event.events.into_iter().for_each(|e| { + match e.event.unwrap() { + // Even if there is no write, + // it should always outputs an Initialized event. + Event_oneof_event::Entries(es) => { + assert!(es.entries.len() == 1, "{:?}", es); + let e = &es.entries[0]; + assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es); + } + other => panic!("unknown event {:?}", other), + } + }); + // Sleep a while to make sure the stream is registered. + sleep_ms(1000); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + for i in 0..2 { + let (k, v) = ( + format!("key{}", i).as_bytes().to_vec(), + format!("value{}", i).as_bytes().to_vec(), + ); + // Prewrite + let start_ts1 = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_prewrite(1, vec![mutation], k.clone(), start_ts1); + // Commit + let commit_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + suite.must_kv_commit(1, vec![k.clone()], start_ts1, commit_ts); + } + let (start_key, end_key) = (b"key0".to_vec(), b"key2".to_vec()); + // Prepare flashback. + let flashback_start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + suite.must_kv_prepare_flashback(region_id, &start_key, &end_key, flashback_start_ts); + // resolved ts should not be advanced anymore. + let mut counter = 0; + let mut last_resolved_ts = 0; + loop { + let event = receive_event(true); + if let Some(resolved_ts) = event.resolved_ts.as_ref() { + if resolved_ts.ts == last_resolved_ts { + counter += 1; + } + last_resolved_ts = resolved_ts.ts; + } + if counter > 20 { + break; + } + sleep_ms(50); + } + // Flashback. 
+ let flashback_commit_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + suite.must_kv_flashback( + region_id, + &start_key, + &end_key, + flashback_start_ts, + flashback_commit_ts, + start_ts, + ); + // Check the flashback event. + let mut resolved_ts = 0; + let mut event_counter = 0; + loop { + let mut cde = receive_event(true); + if cde.get_resolved_ts().get_ts() > resolved_ts { + resolved_ts = cde.get_resolved_ts().get_ts(); + } + let events = cde.mut_events(); + if !events.is_empty() { + assert_eq!(events.len(), 1); + match events.pop().unwrap().event.unwrap() { + Event_oneof_event::Entries(entries) => { + assert_eq!(entries.entries.len(), 1); + event_counter += 1; + let e = &entries.entries[0]; + assert!(e.commit_ts > resolved_ts); + assert_eq!(e.get_op_type(), EventRowOpType::Delete); + match e.get_type() { + EventLogType::Committed => { + // First entry should be a 1PC flashback. + assert_eq!(e.get_key(), b"key1"); + assert_eq!(event_counter, 1); + } + EventLogType::Commit => { + // Second entry should be a 2PC commit. + assert_eq!(e.get_key(), b"key0"); + assert_eq!(event_counter, 2); + break; + } + _ => panic!("unknown event type {:?}", e.get_type()), + } + } + other => panic!("unknown event {:?}", other), + } + } + } +} + +#[test] +fn test_cdc_filter_key_range() { + let mut suite = TestSuite::new(1, ApiVersion::V1); + + let req = suite.new_changedata_request(1); + + // Observe range [key1, key3). 
+ let mut req_1_3 = req.clone(); + req_1_3.request_id = 13; + req_1_3.start_key = Key::from_raw(b"key1").into_encoded(); + req_1_3.end_key = Key::from_raw(b"key3").into_encoded(); + let (mut req_tx13, _event_feed_wrap13, receive_event13) = + new_event_feed(suite.get_region_cdc_client(1)); + block_on(req_tx13.send((req_1_3, WriteFlags::default()))).unwrap(); + let event = receive_event13(false); + event + .events + .into_iter() + .for_each(|e| match e.event.unwrap() { + Event_oneof_event::Entries(es) => { + assert!(es.entries.len() == 1, "{:?}", es); + let e = &es.entries[0]; + assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es); + } + other => panic!("unknown event {:?}", other), + }); + + let (mut req_tx24, _event_feed_wrap24, receive_event24) = + new_event_feed(suite.get_region_cdc_client(1)); + let mut req_2_4 = req; + req_2_4.request_id = 24; + req_2_4.start_key = Key::from_raw(b"key2").into_encoded(); + req_2_4.end_key = Key::from_raw(b"key4").into_encoded(); + block_on(req_tx24.send((req_2_4, WriteFlags::default()))).unwrap(); + let event = receive_event24(false); + event + .events + .into_iter() + .for_each(|e| match e.event.unwrap() { + Event_oneof_event::Entries(es) => { + assert!(es.entries.len() == 1, "{:?}", es); + let e = &es.entries[0]; + assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es); + } + other => panic!("unknown event {:?}", other), + }); + + // Sleep a while to make sure the stream is registered. 
+ sleep_ms(1000); + + let receive_and_check_events = |is13: bool, is24: bool| -> Vec<Event> { + if is13 && is24 { + let mut events = receive_event13(false).events.to_vec(); + let mut events24 = receive_event24(false).events.to_vec(); + events.append(&mut events24); + events + } else if is13 { + let events = receive_event13(false).events.to_vec(); + let event = receive_event24(true); + assert!(event.resolved_ts.is_some(), "{:?}", event); + events + } else if is24 { + let events = receive_event24(false).events.to_vec(); + let event = receive_event13(true); + assert!(event.resolved_ts.is_some(), "{:?}", event); + events + } else { + let event = receive_event13(true); + assert!(event.resolved_ts.is_some(), "{:?}", event); + let event = receive_event24(true); + assert!(event.resolved_ts.is_some(), "{:?}", event); + vec![] + } + }; + for case in &[ + ("key1", true, false, true /* commit */), + ("key1", true, false, false /* rollback */), + ("key2", true, true, true), + ("key3", false, true, true), + ("key4", false, false, true), + ] { + let (k, v) = (case.0.to_owned(), "value".to_owned()); + // Prewrite + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone().into_bytes(); + mutation.value = v.into_bytes(); + suite.must_kv_prewrite(1, vec![mutation], k.clone().into_bytes(), start_ts); + let mut events = receive_and_check_events(case.1, case.2); + while let Some(event) = events.pop() { + match event.event.unwrap() { + Event_oneof_event::Entries(entries) => { + assert_eq!(entries.entries.len(), 1); + assert_eq!(entries.entries[0].get_type(), EventLogType::Prewrite); + } + other => panic!("unknown event {:?}", other), + } + } + + if case.3 { + // Commit + let commit_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + suite.must_kv_commit(1, vec![k.into_bytes()], start_ts, commit_ts); + let mut events = receive_and_check_events(case.1, case.2); + while let
Some(event) = events.pop() { + match event.event.unwrap() { + Event_oneof_event::Entries(entries) => { + assert_eq!(entries.entries.len(), 1); + assert_eq!(entries.entries[0].get_type(), EventLogType::Commit); + } + other => panic!("unknown event {:?}", other), + } + } + } else { + // Rollback + suite.must_kv_rollback(1, vec![k.into_bytes()], start_ts); + let mut events = receive_and_check_events(case.1, case.2); + while let Some(event) = events.pop() { + match event.event.unwrap() { + Event_oneof_event::Entries(entries) => { + assert_eq!(entries.entries.len(), 1); + assert_eq!(entries.entries[0].get_type(), EventLogType::Rollback); + } + other => panic!("unknown event {:?}", other), + } + } + } + } + + suite.stop(); +} diff --git a/components/cdc/tests/integrations/test_flow_control.rs b/components/cdc/tests/integrations/test_flow_control.rs index 56cb43e06c4..fdfd136d9c7 100644 --- a/components/cdc/tests/integrations/test_flow_control.rs +++ b/components/cdc/tests/integrations/test_flow_control.rs @@ -15,7 +15,7 @@ use crate::{new_event_feed, TestSuiteBuilder}; fn test_cdc_congest() { let mut cluster = new_server_cluster(1, 1); // Increase the Raft tick interval to make this test case running reliably. - configure_for_lease_read(&mut cluster, Some(100), None); + configure_for_lease_read(&mut cluster.cfg, Some(100), None); let memory_quota = 1024; // 1KB let mut suite = TestSuiteBuilder::new() .cluster(cluster) diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index 6443ffea158..f2663c79287 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -1,23 +1,28 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{sync::*, time::Duration}; +use std::{ + sync::*, + time::{Duration, Instant}, +}; -use cdc::{recv_timeout, CdcObserver, FeatureGate, MemoryQuota, Task}; +use causal_ts::CausalTsProvider; +use cdc::{recv_timeout, CdcObserver, Delegate, FeatureGate, MemoryQuota, Task, Validate}; use collections::HashMap; use concurrency_manager::ConcurrencyManager; use engine_rocks::RocksEngine; +use futures::executor::block_on; use grpcio::{ ChannelBuilder, ClientDuplexReceiver, ClientDuplexSender, ClientUnaryReceiver, Environment, }; use kvproto::{ cdcpb::{create_change_data, ChangeDataClient, ChangeDataEvent, ChangeDataRequest}, - kvrpcpb::*, + kvrpcpb::{PrewriteRequestPessimisticAction::*, *}, tikvpb::TikvClient, }; use online_config::OnlineConfig; -use raftstore::coprocessor::CoprocessorHost; +use raftstore::{coprocessor::CoprocessorHost, router::CdcRaftRouter}; use test_raftstore::*; -use tikv::{config::CdcConfig, server::DEFAULT_CLUSTER_ID}; +use tikv::{config::CdcConfig, server::DEFAULT_CLUSTER_ID, storage::kv::LocalTablets}; use tikv_util::{ config::ReadableDuration, worker::{LazyWorker, Runnable}, @@ -177,17 +182,19 @@ impl TestSuiteBuilder { let mut cdc_endpoint = cdc::Endpoint::new( DEFAULT_CLUSTER_ID, &cfg, + false, cluster.cfg.storage.api_version(), pd_cli.clone(), worker.scheduler(), - raft_router, - cluster.engines[id].kv.clone(), + CdcRaftRouter(raft_router), + LocalTablets::Singleton(cluster.engines[id].kv.clone()), cdc_ob, cluster.store_metas[id].clone(), cm.clone(), env, sim.security_mgr.clone(), MemoryQuota::new(usize::MAX), + sim.get_causal_ts_provider(*id), ); let mut updated_cfg = cfg.clone(); updated_cfg.min_ts_interval = ReadableDuration::millis(100); @@ -230,7 +237,7 @@ impl TestSuite { pub fn new(count: usize, api_version: ApiVersion) -> TestSuite { let mut cluster = new_server_cluster_with_api_ver(1, count, api_version); // Increase the Raft tick interval to make this test case running reliably. 
- configure_for_lease_read(&mut cluster, Some(100), None); + configure_for_lease_read(&mut cluster.cfg, Some(100), None); // Disable background renew to make timestamp predictable. configure_for_causal_ts(&mut cluster, "0s", 1); @@ -263,9 +270,22 @@ impl TestSuite { muts: Vec<Mutation>, pk: Vec<u8>, ts: TimeStamp, + ) { + self.must_kv_prewrite_with_source(region_id, muts, pk, ts, 0); + } + + pub fn must_kv_prewrite_with_source( + &mut self, + region_id: u64, + muts: Vec<Mutation>, + pk: Vec<u8>, + ts: TimeStamp, + txn_source: u64, ) { let mut prewrite_req = PrewriteRequest::default(); - prewrite_req.set_context(self.get_context(region_id)); + let mut context = self.get_context(region_id); + context.set_txn_source(txn_source); + prewrite_req.set_context(context); prewrite_req.set_mutations(muts.into_iter().collect()); prewrite_req.primary_lock = pk; prewrite_req.start_version = ts.into_inner(); @@ -308,9 +328,22 @@ impl TestSuite { keys: Vec<Vec<u8>>, start_ts: TimeStamp, commit_ts: TimeStamp, + ) { + self.must_kv_commit_with_source(region_id, keys, start_ts, commit_ts, 0); + } + + pub fn must_kv_commit_with_source( + &mut self, + region_id: u64, + keys: Vec<Vec<u8>>, + start_ts: TimeStamp, + commit_ts: TimeStamp, + txn_source: u64, ) { let mut commit_req = CommitRequest::default(); - commit_req.set_context(self.get_context(region_id)); + let mut context = self.get_context(region_id); + context.set_txn_source(txn_source); + commit_req.set_context(context); commit_req.start_version = start_ts.into_inner(); commit_req.set_keys(keys.into_iter().collect()); commit_req.commit_version = commit_ts.into_inner(); @@ -417,7 +450,9 @@ impl TestSuite { prewrite_req.start_version = ts.into_inner(); prewrite_req.lock_ttl = prewrite_req.start_version + 1; prewrite_req.for_update_ts = for_update_ts.into_inner(); - prewrite_req.mut_is_pessimistic_lock().push(true); + prewrite_req + .mut_pessimistic_actions() + .push(DoPessimisticCheck); let prewrite_resp = self .get_tikv_client(region_id) .kv_prewrite(&prewrite_req) @@
-508,12 +543,93 @@ impl TestSuite { pub fn flush_causal_timestamp_for_region(&mut self, region_id: u64) { let leader = self.cluster.leader_of_region(region_id).unwrap(); - self.cluster - .sim - .rl() - .get_causal_ts_provider(leader.get_store_id()) - .unwrap() - .flush() + block_on( + self.cluster + .sim + .rl() + .get_causal_ts_provider(leader.get_store_id()) + .unwrap() + .async_flush(), + ) + .unwrap(); + } + + pub fn must_wait_delegate_condition( + &self, + region_id: u64, + cond: Arc<dyn Fn(Option<&Delegate>) -> bool + Sync + Send>, + ) { + let scheduler = self.endpoints[&region_id].scheduler(); + let start = Instant::now(); + loop { + sleep_ms(100); + let (tx, rx) = mpsc::sync_channel(1); + let c = cond.clone(); + let checker = move |d: Option<&Delegate>| { + tx.send(c(d)).unwrap(); + }; + scheduler + .schedule(Task::Validate(Validate::Region( + region_id, + Box::new(checker), + ))) + .unwrap(); + if rx.recv().unwrap() { + return; + } + if start.elapsed() > Duration::from_secs(5) { + panic!("wait delegate timeout"); + } + } + } + + pub fn must_kv_prepare_flashback( + &mut self, + region_id: u64, + start_key: &[u8], + end_key: &[u8], + start_ts: TimeStamp, + ) { + let mut prepare_flashback_req = PrepareFlashbackToVersionRequest::default(); + prepare_flashback_req.set_context(self.get_context(region_id)); + prepare_flashback_req.set_start_key(start_key.to_vec()); + prepare_flashback_req.set_end_key(end_key.to_vec()); + prepare_flashback_req.set_start_ts(start_ts.into_inner()); + let prepare_flashback_resp = self + .get_tikv_client(region_id) + .kv_prepare_flashback_to_version(&prepare_flashback_req) + .unwrap(); + assert!( + !prepare_flashback_resp.has_region_error(), + "{:?}", + prepare_flashback_resp.get_region_error() + ); + } + + pub fn must_kv_flashback( + &mut self, + region_id: u64, + start_key: &[u8], + end_key: &[u8], + start_ts: TimeStamp, + commit_ts: TimeStamp, + version: TimeStamp, + ) { + let mut flashback_req = FlashbackToVersionRequest::default(); +
flashback_req.set_context(self.get_context(region_id)); + flashback_req.set_start_key(start_key.to_vec()); + flashback_req.set_end_key(end_key.to_vec()); + flashback_req.set_start_ts(start_ts.into_inner()); + flashback_req.set_commit_ts(commit_ts.into_inner()); + flashback_req.set_version(version.into_inner()); + let flashback_resp = self + .get_tikv_client(region_id) + .kv_flashback_to_version(&flashback_req) .unwrap(); + assert!( + !flashback_resp.has_region_error(), + "{:?}", + flashback_resp.get_region_error() + ); } } diff --git a/components/cloud/Cargo.toml b/components/cloud/Cargo.toml index 5752f84e43c..10f8b113b2b 100644 --- a/components/cloud/Cargo.toml +++ b/components/cloud/Cargo.toml @@ -7,16 +7,16 @@ publish = false [dependencies] async-trait = "0.1" derive_more = "0.99.3" -error_code = { path = "../error_code", default-features = false } +error_code = { workspace = true } futures-io = "0.3" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" openssl = "0.10" prometheus = { version = "0.13", default-features = false, features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } rusoto_core = "0.46.0" thiserror = "1.0" -tikv_util = { path = "../tikv_util", default-features = false } +tikv_util = { workspace = true } url = "2.0" [dev-dependencies] diff --git a/components/cloud/aws/Cargo.toml b/components/cloud/aws/Cargo.toml index 299192e9ca3..24518515ea0 100644 --- a/components/cloud/aws/Cargo.toml +++ b/components/cloud/aws/Cargo.toml @@ -9,33 +9,36 @@ failpoints = ["fail/failpoints"] [dependencies] async-trait = "0.1" +base64 = "0.13.0" bytes = "1.0" -cloud = { path = "../", default-features = false } +cloud = { workspace = true } fail = "0.5" futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } # This is only a dependency to vendor openssl for rusoto. 
It's not clear exactly # how openssl is built for tikv, but it seems to be controlled by grpcio. This # makes `cargo test -p aws` link correctly. -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } http = "0.2.0" hyper = "0.14" hyper-tls = "0.5" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } +lazy_static = "1.3" +md5 = "0.7.0" +prometheus = { version = "0.13", default-features = false, features = ["nightly"] } rusoto_core = "0.46.0" rusoto_credential = "0.46.0" rusoto_kms = { version = "0.46.0", features = ["serialize_structs"] } -rusoto_sts = "0.46.0" rusoto_s3 = { version = "0.46.0", features = ["serialize_structs"] } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +rusoto_sts = "0.46.0" +slog = { workspace = true } +slog-global = { workspace = true } +thiserror = "1.0" +tikv_util = { workspace = true } # better to not use slog-global, but pass in the logger tokio = { version = "1.5", features = ["time"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../../tikv_util", default-features = false } url = "2.0" -thiserror = "1.0" -lazy_static = "1.3" -prometheus = { version = "0.13", default-features = false, features = ["nightly"] } +uuid = { version = "0.8", features = ["v4"] } [dev-dependencies] futures = "0.3" diff --git a/components/cloud/aws/src/kms.rs b/components/cloud/aws/src/kms.rs index 11ecf88ddd9..040db46bb53 100644 --- a/components/cloud/aws/src/kms.rs +++ b/components/cloud/aws/src/kms.rs @@ -82,11 +82,11 @@ impl KmsProvider for AwsKms { ENCRYPTION_VENDOR_NAME_AWS_KMS } - // On decrypt failure, the rule is to return WrongMasterKey error in case it is possible that - // a wrong master key has been used, or other error otherwise. 
+ // On decrypt failure, the rule is to return WrongMasterKey error in case it is + // possible that a wrong master key has been used, or other error otherwise. async fn decrypt_data_key(&self, data_key: &EncryptedKey) -> Result> { let decrypt_request = DecryptRequest { - ciphertext_blob: bytes::Bytes::copy_from_slice(&*data_key), + ciphertext_blob: bytes::Bytes::copy_from_slice(data_key), // Use default algorithm SYMMETRIC_DEFAULT. encryption_algorithm: None, // Use key_id encoded in ciphertext. @@ -125,8 +125,8 @@ impl KmsProvider for AwsKms { } } -// Rusoto errors Display implementation just gives the cause message and discards the type. -// This is really bad when the cause message is empty! +// Rusoto errors Display implementation just gives the cause message and +// discards the type. This is really bad when the cause message is empty! // Use Debug instead: this will show both pub struct FixRusotoErrorDisplay( RusotoError, diff --git a/components/cloud/aws/src/lib.rs b/components/cloud/aws/src/lib.rs index 345302d0534..b6af7d64b48 100644 --- a/components/cloud/aws/src/lib.rs +++ b/components/cloud/aws/src/lib.rs @@ -5,6 +5,6 @@ mod kms; pub use kms::{AwsKms, ENCRYPTION_VENDOR_NAME_AWS_KMS}; mod s3; -pub use s3::{Config, S3Storage, STORAGE_VENDOR_NAME_AWS}; +pub use s3::{Config, S3Storage, STORAGE_NAME, STORAGE_VENDOR_NAME_AWS}; mod util; diff --git a/components/cloud/aws/src/s3.rs b/components/cloud/aws/src/s3.rs index b5cacb2266e..a7ea47ec9d2 100644 --- a/components/cloud/aws/src/s3.rs +++ b/components/cloud/aws/src/s3.rs @@ -17,14 +17,10 @@ use rusoto_core::{request::DispatchSignedRequest, ByteStream, RusotoError}; use rusoto_credential::{ProvideAwsCredentials, StaticProvider}; use rusoto_s3::{util::AddressingStyle, *}; use thiserror::Error; -use tikv_util::{ - debug, - stream::{error_stream, retry}, - time::Instant, -}; +use tikv_util::{debug, stream::error_stream, time::Instant}; use tokio::time::{sleep, timeout}; -use crate::util; +use 
crate::util::{self, retry_and_count}; const CONNECTION_TIMEOUT: Duration = Duration::from_secs(900); pub const STORAGE_VENDOR_NAME_AWS: &str = "aws"; @@ -54,6 +50,7 @@ pub struct Config { sse_kms_key_id: Option, storage_class: Option, multi_part_size: usize, + object_lock_enabled: bool, } impl Config { @@ -68,6 +65,7 @@ impl Config { sse_kms_key_id: None, storage_class: None, multi_part_size: MINIMUM_PART_SIZE, + object_lock_enabled: false, } } @@ -100,6 +98,7 @@ impl Config { force_path_style, sse_kms_key_id: StringNonEmpty::opt(attrs.get("sse_kms_key_id").unwrap_or(def).clone()), multi_part_size: MINIMUM_PART_SIZE, + object_lock_enabled: false, }) } @@ -132,6 +131,7 @@ impl Config { force_path_style: input.force_path_style, sse_kms_key_id: StringNonEmpty::opt(input.sse_kms_key_id), multi_part_size: MINIMUM_PART_SIZE, + object_lock_enabled: input.object_lock_enabled, }) } } @@ -221,6 +221,37 @@ impl S3Storage { } key.to_owned() } + + fn get_range(&self, name: &str, range: Option) -> cloud::blob::BlobStream<'_> { + let key = self.maybe_prefix_key(name); + let bucket = self.config.bucket.bucket.clone(); + debug!("read file from s3 storage"; "key" => %key); + let req = GetObjectRequest { + key, + bucket: (*bucket).clone(), + range, + ..Default::default() + }; + Box::new( + self.client + .get_object(req) + .map(move |future| match future { + Ok(out) => out.body.unwrap(), + Err(RusotoError::Service(GetObjectError::NoSuchKey(key))) => { + ByteStream::new(error_stream(io::Error::new( + io::ErrorKind::NotFound, + format!("no key {} at bucket {}", key, *bucket), + ))) + } + Err(e) => ByteStream::new(error_stream(io::Error::new( + io::ErrorKind::Other, + format!("failed to get object {}", e), + ))), + }) + .flatten_stream() + .into_async_read(), + ) + } } /// A helper for uploading a large files to S3 storage. 
@@ -236,6 +267,7 @@ struct S3Uploader<'client> { sse_kms_key_id: Option, storage_class: Option, multi_part_size: usize, + object_lock_enabled: bool, upload_id: String, parts: Vec, @@ -259,8 +291,9 @@ impl From> for UploadError { } /// try_read_exact tries to read exact length data as the buffer size. -/// like [`std::io::Read::read_exact`], but won't return `UnexpectedEof` when cannot read anything more from the `Read`. -/// once returning a size less than the buffer length, implies a EOF was meet, or nothing readed. +/// like [`std::io::Read::read_exact`], but won't return `UnexpectedEof` when +/// cannot read anything more from the `Read`. once returning a size less than +/// the buffer length, implies a EOF was meet, or nothing read. async fn try_read_exact( r: &mut R, buf: &mut [u8], @@ -278,12 +311,20 @@ async fn try_read_exact( } } +fn get_content_md5(object_lock_enabled: bool, content: &[u8]) -> Option { + object_lock_enabled.then(|| { + let digest = md5::compute(content); + base64::encode(digest.0) + }) +} + /// Specifies the minimum size to use multi-part upload. /// AWS S3 requires each part to be at least 5 MiB. const MINIMUM_PART_SIZE: usize = 5 * 1024 * 1024; impl<'client> S3Uploader<'client> { - /// Creates a new uploader with a given target location and upload configuration. + /// Creates a new uploader with a given target location and upload + /// configuration. fn new(client: &'client S3Client, config: &Config, key: String) -> Self { Self { client, @@ -294,6 +335,7 @@ impl<'client> S3Uploader<'client> { sse_kms_key_id: config.sse_kms_key_id.as_ref().cloned(), storage_class: config.storage_class.as_ref().cloned(), multi_part_size: config.multi_part_size, + object_lock_enabled: config.object_lock_enabled, upload_id: "".to_owned(), parts: Vec::new(), } @@ -309,11 +351,11 @@ impl<'client> S3Uploader<'client> { // For short files, execute one put_object to upload the entire thing. 
let mut data = Vec::with_capacity(est_len as usize); reader.read_to_end(&mut data).await?; - retry(|| self.upload(&data)).await?; + retry_and_count(|| self.upload(&data), "upload_small_file").await?; Ok(()) } else { // Otherwise, use multipart upload to improve robustness. - self.upload_id = retry(|| self.begin()).await?; + self.upload_id = retry_and_count(|| self.begin(), "begin_upload").await?; let upload_res = async { let mut buf = vec![0; self.multi_part_size]; let mut part_number = 1; @@ -322,7 +364,11 @@ impl<'client> S3Uploader<'client> { if data_size == 0 { break; } - let part = retry(|| self.upload_part(part_number, &buf[..data_size])).await?; + let part = retry_and_count( + || self.upload_part(part_number, &buf[..data_size]), + "upload_part", + ) + .await?; self.parts.push(part); part_number += 1; } @@ -331,9 +377,9 @@ impl<'client> S3Uploader<'client> { .await; if upload_res.is_ok() { - retry(|| self.complete()).await?; + retry_and_count(|| self.complete(), "complete_upload").await?; } else { - let _ = retry(|| self.abort()).await; + let _ = retry_and_count(|| self.abort(), "abort_upload").await; } upload_res } @@ -370,7 +416,8 @@ impl<'client> S3Uploader<'client> { } } - /// Completes a multipart upload process, asking S3 to join all parts into a single file. + /// Completes a multipart upload process, asking S3 to join all parts into a + /// single file. async fn complete(&self) -> Result<(), RusotoError> { let res = timeout( Self::get_timeout(), @@ -429,6 +476,7 @@ impl<'client> S3Uploader<'client> { upload_id: self.upload_id.clone(), part_number, content_length: Some(data.len() as i64), + content_md5: get_content_md5(self.object_lock_enabled, data), body: Some(data.to_vec().into()), ..Default::default() }) @@ -452,8 +500,8 @@ impl<'client> S3Uploader<'client> { /// Uploads a file atomically. /// - /// This should be used only when the data is known to be short, and thus relatively cheap to - /// retry the entire upload. 
+ /// This should be used only when the data is known to be short, and thus + /// relatively cheap to retry the entire upload. async fn upload(&self, data: &[u8]) -> Result<(), RusotoError> { let res = timeout(Self::get_timeout(), async { #[cfg(feature = "failpoints")] @@ -471,7 +519,6 @@ impl<'client> S3Uploader<'client> { sleep(delay_duration).await; } - #[cfg(feature = "failpoints")] fail_point!("s3_put_obj_err", |_| { Err(RusotoError::ParseError("failed to put object".to_owned())) }); @@ -490,6 +537,7 @@ impl<'client> S3Uploader<'client> { ssekms_key_id: self.sse_kms_key_id.as_ref().map(|s| s.to_string()), storage_class: self.storage_class.as_ref().map(|s| s.to_string()), content_length: Some(data.len() as i64), + content_md5: get_content_md5(self.object_lock_enabled, data), body: Some(data.to_vec().into()), ..Default::default() }) @@ -515,7 +563,7 @@ impl<'client> S3Uploader<'client> { } } -const STORAGE_NAME: &str = "s3"; +pub const STORAGE_NAME: &str = "s3"; #[async_trait] impl BlobStorage for S3Storage { @@ -540,41 +588,20 @@ impl BlobStorage for S3Storage { } else { io::ErrorKind::Other }; - // Even we can check whether there is an `io::Error` internal and extract it directly, - // We still need to keep the message 'failed to put object' here for adapting the string-matching based - // retry logic in BR :( + // Even we can check whether there is an `io::Error` internal and extract it + // directly, We still need to keep the message 'failed to put object' here for + // adapting the string-matching based retry logic in BR :( io::Error::new(error_code, format!("failed to put object {}", e)) }) } - fn get(&self, name: &str) -> Box { - let key = self.maybe_prefix_key(name); - let bucket = self.config.bucket.bucket.clone(); - debug!("read file from s3 storage"; "key" => %key); - let req = GetObjectRequest { - key, - bucket: (*bucket).clone(), - ..Default::default() - }; - Box::new( - self.client - .get_object(req) - .map(move |future| match future { - Ok(out) => 
out.body.unwrap(), - Err(RusotoError::Service(GetObjectError::NoSuchKey(key))) => { - ByteStream::new(error_stream(io::Error::new( - io::ErrorKind::NotFound, - format!("no key {} at bucket {}", key, *bucket), - ))) - } - Err(e) => ByteStream::new(error_stream(io::Error::new( - io::ErrorKind::Other, - format!("failed to get object {}", e), - ))), - }) - .flatten_stream() - .into_async_read(), - ) + fn get(&self, name: &str) -> cloud::blob::BlobStream<'_> { + self.get_range(name, None) + } + + fn get_part(&self, name: &str, off: u64, len: u64) -> cloud::blob::BlobStream<'_> { + // inclusive, bytes=0-499 -> [0, 499] + self.get_range(name, Some(format!("bytes={}-{}", off, off + len - 1))) } } @@ -588,6 +615,18 @@ mod tests { use super::*; + #[test] + fn test_s3_get_content_md5() { + // base64 encode md5sum "helloworld" + let code = "helloworld".to_string(); + let expect = "/F4DjTilcDIIVEHn/nAQsA==".to_string(); + let actual = get_content_md5(true, code.as_bytes()).unwrap(); + assert_eq!(actual, expect); + + let actual = get_content_md5(false, b"xxx"); + assert!(actual.is_none()) + } + #[test] fn test_s3_config() { let bucket_name = StringNonEmpty::required("mybucket".to_string()).unwrap(); @@ -628,7 +667,8 @@ mod tests { // set multi_part_size to use upload_part function config.multi_part_size = multi_part_size; - // split magic_contents into 3 parts, so we mock 5 requests here(1 begin + 3 part + 1 complete) + // split magic_contents into 3 parts, so we mock 5 requests here(1 begin + 3 + // part + 1 complete) let dispatcher = MultipleMockRequestDispatcher::new(vec![ MockRequestDispatcher::with_status(200).with_body( r#" @@ -647,14 +687,13 @@ mod tests { let s = S3Storage::new_creds_dispatcher(config, dispatcher, credentials_provider).unwrap(); - let resp = s - .put( - "mykey", - PutResource(Box::new(magic_contents.as_bytes())), - magic_contents.len() as u64, - ) - .await; - assert!(resp.is_ok()); + s.put( + "mykey", + PutResource(Box::new(magic_contents.as_bytes())), + 
magic_contents.len() as u64, + ) + .await + .unwrap(); assert_eq!( CLOUD_REQUEST_HISTOGRAM_VEC .get_metric_with_label_values(&["s3", "upload_part"]) @@ -704,15 +743,14 @@ mod tests { // inject put error let s3_put_obj_err_fp = "s3_put_obj_err"; fail::cfg(s3_put_obj_err_fp, "return").unwrap(); - let resp = s - .put( - "mykey", - PutResource(Box::new(magic_contents.as_bytes())), - magic_contents.len() as u64, - ) - .await; + s.put( + "mykey", + PutResource(Box::new(magic_contents.as_bytes())), + magic_contents.len() as u64, + ) + .await + .unwrap_err(); fail::remove(s3_put_obj_err_fp); - assert!(resp.is_err()); // test timeout let s3_timeout_injected_fp = "s3_timeout_injected"; @@ -722,30 +760,27 @@ mod tests { fail::cfg(s3_timeout_injected_fp, "return(100)").unwrap(); // inject 200ms delay fail::cfg(s3_sleep_injected_fp, "return(200)").unwrap(); - let resp = s - .put( - "mykey", - PutResource(Box::new(magic_contents.as_bytes())), - magic_contents.len() as u64, - ) - .await; - fail::remove(s3_sleep_injected_fp); // timeout occur due to delay 200ms - assert!(resp.is_err()); + s.put( + "mykey", + PutResource(Box::new(magic_contents.as_bytes())), + magic_contents.len() as u64, + ) + .await + .unwrap_err(); + fail::remove(s3_sleep_injected_fp); // inject 50ms delay fail::cfg(s3_sleep_injected_fp, "return(50)").unwrap(); - let resp = s - .put( - "mykey", - PutResource(Box::new(magic_contents.as_bytes())), - magic_contents.len() as u64, - ) - .await; + s.put( + "mykey", + PutResource(Box::new(magic_contents.as_bytes())), + magic_contents.len() as u64, + ) + .await + .unwrap(); fail::remove(s3_sleep_injected_fp); fail::remove(s3_timeout_injected_fp); - // no timeout - assert!(resp.is_ok()); } #[test] @@ -904,7 +939,8 @@ mod tests { use self::try_read_exact; - /// ThrottleRead throttles a `Read` -- make it emits 2 chars for each `read` call. + /// ThrottleRead throttles a `Read` -- make it emits 2 chars for each + /// `read` call. 
struct ThrottleRead(R); impl Read for ThrottleRead { fn read(&mut self, buf: &mut [u8]) -> io::Result { diff --git a/components/cloud/aws/src/util.rs b/components/cloud/aws/src/util.rs index c4ff356f462..a2dc1ca8c76 100644 --- a/components/cloud/aws/src/util.rs +++ b/components/cloud/aws/src/util.rs @@ -3,6 +3,8 @@ use std::io::{self, Error, ErrorKind}; use async_trait::async_trait; +use cloud::metrics; +use futures::{future::TryFutureExt, Future}; use rusoto_core::{ region::Region, request::{HttpClient, HttpConfig}, @@ -11,10 +13,36 @@ use rusoto_credential::{ AutoRefreshingProvider, AwsCredentials, ChainProvider, CredentialsError, ProvideAwsCredentials, }; use rusoto_sts::WebIdentityProvider; +use tikv_util::{ + stream::{retry_ext, RetryError, RetryExt}, + warn, +}; #[allow(dead_code)] // This will be used soon, please remove the allow. const READ_BUF_SIZE: usize = 1024 * 1024 * 2; +const AWS_WEB_IDENTITY_TOKEN_FILE: &str = "AWS_WEB_IDENTITY_TOKEN_FILE"; +struct CredentialsErrorWrapper(CredentialsError); + +impl From for CredentialsError { + fn from(c: CredentialsErrorWrapper) -> CredentialsError { + c.0 + } +} + +impl std::fmt::Display for CredentialsErrorWrapper { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0.message)?; + Ok(()) + } +} + +impl RetryError for CredentialsErrorWrapper { + fn is_retryable(&self) -> bool { + true + } +} + pub fn new_http_client() -> io::Result { let mut http_config = HttpConfig::new(); // This can greatly improve performance dealing with payloads greater @@ -49,6 +77,22 @@ pub fn get_region(region: &str, endpoint: &str) -> io::Result { } } +pub async fn retry_and_count(action: G, name: &'static str) -> Result +where + G: FnMut() -> F, + F: Future>, + E: RetryError + std::fmt::Display, +{ + let id = uuid::Uuid::new_v4(); + retry_ext( + action, + RetryExt::default().with_fail_hook(move |err: &E| { + warn!("aws request meet error."; "err" => %err, "retry?" 
=> %err.is_retryable(), "context" => %name, "uuid" => %id); + metrics::CLOUD_ERROR_VEC.with_label_values(&["aws", name]).inc(); + }), + ).await +} + pub struct CredentialsProvider(AutoRefreshingProvider); impl CredentialsProvider { @@ -92,21 +136,81 @@ impl Default for DefaultCredentialsProvider { #[async_trait] impl ProvideAwsCredentials for DefaultCredentialsProvider { async fn credentials(&self) -> Result { - // Prefer the web identity provider first for the kubernetes environment. - // Search for both in parallel. - let web_creds = self.web_identity_provider.credentials(); - let def_creds = self.default_provider.credentials(); - let k8s_error = match web_creds.await { - res @ Ok(_) => return res, - Err(e) => e, - }; - let def_error = match def_creds.await { - res @ Ok(_) => return res, - Err(e) => e, + // use web identity provider first for the kubernetes environment. + let cred = if std::env::var(AWS_WEB_IDENTITY_TOKEN_FILE).is_ok() { + // we need invoke assume_role in web identity provider + // this API may failed sometimes. + // according to AWS experience, it's better to retry it with 10 times + // exponential backoff for every error, because we cannot + // distinguish the error type. + retry_and_count( + || { + #[cfg(test)] + fail::fail_point!("cred_err", |_| { + Box::pin(futures::future::err(CredentialsErrorWrapper( + CredentialsError::new("injected error"), + ))) + as std::pin::Pin + Send>> + }); + let res = self + .web_identity_provider + .credentials() + .map_err(|e| CredentialsErrorWrapper(e)); + #[cfg(test)] + return Box::pin(res); + #[cfg(not(test))] + res + }, + "get_cred_over_the_cloud", + ) + .await + .map_err(|e| e.0) + } else { + // Add exponential backoff for every error, because we cannot + // distinguish the error type. 
+ retry_and_count( + || { + self.default_provider + .credentials() + .map_err(|e| CredentialsErrorWrapper(e)) + }, + "get_cred_on_premise", + ) + .await + .map_err(|e| e.0) }; - Err(CredentialsError::new(format_args!( - "Couldn't find AWS credentials in default sources ({}) or k8s environment ({}).", - def_error.message, k8s_error.message, - ))) + + cred.map_err(|e| { + CredentialsError::new(format_args!( + "Couldn't find AWS credentials in sources ({}).", + e.message + )) + }) + } +} + +#[cfg(test)] +mod tests { + #[allow(unused_imports)] + use super::*; + + #[cfg(feature = "failpoints")] + #[tokio::test] + async fn test_default_provider() { + let default_provider = DefaultCredentialsProvider::default(); + std::env::set_var(AWS_WEB_IDENTITY_TOKEN_FILE, "tmp"); + // mock k8s env with web_identitiy_provider + fail::cfg("cred_err", "return").unwrap(); + fail::cfg("retry_count", "return(1)").unwrap(); + let res = default_provider.credentials().await; + assert_eq!(res.is_err(), true); + assert_eq!( + res.err().unwrap().message, + "Couldn't find AWS credentials in sources (injected error)." 
+ ); + fail::remove("cred_err"); + fail::remove("retry_count"); + + std::env::remove_var(AWS_WEB_IDENTITY_TOKEN_FILE); } } diff --git a/components/cloud/azure/Cargo.toml b/components/cloud/azure/Cargo.toml index 042898c31d5..b9ba7732e9e 100644 --- a/components/cloud/azure/Cargo.toml +++ b/components/cloud/azure/Cargo.toml @@ -6,18 +6,25 @@ publish = false [dependencies] async-trait = "0.1" -azure_core = { version = "0.1.0", git = "https://github.com/Azure/azure-sdk-for-rust"} -azure_identity = { version = "0.1.0", git = "https://github.com/Azure/azure-sdk-for-rust" } -azure_storage = { version = "0.1.0", git = "https://github.com/Azure/azure-sdk-for-rust", default-features = false, features = ["account", "blob"] } +azure_core = { version = "0.11.0", git = "https://github.com/Azure/azure-sdk-for-rust" } +azure_identity = { version = "0.11.0", git = "https://github.com/Azure/azure-sdk-for-rust" } +azure_storage = { version = "0.11.0", git = "https://github.com/Azure/azure-sdk-for-rust", default-features = false } +azure_storage_blobs = { version = "0.11.0", git = "https://github.com/Azure/azure-sdk-for-rust" } base64 = "0.13" -chrono = "0.4" -cloud = { path = "../", default-features = false } +cloud = { workspace = true } futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } +lazy_static = "1.4.0" oauth2 = { version = "4.0.0", default-features = false } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../../tikv_util", default-features = false } +openssl = { version = "0.10.50" } +regex = "1" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +slog = { workspace = true } +slog-global = { workspace = true } +tikv_util 
= { workspace = true } +time = { version = "0.3", features = ["local-offset"] } tokio = { version = "1.5", features = ["time"] } url = "2.0" +uuid = { version = "1.0", features = ["v4"] } diff --git a/components/cloud/azure/src/azblob.rs b/components/cloud/azure/src/azblob.rs index c322f1d0edc..7f7483a3e8a 100644 --- a/components/cloud/azure/src/azblob.rs +++ b/components/cloud/azure/src/azblob.rs @@ -8,17 +8,15 @@ use std::{ use async_trait::async_trait; use azure_core::{ auth::{TokenCredential, TokenResponse}, - prelude::*, + new_http_client, }; -use azure_identity::token_credentials::{ClientSecretCredential, TokenCredentialOptions}; -use azure_storage::{ - blob::prelude::*, - core::{prelude::*, ConnectionStringBuilder}, -}; -use chrono::{Duration as ChronoDuration, Utc}; +use azure_identity::{ClientSecretCredential, TokenCredentialOptions}; +use azure_storage::{prelude::*, ConnectionString, ConnectionStringBuilder}; +use azure_storage_blobs::prelude::*; use cloud::blob::{ none_to_empty, BlobConfig, BlobStorage, BucketConf, PutResource, StringNonEmpty, }; +use futures::TryFutureExt; use futures_util::{ io::{AsyncRead, AsyncReadExt}, stream, @@ -26,11 +24,14 @@ use futures_util::{ TryStreamExt, }; pub use kvproto::brpb::{AzureBlobStorage as InputConfig, Bucket as InputBucket, CloudDynamic}; +use lazy_static::lazy_static; use oauth2::{ClientId, ClientSecret}; +use regex::Regex; use tikv_util::{ debug, stream::{retry, RetryError}, }; +use time::OffsetDateTime; use tokio::{ sync::Mutex, time::{timeout, Duration}, @@ -224,6 +225,7 @@ impl BlobConfig for Config { enum RequestError { InvalidInput(Box, String), + InternalError(String), TimeOut(String), } @@ -233,6 +235,7 @@ impl From for io::Error { RequestError::InvalidInput(e, tag) => { Self::new(io::ErrorKind::InvalidInput, format!("{}: {}", tag, &e)) } + RequestError::InternalError(msg) => Self::new(io::ErrorKind::Other, msg), RequestError::TimeOut(msg) => Self::new(io::ErrorKind::TimedOut, msg), } } @@ -240,15 
+243,24 @@ impl From for io::Error { impl RetryError for RequestError { fn is_retryable(&self) -> bool { - matches!(self, Self::TimeOut(_)) + matches!(self, Self::TimeOut(_) | Self::InternalError(_)) + } +} + +fn err_is_retryable(err_info: &str) -> bool { + // HTTP Code 503: The server is busy + // HTTP Code 500: Operation could not be completed within the specified time. + // More details seen in https://learn.microsoft.com/en-us/rest/api/storageservices/blob-service-error-codes + lazy_static! { + static ref RE: Regex = Regex::new(r"status: 5[0-9][0-9],").unwrap(); } + + RE.is_match(err_info) } const CONNECTION_TIMEOUT: Duration = Duration::from_secs(900); /// A helper for uploading a large file to Azure storage. -/// -/// struct AzureUploader { client_builder: Arc, name: String, @@ -257,7 +269,8 @@ struct AzureUploader { } impl AzureUploader { - /// Creates a new uploader with a given target location and upload configuration. + /// Creates a new uploader with a given target location and upload + /// configuration. fn new(client_builder: Arc, config: &Config, name: String) -> Self { AzureUploader { client_builder, @@ -288,18 +301,17 @@ impl AzureUploader { /// Uploads a file atomically. /// - /// This should be used only when the data is known to be short, and thus relatively cheap to - /// retry the entire upload. + /// This should be used only when the data is known to be short, and thus + /// relatively cheap to retry the entire upload. async fn upload(&self, data: &[u8]) -> Result<(), RequestError> { match timeout(Self::get_timeout(), async { self.client_builder .get_client() .await .map_err(|e| e.to_string())? 
- .as_blob_client(&self.name) + .blob_client(&self.name) .put_block_blob(data.to_vec()) .access_tier(self.storage_class) - .execute() .await?; Ok(()) }) @@ -309,10 +321,9 @@ impl AzureUploader { Ok(_) => Ok(()), Err(err) => { let err_info = ToString::to_string(&err); - if err_info.contains("busy") { - // server is busy, retry later - Err(RequestError::TimeOut(format!( - "the resource is busy: {}, retry later", + if err_is_retryable(&err_info) { + Err(RequestError::InternalError(format!( + "internal error: {}, retry later", err_info ))) } else { @@ -401,13 +412,13 @@ impl ContainerBuilder for TokenCredContainerBuilder { { let token_response = self.token_cache.read().unwrap(); if let Some(ref t) = *token_response { - let interval = t.0.expires_on - Utc::now(); + let interval = (t.0.expires_on - OffsetDateTime::now_utc()).whole_minutes(); // keep token updated 5 minutes before it expires - if interval > ChronoDuration::minutes(TOKEN_UPDATE_LEFT_TIME_MINS) { + if interval > TOKEN_UPDATE_LEFT_TIME_MINS { return Ok(t.1.clone()); } - if interval > ChronoDuration::minutes(TOKEN_EXPIRE_LEFT_TIME_MINS) { + if interval > TOKEN_EXPIRE_LEFT_TIME_MINS { // there still have time to use the token, // and only need one thread to update token. 
if let Ok(l) = self.modify_place.try_lock() { @@ -430,9 +441,9 @@ impl ContainerBuilder for TokenCredContainerBuilder { { let token_response = self.token_cache.read().unwrap(); if let Some(ref t) = *token_response { - let interval = t.0.expires_on - Utc::now(); + let interval = (t.0.expires_on - OffsetDateTime::now_utc()).whole_minutes(); // token is already updated - if interval > ChronoDuration::minutes(TOKEN_UPDATE_LEFT_TIME_MINS) { + if interval > TOKEN_UPDATE_LEFT_TIME_MINS { return Ok(t.1.clone()); } } @@ -444,14 +455,12 @@ impl ContainerBuilder for TokenCredContainerBuilder { .get_token(&self.token_resource) .await .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, format!("{}", &e)))?; - let http_client = new_http_client(); - let storage_client = StorageAccountClient::new_bearer_token( - http_client, + let blob_service = BlobServiceClient::new( self.account_name.clone(), - token.token.secret(), - ) - .as_storage_client() - .as_container_client(self.container_name.clone()); + StorageCredentials::BearerToken(token.token.secret().into()), + ); + let storage_client = + Arc::new(blob_service.container_client(self.container_name.clone())); { let mut token_response = self.token_cache.write().unwrap(); @@ -480,22 +489,54 @@ impl AzureStorage { Self::new(Config::from_input(input)?) } + /// Mock a dummpy AzureStorage with a shared key Config for + /// testing by Azurite tool. + /// + /// This function should only be used for testing Blob with a + /// local Azurite server. 
+ #[cfg(test)] + #[allow(dead_code)] + fn from_dummy_input(input: InputConfig) -> io::Result { + let config = Config::from_input(input)?; + let bucket = (*config.bucket.bucket).to_owned(); + Ok(AzureStorage { + config, + client_builder: Arc::new(SharedKeyContainerBuilder { + container_client: Arc::new( + ClientBuilder::emulator() + .blob_service_client() + .container_client(bucket), + ), + }), + }) + } + pub fn from_cloud_dynamic(cloud_dynamic: &CloudDynamic) -> io::Result { Self::new(Config::from_cloud_dynamic(cloud_dynamic)?) } pub fn new(config: Config) -> io::Result { + let bucket = (*config.bucket.bucket).to_owned(); // priority: explicit shared key > env Azure AD > env shared key if let Some(connection_string) = config.parse_plaintext_account_url() { - let bucket = (*config.bucket.bucket).to_owned(); - let http_client = new_http_client(); - let container_client = StorageAccountClient::new_connection_string( - http_client.clone(), - connection_string.as_str(), - ) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, format!("{}", &e)))? - .as_storage_client() - .as_container_client(bucket); + let account_name = config.get_account_name()?; + let storage_credentials = ConnectionString::new(&connection_string) + .map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("invalid configurations for SharedKey, err: {}", e), + ) + })? 
+ .storage_credentials() + .map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("invalid credentials for blob, err: {}", e), + ) + })?; + let container_client = Arc::new( + BlobServiceClient::new(account_name, storage_credentials).container_client(bucket), + ); let client_builder = Arc::new(SharedKeyContainerBuilder { container_client }); Ok(AzureStorage { @@ -503,10 +544,10 @@ impl AzureStorage { client_builder, }) } else if let Some(credential_info) = config.credential_info.as_ref() { - let bucket = (*config.bucket.bucket).to_owned(); let account_name = config.get_account_name()?; let token_resource = format!("https://{}.blob.core.windows.net", &account_name); let cred = ClientSecretCredential::new( + new_http_client(), credential_info.tenant_id.clone(), credential_info.client_id.to_string(), credential_info.client_secret.secret().clone(), @@ -525,15 +566,24 @@ impl AzureStorage { client_builder, }) } else if let Some(connection_string) = config.parse_env_plaintext_account_url() { - let bucket = (*config.bucket.bucket).to_owned(); - let http_client = new_http_client(); - let container_client = StorageAccountClient::new_connection_string( - http_client.clone(), - connection_string.as_str(), - ) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, format!("{}", &e)))? - .as_storage_client() - .as_container_client(bucket); + let account_name = config.get_account_name()?; + let storage_credentials = ConnectionString::new(&connection_string) + .map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("invald configurations for SharedKey from ENV, err: {}", e), + ) + })? 
+ .storage_credentials() + .map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("invalid credentials for blob, err: {}", e), + ) + })?; + let container_client = Arc::new( + BlobServiceClient::new(account_name, storage_credentials).container_client(bucket), + ); let client_builder = Arc::new(SharedKeyContainerBuilder { container_client }); Ok(AzureStorage { @@ -554,6 +604,38 @@ impl AzureStorage { } key.to_owned() } + + fn get_range( + &self, + name: &str, + range: Option>, + ) -> cloud::blob::BlobStream<'_> { + let name = self.maybe_prefix_key(name); + debug!("read file from Azure storage"; "key" => %name); + let t = async move { + let blob_client = self.client_builder.get_client().await?.blob_client(name); + + let builder = if let Some(r) = range { + blob_client.get().range(r) + } else { + blob_client.get() + }; + + let mut chunk: Vec = vec![]; + let mut stream = builder.into_stream(); + while let Some(value) = stream.next().await { + let value = value?.data.collect().await?; + chunk.extend(&value); + } + azure_core::Result::Ok(chunk) + }; + let stream = stream::once( + t.map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, format!("{}", e))), + ) + .boxed() + .into_async_read(); + Box::new(stream) + } } #[async_trait] @@ -576,23 +658,12 @@ impl BlobStorage for AzureStorage { uploader.run(&mut reader, content_length).await } - fn get(&self, name: &str) -> Box { - let name = self.maybe_prefix_key(name); - debug!("read file from Azure storage"; "key" => %name); - let t = async move { - self.client_builder - .get_client() - .await? 
- .as_blob_client(name) - .get() - .execute() - .await - .map(|res| res.data) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, format!("{}", e))) - }; - let k = stream::once(t); - let t = k.boxed().into_async_read(); - Box::new(t) + fn get(&self, name: &str) -> cloud::blob::BlobStream<'_> { + self.get_range(name, None) + } + + fn get_part(&self, name: &str, off: u64, len: u64) -> cloud::blob::BlobStream<'_> { + self.get_range(name, Some(off..off + len)) } } @@ -687,7 +758,7 @@ mod tests { input.set_endpoint("http://127.0.0.1:10000/devstoreaccount1".to_owned()); input.set_prefix("backup 01/prefix/".to_owned()); - let storage = AzureStorage::from_input(input).unwrap(); + let storage = AzureStorage::from_dummy_input(input).unwrap(); assert_eq!(storage.maybe_prefix_key("t"), "backup 01/prefix/t"); let mut magic_contents = String::new(); for _ in 0..4096 { @@ -750,4 +821,24 @@ mod tests { cd.set_bucket(bucket); cd } + + #[tokio::test] + async fn test_error_retryable() { + let err_info = "HTTP error status (status: 503,... The server is busy."; + assert!(err_is_retryable(err_info)); + let err_info = "HTTP error status (status: 500,... Operation could not be completed within the specified time."; + assert!(err_is_retryable(err_info)); + let err_info = + "HTTP error status (status: 409,... The blob type is invalid for this operation."; + assert!(!err_is_retryable(err_info)); + let err_info = "HTTP error status (status: 50,... 
"; + assert!(!err_is_retryable(err_info)); + let err = "NaN".parse::().unwrap_err(); + let err1 = RequestError::InvalidInput(Box::new(err), "invalid-input".to_owned()); + let err2 = RequestError::InternalError("internal-error".to_owned()); + let err3 = RequestError::TimeOut("time-out".to_owned()); + assert!(!err1.is_retryable()); + assert!(err2.is_retryable()); + assert!(err3.is_retryable()); + } } diff --git a/components/cloud/gcp/Cargo.toml b/components/cloud/gcp/Cargo.toml index a9045d6f27c..4c3b8994ffc 100644 --- a/components/cloud/gcp/Cargo.toml +++ b/components/cloud/gcp/Cargo.toml @@ -5,21 +5,23 @@ edition = "2018" publish = false [dependencies] -futures-util = { version = "0.3", default-features = false, features = ["io"] } async-trait = "0.1" +cloud = { workspace = true } +futures-util = { version = "0.3", default-features = false, features = ["io"] } http = "0.2.0" hyper = "0.14" hyper-tls = "0.5" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +kvproto = { workspace = true } +slog = { workspace = true } # better to not use slog-global, but pass in the logger -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog-global = { workspace = true } tame-gcs = { version = "0.10", features = ["async-multipart"] } tame-oauth = "0.4.7" -cloud = { path = "../", default-features = false } -tikv_util = { path = "../../tikv_util", default-features = false } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time"] } url = "2.0" [dev-dependencies] matches = "0.1.8" +pin-project = "1" +tokio = { version = "1.5", features = ["rt"] } diff --git a/components/cloud/gcp/src/gcs.rs b/components/cloud/gcp/src/gcs.rs index 08ee60a52bf..61e432c9431 100644 --- a/components/cloud/gcp/src/gcs.rs +++ b/components/cloud/gcp/src/gcs.rs @@ -2,14 +2,16 @@ use 
std::{convert::TryInto, fmt::Display, io, sync::Arc}; use async_trait::async_trait; -use cloud::blob::{ - none_to_empty, BlobConfig, BlobStorage, BucketConf, PutResource, StringNonEmpty, +use cloud::{ + blob::{none_to_empty, BlobConfig, BlobStorage, BucketConf, PutResource, StringNonEmpty}, + metrics, }; use futures_util::{ future::TryFutureExt, - io::{AsyncRead, AsyncReadExt, Cursor}, + io::{self as async_io, AsyncRead, Cursor}, stream::{StreamExt, TryStreamExt}, }; +use http::HeaderValue; use hyper::{client::HttpConnector, Body, Client, Request, Response, StatusCode}; use hyper_tls::HttpsConnector; pub use kvproto::brpb::{Bucket as InputBucket, CloudDynamic, Gcs as InputConfig}; @@ -19,7 +21,12 @@ use tame_gcs::{ types::{BucketName, ObjectId}, }; use tame_oauth::gcp::{ServiceAccountAccess, ServiceAccountInfo, TokenOrRequest}; -use tikv_util::stream::{error_stream, retry, AsyncReadAsSyncStreamOfBytes, RetryError}; +use tikv_util::{ + stream::{error_stream, AsyncReadAsSyncStreamOfBytes, RetryError}, + time::Instant, +}; + +use crate::utils::retry; const GOOGLE_APIS: &str = "https://www.googleapis.com"; const HARDCODED_ENDPOINTS_SUFFIX: &[&str] = &["upload/storage/v1/", "storage/v1/"]; @@ -127,7 +134,7 @@ impl BlobConfig for Config { // GCS compatible storage #[derive(Clone)] -pub struct GCSStorage { +pub struct GcsStorage { config: Config, svc_access: Option>, client: Client, Body>, @@ -155,6 +162,7 @@ impl ResultExt for Result { } } +#[derive(Debug)] enum RequestError { Hyper(hyper::Error, String), OAuth(tame_oauth::Error, String), @@ -228,7 +236,7 @@ impl RetryError for RequestError { } } -impl GCSStorage { +impl GcsStorage { pub fn from_input(input: InputConfig) -> io::Result { Self::new(Config::from_input(input)?) } @@ -238,7 +246,7 @@ impl GCSStorage { } /// Create a new GCS storage for the given config. 
- pub fn new(config: Config) -> io::Result { + pub fn new(config: Config) -> io::Result { let svc_access = if let Some(si) = &config.svc_info { Some( ServiceAccountAccess::new(si.clone()) @@ -249,7 +257,7 @@ impl GCSStorage { }; let client = Client::builder().build(HttpsConnector::new()); - Ok(GCSStorage { + Ok(GcsStorage { config, svc_access: svc_access.map(Arc::new), client, @@ -339,12 +347,55 @@ impl GCSStorage { Ok(res) } - fn error_to_async_read(kind: io::ErrorKind, e: E) -> Box + fn error_to_async_read(kind: io::ErrorKind, e: E) -> cloud::blob::BlobStream<'static> where E: Into>, { Box::new(error_stream(io::Error::new(kind, e)).into_async_read()) } + + fn get_range(&self, name: &str, range: Option) -> cloud::blob::BlobStream<'_> { + let bucket = self.config.bucket.bucket.to_string(); + let name = self.maybe_prefix_key(name); + debug!("read file from GCS storage"; "key" => %name); + let oid = match ObjectId::new(bucket, name) { + Ok(oid) => oid, + Err(e) => return GcsStorage::error_to_async_read(io::ErrorKind::InvalidInput, e), + }; + let mut request = match Object::download(&oid, None /* optional */) { + Ok(request) => request.map(|_: io::Empty| Body::empty()), + Err(e) => return GcsStorage::error_to_async_read(io::ErrorKind::Other, e), + }; + if let Some(r) = range { + let header_value = match HeaderValue::from_str(&r) { + Ok(v) => v, + Err(e) => return GcsStorage::error_to_async_read(io::ErrorKind::Other, e), + }; + request.headers_mut().insert("Range", header_value); + } + Box::new( + self.make_request(request, tame_gcs::Scopes::ReadOnly) + .and_then(|response| async { + if response.status().is_success() { + Ok(response.into_body().map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("download from GCS error: {}", e), + ) + })) + } else { + Err(status_code_error( + response.status(), + "bucket read".to_string(), + )) + } + }) + .err_into::() + .try_flatten_stream() + .boxed() // this `.boxed()` pin the stream. 
+ .into_async_read(), + ) + } } fn change_host(host: &StringNonEmpty, url: &str) -> Option { @@ -389,20 +440,23 @@ fn parse_predefined_acl(acl: &str) -> Result, &str> { })) } +/// Like AsyncReadExt::read_to_end, but only try to initialize the buffer once. +/// Check https://github.com/rust-lang/futures-rs/issues/2658 for the reason we cannot +/// directly use it. +async fn read_to_end(r: R, v: &mut Vec) -> std::io::Result { + let mut c = Cursor::new(v); + async_io::copy(r, &mut c).await +} + const STORAGE_NAME: &str = "gcs"; #[async_trait] -impl BlobStorage for GCSStorage { +impl BlobStorage for GcsStorage { fn config(&self) -> Box { Box::new(self.config.clone()) as Box } - async fn put( - &self, - name: &str, - mut reader: PutResource, - content_length: u64, - ) -> io::Result<()> { + async fn put(&self, name: &str, reader: PutResource, content_length: u64) -> io::Result<()> { if content_length == 0 { // It is probably better to just write the empty file // However, currently going forward results in a body write aborted error @@ -424,69 +478,57 @@ impl BlobStorage for GCSStorage { ..Default::default() }; - // FIXME: Switch to upload() API so we don't need to read the entire data into memory - // in order to retry. + // FIXME: Switch to upload() API so we don't need to read the entire data into + // memory in order to retry. + let begin = Instant::now_coarse(); let mut data = Vec::with_capacity(content_length as usize); - reader.read_to_end(&mut data).await?; - retry(|| async { - let data = Cursor::new(data.clone()); - let req = Object::insert_multipart( - &bucket, - data, - content_length, - &metadata, - Some(InsertObjectOptional { - predefined_acl: self.config.predefined_acl, - ..Default::default() - }), - ) - .map_err(RequestError::Gcs)? 
- .map(|reader| Body::wrap_stream(AsyncReadAsSyncStreamOfBytes::new(reader))); - self.make_request(req, tame_gcs::Scopes::ReadWrite).await - }) + read_to_end(reader, &mut data).await?; + metrics::CLOUD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["gcp", "read_local"]) + .observe(begin.saturating_elapsed_secs()); + let begin = Instant::now_coarse(); + retry( + || async { + let data = Cursor::new(data.clone()); + let req = Object::insert_multipart( + &bucket, + data, + content_length, + &metadata, + Some(InsertObjectOptional { + predefined_acl: self.config.predefined_acl, + ..Default::default() + }), + ) + .map_err(RequestError::Gcs)? + .map(|reader| Body::wrap_stream(AsyncReadAsSyncStreamOfBytes::new(reader))); + self.make_request(req, tame_gcs::Scopes::ReadWrite).await + }, + "insert_multipart", + ) .await?; + metrics::CLOUD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["gcp", "insert_multipart"]) + .observe(begin.saturating_elapsed_secs()); Ok::<_, io::Error>(()) } - fn get(&self, name: &str) -> Box { - let bucket = self.config.bucket.bucket.to_string(); - let name = self.maybe_prefix_key(name); - debug!("read file from GCS storage"; "key" => %name); - let oid = match ObjectId::new(bucket, name) { - Ok(oid) => oid, - Err(e) => return GCSStorage::error_to_async_read(io::ErrorKind::InvalidInput, e), - }; - let request = match Object::download(&oid, None /*optional*/) { - Ok(request) => request.map(|_: io::Empty| Body::empty()), - Err(e) => return GCSStorage::error_to_async_read(io::ErrorKind::Other, e), - }; - Box::new( - self.make_request(request, tame_gcs::Scopes::ReadOnly) - .and_then(|response| async { - if response.status().is_success() { - Ok(response.into_body().map_err(|e| { - io::Error::new( - io::ErrorKind::Other, - format!("download from GCS error: {}", e), - ) - })) - } else { - Err(status_code_error( - response.status(), - "bucket read".to_string(), - )) - } - }) - .err_into::() - .try_flatten_stream() - .boxed() // this `.boxed()` pin the stream. 
- .into_async_read(), - ) + fn get(&self, name: &str) -> cloud::blob::BlobStream<'_> { + self.get_range(name, None) + } + + fn get_part(&self, name: &str, off: u64, len: u64) -> cloud::blob::BlobStream<'_> { + // inclusive, bytes=0-499 -> [0, 499] + self.get_range(name, Some(format!("bytes={}-{}", off, off + len - 1))) } } #[cfg(test)] mod tests { + extern crate test; + use std::task::Poll; + + use futures_util::AsyncReadExt; use matches::assert_matches; use super::*; @@ -588,6 +630,84 @@ mod tests { assert_eq!(c1.bucket.prefix, c2.bucket.prefix); } + enum ThrottleReadState { + Spawning, + Emitting, + } + /// ThrottleRead throttles a `Read` -- make it emits 2 chars for each + /// `read` call. This is copy & paste from the implmentation from s3.rs. + #[pin_project::pin_project] + struct ThrottleRead { + #[pin] + inner: R, + state: ThrottleReadState, + } + impl AsyncRead for ThrottleRead { + fn poll_read( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut [u8], + ) -> Poll> { + let this = self.project(); + match this.state { + ThrottleReadState::Spawning => { + *this.state = ThrottleReadState::Emitting; + cx.waker().wake_by_ref(); + Poll::Pending + } + ThrottleReadState::Emitting => { + *this.state = ThrottleReadState::Spawning; + this.inner.poll_read(cx, &mut buf[..2]) + } + } + } + } + impl ThrottleRead { + fn new(r: R) -> Self { + Self { + inner: r, + state: ThrottleReadState::Spawning, + } + } + } + + const BENCH_READ_SIZE: usize = 128 * 1024; + + // 255,120,895 ns/iter (+/- 73,332,249) (futures-util 0.3.15) + #[bench] + fn bench_read_to_end(b: &mut test::Bencher) { + let mut v = [0; BENCH_READ_SIZE]; + let mut dst = Vec::with_capacity(BENCH_READ_SIZE); + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + + b.iter(|| { + let mut r = ThrottleRead::new(Cursor::new(&mut v)); + dst.clear(); + + rt.block_on(r.read_to_end(&mut dst)).unwrap(); + assert_eq!(dst.len(), BENCH_READ_SIZE) + }) + } + + // 5,850,042 
ns/iter (+/- 3,787,438) + #[bench] + fn bench_manual_read_to_end(b: &mut test::Bencher) { + let mut v = [0; BENCH_READ_SIZE]; + let mut dst = Vec::with_capacity(BENCH_READ_SIZE); + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + b.iter(|| { + let r = ThrottleRead::new(Cursor::new(&mut v)); + dst.clear(); + + rt.block_on(read_to_end(r, &mut dst)).unwrap(); + assert_eq!(dst.len(), BENCH_READ_SIZE) + }) + } + fn cloud_dynamic_from_input(mut gcs: InputConfig) -> CloudDynamic { let mut bucket = InputBucket::default(); if !gcs.endpoint.is_empty() { diff --git a/components/cloud/gcp/src/lib.rs b/components/cloud/gcp/src/lib.rs index e023ca9c6eb..9ad97793988 100644 --- a/components/cloud/gcp/src/lib.rs +++ b/components/cloud/gcp/src/lib.rs @@ -1,7 +1,26 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. +#![feature(test)] #[macro_use] extern crate slog_global; mod gcs; -pub use gcs::{Config, GCSStorage}; +pub use gcs::{Config, GcsStorage}; + +pub mod utils { + use std::future::Future; + + use cloud::metrics; + use tikv_util::stream::{retry_ext, RetryError, RetryExt}; + pub async fn retry(action: G, name: &'static str) -> Result + where + G: FnMut() -> F, + F: Future>, + E: RetryError + std::fmt::Debug, + { + retry_ext(action, RetryExt::default().with_fail_hook(move |err: &E| { + warn!("gcp request meet error."; "err" => ?err, "retry?" => %err.is_retryable(), "context" => %name); + metrics::CLOUD_ERROR_VEC.with_label_values(&["gcp", name]).inc(); + })).await + } +} diff --git a/components/cloud/src/blob.rs b/components/cloud/src/blob.rs index 4685b5ae851..84ca77042d7 100644 --- a/components/cloud/src/blob.rs +++ b/components/cloud/src/blob.rs @@ -15,9 +15,12 @@ pub trait BlobConfig: 'static + Send + Sync { /// It is identity to [external_storage::UnpinReader], /// only for decoupling external_storage and cloud package. /// -/// See the documentation of [external_storage::UnpinReader] for why those wrappers exists. 
+/// See the documentation of [external_storage::UnpinReader] for why those +/// wrappers exists. pub struct PutResource(pub Box); +pub type BlobStream<'a> = Box; + impl AsyncRead for PutResource { fn poll_read( self: Pin<&mut Self>, @@ -44,7 +47,10 @@ pub trait BlobStorage: 'static + Send + Sync { async fn put(&self, name: &str, reader: PutResource, content_length: u64) -> io::Result<()>; /// Read all contents of the given path. - fn get(&self, name: &str) -> Box; + fn get(&self, name: &str) -> BlobStream<'_>; + + /// Read part of contents of the given path. + fn get_part(&self, name: &str, off: u64, len: u64) -> BlobStream<'_>; } impl BlobConfig for dyn BlobStorage { @@ -68,9 +74,13 @@ impl BlobStorage for Box { fut.await } - fn get(&self, name: &str) -> Box { + fn get(&self, name: &str) -> BlobStream<'_> { (**self).get(name) } + + fn get_part(&self, name: &str, off: u64, len: u64) -> BlobStream<'_> { + (**self).get_part(name, off, len) + } } #[derive(Clone, Debug, PartialEq)] diff --git a/components/cloud/src/metrics.rs b/components/cloud/src/metrics.rs index e115abe0853..58e267a56fa 100644 --- a/components/cloud/src/metrics.rs +++ b/components/cloud/src/metrics.rs @@ -10,4 +10,10 @@ lazy_static! 
{ &["cloud", "req"] ) .unwrap(); + pub static ref CLOUD_ERROR_VEC: IntCounterVec = register_int_counter_vec!( + "tikv_cloud_error_count", + "Total number of credentail errors from EKS env", + &["cloud", "error"] + ) + .unwrap(); } diff --git a/components/codec/Cargo.toml b/components/codec/Cargo.toml index 93e91209d66..8b00f077863 100644 --- a/components/codec/Cargo.toml +++ b/components/codec/Cargo.toml @@ -6,14 +6,14 @@ publish = false [dependencies] byteorder = "1.2" -error_code = { path = "../error_code", default-features = false } +error_code = { workspace = true } libc = "0.2" static_assertions = { version = "1.0", features = ["nightly"] } thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } [dev-dependencies] bytes = "1.0" -panic_hook = { path = "../panic_hook" } +panic_hook = { workspace = true } protobuf = "2" rand = "0.8" diff --git a/components/codec/src/buffer.rs b/components/codec/src/buffer.rs index e19e66b91e1..f40ee1fae4f 100644 --- a/components/codec/src/buffer.rs +++ b/components/codec/src/buffer.rs @@ -23,11 +23,13 @@ pub trait BufferReader { /// TODO: We should make the panic behaviour deterministic. fn advance(&mut self, count: usize); - /// Read next several bytes as a slice and advance the position of internal cursor. + /// Read next several bytes as a slice and advance the position of internal + /// cursor. /// /// # Errors /// - /// Returns `Error::Io` if there is not enough space to read specified number of bytes. + /// Returns `Error::Io` if there is not enough space to read specified + /// number of bytes. fn read_bytes(&mut self, count: usize) -> Result<&[u8]>; } @@ -129,14 +131,16 @@ pub trait BufferWriter { /// The caller may hint the underlying buffer to grow according to `size` /// if the underlying buffer is dynamically sized (i.e. is capable to grow). /// - /// The size of the returned slice may be less than `size` given. 
For example, - /// when underlying buffer is fixed sized and there is no enough space any more. + /// The size of the returned slice may be less than `size` given. For + /// example, when underlying buffer is fixed sized and there is no + /// enough space any more. /// /// # Safety /// - /// The returned mutable slice is for writing only and should be never used for - /// reading since it might contain uninitialized memory when underlying buffer - /// is dynamically sized. For this reason, this function is marked `unsafe`. + /// The returned mutable slice is for writing only and should be never used + /// for reading since it might contain uninitialized memory when + /// underlying buffer is dynamically sized. For this reason, this + /// function is marked `unsafe`. unsafe fn bytes_mut(&mut self, size: usize) -> &mut [u8]; /// Advances the position of internal cursor for a previous write. @@ -339,7 +343,7 @@ mod tests { // Read more bytes than available buffer.set_position(39); - assert!(buffer.read_bytes(2).is_err()); + buffer.read_bytes(2).unwrap_err(); assert_eq!(buffer.position(), 39); assert_eq!(buffer.bytes(), &base[39..40]); } @@ -374,14 +378,14 @@ mod tests { assert_eq!(buffer, &base[21..40]); assert_eq!(buffer.bytes(), &base[21..40]); - assert!(buffer.read_bytes(20).is_err()); + buffer.read_bytes(20).unwrap_err(); buffer.advance(19); assert_eq!(buffer, &[]); assert_eq!(buffer.bytes(), &[]); assert_eq!(buffer.read_bytes(0).unwrap(), &[]); - assert!(buffer.read_bytes(1).is_err()); + buffer.read_bytes(1).unwrap_err(); } #[test] @@ -420,7 +424,7 @@ mod tests { assert_eq!(buffer.position(), 20); // Write more bytes than available size - assert!(buffer.write_bytes(&base_write[20..]).is_err()); + buffer.write_bytes(&base_write[20..]).unwrap_err(); assert_eq!(&buffer.get_ref()[0..20], &base_write[0..20]); assert_eq!(&buffer.get_ref()[20..], &base[20..]); assert_eq!(buffer.position(), 20); @@ -490,7 +494,6 @@ mod tests { let mut buffer = base.clone(); let mut 
buf_slice = buffer.as_mut_slice(); - // let buffer_viewer = std::slice::from_raw_parts(buffer as *const u8, buffer.len()); buf_slice.bytes_mut(13)[..13].clone_from_slice(&base_write[0..13]); assert_eq!(&buf_slice[0..13], &base_write[0..13]); @@ -519,7 +522,7 @@ mod tests { let mut buf_slice = &mut buffer[20..]; // Buffer remain 20, write 21 bytes shall fail. - assert!(buf_slice.write_bytes(&base_write[20..41]).is_err()); + buf_slice.write_bytes(&base_write[20..41]).unwrap_err(); // Write remaining 20 bytes buf_slice.bytes_mut(20)[..20].clone_from_slice(&base_write[20..40]); @@ -584,8 +587,8 @@ mod tests { } } - /// Test whether it is safe to store values in `Vec` after `len()`, i.e. during - /// reallocation these values are copied. + /// Test whether it is safe to store values in `Vec` after `len()`, + /// i.e. during reallocation these values are copied. #[test] // FIXME(#4331) Don't ignore this test. #[ignore] @@ -632,7 +635,6 @@ mod tests { // Re-allocate the vector space and ensure that the address is changed. vec.reserve(::std::cmp::max(payload_len * 3, 32)); - //assert_ne!(vec_ptr, vec.as_ptr()); if vec_ptr == vec.as_ptr() { in_place_reallocs += 1; } diff --git a/components/codec/src/byte.rs b/components/codec/src/byte.rs index 53b8091ac8c..8b5fd928edf 100644 --- a/components/codec/src/byte.rs +++ b/components/codec/src/byte.rs @@ -21,9 +21,9 @@ impl MemComparableByteCodec { (src_len / MEMCMP_GROUP_SIZE + 1) * (MEMCMP_GROUP_SIZE + 1) } - /// Gets the length of the first encoded byte sequence in the given buffer, which is encoded in - /// the memory-comparable format. If the buffer is not complete, the length of buffer will be - /// returned. + /// Gets the length of the first encoded byte sequence in the given buffer, + /// which is encoded in the memory-comparable format. If the buffer is + /// not complete, the length of buffer will be returned. 
#[inline] fn get_first_encoded_len_internal(encoded: &[u8]) -> usize { let mut idx = MEMCMP_GROUP_SIZE; @@ -39,23 +39,25 @@ impl MemComparableByteCodec { } } - /// Gets the length of the first encoded byte sequence in the given buffer, which is encoded in - /// the ascending memory-comparable format. + /// Gets the length of the first encoded byte sequence in the given buffer, + /// which is encoded in the ascending memory-comparable format. pub fn get_first_encoded_len(encoded: &[u8]) -> usize { Self::get_first_encoded_len_internal::(encoded) } - /// Gets the length of the first encoded byte sequence in the given buffer, which is encoded in - /// the descending memory-comparable format. + /// Gets the length of the first encoded byte sequence in the given buffer, + /// which is encoded in the descending memory-comparable format. pub fn get_first_encoded_len_desc(encoded: &[u8]) -> usize { Self::get_first_encoded_len_internal::(encoded) } - /// Encodes all bytes in the `src` into `dest` in ascending memory-comparable format. + /// Encodes all bytes in the `src` into `dest` in ascending + /// memory-comparable format. /// /// Returns the number of bytes encoded. /// - /// `dest` must not overlaps `src`, otherwise encoded results will be incorrect. + /// `dest` must not overlaps `src`, otherwise encoded results will be + /// incorrect. /// /// # Panics /// @@ -99,7 +101,8 @@ impl MemComparableByteCodec { } } - /// Encodes the bytes `src[..len]` in ascending memory-comparable format in place. + /// Encodes the bytes `src[..len]` in ascending memory-comparable format in + /// place. /// /// Returns the number of bytes encoded. /// @@ -159,11 +162,13 @@ impl MemComparableByteCodec { } } - /// Encodes all bytes in the `src` into `dest` in descending memory-comparable format. + /// Encodes all bytes in the `src` into `dest` in descending + /// memory-comparable format. /// /// Returns the number of bytes encoded. 
/// - /// `dest` must not overlaps `src`, otherwise encoded results will be incorrect. + /// `dest` must not overlaps `src`, otherwise encoded results will be + /// incorrect. /// /// # Panics /// @@ -176,7 +181,8 @@ impl MemComparableByteCodec { encoded_len } - /// Encodes the bytes `src[..len]` in descending memory-comparable format in place. + /// Encodes the bytes `src[..len]` in descending memory-comparable format in + /// place. /// /// Returns the number of bytes encoded. /// @@ -189,21 +195,25 @@ impl MemComparableByteCodec { encoded_len } - /// Decodes bytes in ascending memory-comparable format in the `src` into `dest`. + /// Decodes bytes in ascending memory-comparable format in the `src` into + /// `dest`. /// - /// If there are multiple encoded byte slices in `src`, only the first one will be decoded. + /// If there are multiple encoded byte slices in `src`, only the first one + /// will be decoded. /// - /// Returns `(read_bytes, written_bytes)` where `read_bytes` is the number of bytes read in - /// `src` and `written_bytes` is the number of bytes written in `dest`. + /// Returns `(read_bytes, written_bytes)` where `read_bytes` is the number + /// of bytes read in `src` and `written_bytes` is the number of bytes + /// written in `dest`. /// - /// Note that actual written data may be larger than `written_bytes`. Bytes more than - /// `written_bytes` are junk and should be ignored. + /// Note that actual written data may be larger than `written_bytes`. Bytes + /// more than `written_bytes` are junk and should be ignored. /// /// If `src == dest`, please use `try_decode_first_in_place`. /// /// # Panics /// - /// Panics if `dest.len() < src.len()`, although actual written data may be less. + /// Panics if `dest.len() < src.len()`, although actual written data may be + /// less. /// /// When there is a panic, `dest` may contain partially written data. 
/// @@ -223,21 +233,25 @@ impl MemComparableByteCodec { ) } - /// Decodes bytes in descending memory-comparable format in the `src` into `dest`. + /// Decodes bytes in descending memory-comparable format in the `src` into + /// `dest`. /// - /// If there are multiple encoded byte slices in `src`, only the first one will be decoded. + /// If there are multiple encoded byte slices in `src`, only the first one + /// will be decoded. /// - /// Returns `(read_bytes, written_bytes)` where `read_bytes` is the number of bytes read in - /// `src` and `written_bytes` is the number of bytes written in `dest`. + /// Returns `(read_bytes, written_bytes)` where `read_bytes` is the number + /// of bytes read in `src` and `written_bytes` is the number of bytes + /// written in `dest`. /// - /// Note that actual written data may be larger than `written_bytes`. Bytes more than - /// `written_bytes` are junk and should be ignored. + /// Note that actual written data may be larger than `written_bytes`. Bytes + /// more than `written_bytes` are junk and should be ignored. /// /// If `src == dest`, please use `try_decode_first_in_place_desc`. /// /// # Panics /// - /// Panics if `dest.len() < src.len()`, although actual written data may be less. + /// Panics if `dest.len() < src.len()`, although actual written data may be + /// less. /// /// When there is a panic, `dest` may contain partially written data. /// @@ -259,16 +273,17 @@ impl MemComparableByteCodec { Ok((read_bytes, written_bytes)) } - /// Decodes bytes in ascending memory-comparable format in place, i.e. decoded data will - /// overwrite the encoded data. + /// Decodes bytes in ascending memory-comparable format in place, i.e. + /// decoded data will overwrite the encoded data. /// - /// If there are multiple encoded byte slices in `buffer`, only the first one will be decoded. + /// If there are multiple encoded byte slices in `buffer`, only the first + /// one will be decoded. 
/// - /// Returns `(read_bytes, written_bytes)` where `read_bytes` is the number of bytes read - /// and `written_bytes` is the number of bytes written. + /// Returns `(read_bytes, written_bytes)` where `read_bytes` is the number + /// of bytes read and `written_bytes` is the number of bytes written. /// - /// Note that actual written data may be larger than `written_bytes`. Bytes more than - /// `written_bytes` are junk and should be ignored. + /// Note that actual written data may be larger than `written_bytes`. Bytes + /// more than `written_bytes` are junk and should be ignored. /// /// # Errors /// @@ -286,16 +301,17 @@ impl MemComparableByteCodec { ) } - /// Decodes bytes in descending memory-comparable format in place, i.e. decoded data will - /// overwrite the encoded data. + /// Decodes bytes in descending memory-comparable format in place, i.e. + /// decoded data will overwrite the encoded data. /// - /// If there are multiple encoded byte slices in `buffer`, only the first one will be decoded. + /// If there are multiple encoded byte slices in `buffer`, only the first + /// one will be decoded. /// - /// Returns `(read_bytes, written_bytes)` where `read_bytes` is the number of bytes read - /// and `written_bytes` is the number of bytes written. + /// Returns `(read_bytes, written_bytes)` where `read_bytes` is the number + /// of bytes read and `written_bytes` is the number of bytes written. /// - /// Note that actual written data may be larger than `written_bytes`. Bytes more than - /// `written_bytes` are junk and should be ignored. + /// Note that actual written data may be larger than `written_bytes`. Bytes + /// more than `written_bytes` are junk and should be ignored. /// /// # Errors /// @@ -323,10 +339,12 @@ impl MemComparableByteCodec { /// /// This function uses pointers to accept the scenario that `src == dest`. 
/// - /// This function also uses generics to specialize different code path for ascending and - /// descending decoding, which performs better than inlining a flag. + /// This function also uses generics to specialize different code path for + /// ascending and descending decoding, which performs better than + /// inlining a flag. /// - /// Please refer to `try_decode_first` for the meaning of return values, panics and errors. + /// Please refer to `try_decode_first` for the meaning of return values, + /// panics and errors. #[inline] fn try_decode_first_internal( mut src_ptr: *const u8, @@ -395,7 +413,8 @@ impl MemComparableByteCodec { trait MemComparableCodecHelper { const PADDING: [u8; MEMCMP_GROUP_SIZE]; - /// Given a raw padding size byte, interprets the padding size according to correct order. + /// Given a raw padding size byte, interprets the padding size according to + /// correct order. fn parse_padding_size(raw_marker: u8) -> usize; } @@ -476,8 +495,9 @@ impl MemComparableByteDecoder for T {} pub struct CompactByteCodec; impl CompactByteCodec { - /// Gets the length of the first encoded byte sequence in the given buffer, which is encoded in - /// the compact format. If the buffer is not complete, the length of buffer will be returned. + /// Gets the length of the first encoded byte sequence in the given buffer, + /// which is encoded in the compact format. If the buffer is not complete, + /// the length of buffer will be returned. 
pub fn get_first_encoded_len(encoded: &[u8]) -> usize { let result = NumberCodec::try_decode_var_i64(encoded); match result { @@ -739,7 +759,7 @@ mod tests { for (exp, encoded) in cases { let mut path = env::temp_dir(); path.push("read-compact-codec-file"); - fs::write(&path, &encoded).unwrap(); + fs::write(&path, encoded).unwrap(); let f = File::open(&path).unwrap(); let mut rdr = BufReader::new(f); let decoded = rdr.read_compact_bytes().unwrap(); @@ -951,7 +971,7 @@ mod tests { let result = panic_hook::recover_safe(move || { let _ = MemComparableByteCodec::encode_all(src.as_slice(), dest.as_mut_slice()); }); - assert!(result.is_err()); + result.unwrap_err(); let mut src_in_place = vec![0; dest_len]; let result = panic_hook::recover_safe(move || { @@ -960,7 +980,7 @@ mod tests { src_len, ); }); - assert!(result.is_err()); + result.unwrap_err(); } } @@ -968,8 +988,9 @@ mod tests { fn test_memcmp_try_decode_first() { use super::MEMCMP_GROUP_SIZE as N; - // We have ensured correctness in `test_memcmp_encode_all`, so we use `encode_all` to - // generate fixtures in different length, used for decoding. + // We have ensured correctness in `test_memcmp_encode_all`, so we use + // `encode_all` to generate fixtures in different length, used for + // decoding. 
fn do_test( is_desc: bool, @@ -1120,7 +1141,7 @@ mod tests { invalid_src.as_slice(), dest.as_mut_slice(), ); - assert!(result.is_err()); + result.unwrap_err(); } } @@ -1141,7 +1162,7 @@ mod tests { dest.as_mut_slice(), ); }); - assert!(result.is_err()); + result.unwrap_err(); } { let mut dest = vec![0; src.len()]; diff --git a/components/codec/src/error.rs b/components/codec/src/error.rs index 2483bd541de..09118824c6b 100644 --- a/components/codec/src/error.rs +++ b/components/codec/src/error.rs @@ -13,6 +13,8 @@ pub enum ErrorInner { #[error("Data padding is incorrect")] BadPadding, + #[error("key not found")] + KeyNotFound, } impl ErrorInner { @@ -27,8 +29,7 @@ impl ErrorInner { } } -// ====== The code below is to box the error so that the it can be as small as possible ====== - +// Box the error so that the it can be as small as possible #[derive(Debug, Error)] #[error(transparent)] pub struct Error(#[from] pub Box); @@ -57,6 +58,7 @@ impl ErrorCodeExt for Error { match self.0.as_ref() { ErrorInner::Io(_) => error_code::codec::IO, ErrorInner::BadPadding => error_code::codec::BAD_PADDING, + ErrorInner::KeyNotFound => error_code::codec::KEY_NOT_FOUND, } } } diff --git a/components/codec/src/number.rs b/components/codec/src/number.rs index 4cc114e7ea7..af47905334d 100644 --- a/components/codec/src/number.rs +++ b/components/codec/src/number.rs @@ -403,7 +403,8 @@ impl NumberCodec { } /// Encodes an unsigned 64 bit integer `v` to `buf` in VarInt encoding, - /// which is not memory-comparable. Returns the number of bytes that encoded. + /// which is not memory-comparable. Returns the number of bytes that + /// encoded. /// /// Note: VarInt encoding is slow, try avoid using it. /// @@ -429,13 +430,15 @@ impl NumberCodec { } /// Decodes an unsigned 64 bit integer from `buf` in VarInt encoding. - /// Returns decoded result and the number of bytes that successfully decoded. + /// Returns decoded result and the number of bytes that successfully + /// decoded. 
/// /// This function is more efficient when `buf.len() >= 10`. /// /// # Errors /// - /// Returns `Error::Io` if there is not enough space to decode the whole VarInt. + /// Returns `Error::Io` if there is not enough space to decode the whole + /// VarInt. pub fn try_decode_var_u64(buf: &[u8]) -> Result<(u64, usize)> { #[allow(clippy::cast_lossless)] unsafe { @@ -478,7 +481,8 @@ impl NumberCodec { } /// Encodes a signed 64 bit integer `v` to `buf` in VarInt encoding, - /// which is not memory-comparable. Returns the number of bytes that encoded. + /// which is not memory-comparable. Returns the number of bytes that + /// encoded. /// /// Note: VarInt encoding is slow, try avoid using it. /// @@ -495,13 +499,15 @@ impl NumberCodec { } /// Decodes a signed 64 bit integer from `buf` in VarInt encoding. - /// Returns decoded result and the number of bytes that successfully decoded. + /// Returns decoded result and the number of bytes that successfully + /// decoded. /// /// This function is more efficient when `buf.len() >= 10`. /// /// # Errors /// - /// Returns `Error::Io` if there is not enough space to decode the whole VarInt. + /// Returns `Error::Io` if there is not enough space to decode the whole + /// VarInt. #[inline] pub fn try_decode_var_i64(buf: &[u8]) -> Result<(i64, usize)> { let (uv, decoded_bytes) = Self::try_decode_var_u64(buf)?; @@ -514,8 +520,8 @@ impl NumberCodec { } } - /// Gets the length of the first encoded VarInt in the given buffer. If the buffer is not - /// complete, the length of buffer will be returned. + /// Gets the length of the first encoded VarInt in the given buffer. If the + /// buffer is not complete, the length of buffer will be returned. /// /// This function is more efficient when `buf.len() >= 10`. pub fn get_first_encoded_var_int_len(buf: &[u8]) -> usize { @@ -761,7 +767,8 @@ pub trait NumberDecoder: BufferReader { /// /// # Errors /// - /// Returns `Error::Io` if there is not enough space to decode the whole VarInt. 
+ /// Returns `Error::Io` if there is not enough space to decode the whole + /// VarInt. #[inline] fn read_var_u64(&mut self) -> Result { let (v, decoded_bytes) = { @@ -779,7 +786,8 @@ pub trait NumberDecoder: BufferReader { /// /// # Errors /// - /// Returns `Error::Io` if there is not enough space to decode the whole VarInt. + /// Returns `Error::Io` if there is not enough space to decode the whole + /// VarInt. #[inline] fn read_var_i64(&mut self) -> Result { let (v, decoded_bytes) = { @@ -1015,11 +1023,13 @@ pub trait NumberEncoder: BufferWriter { } /// Writes an unsigned 64 bit integer `v` in VarInt encoding, - /// which is not memory-comparable. Returns the number of bytes that encoded. + /// which is not memory-comparable. Returns the number of bytes that + /// encoded. /// /// Note: /// - VarInt encoding is slow, try avoid using it. - /// - The buffer must reserve 10 bytes for writing, although actual written bytes may be less. + /// - The buffer must reserve 10 bytes for writing, although actual written + /// bytes may be less. /// - The buffer will be advanced by actual written bytes. /// /// # Errors @@ -1039,11 +1049,13 @@ pub trait NumberEncoder: BufferWriter { } /// Writes a signed 64 bit integer `v` in VarInt encoding, - /// which is not memory-comparable. Returns the number of bytes that encoded. + /// which is not memory-comparable. Returns the number of bytes that + /// encoded. /// /// Note: /// - VarInt encoding is slow, try avoid using it. - /// - The buffer must reserve 10 bytes for writing, although actual written bytes may be less. + /// - The buffer must reserve 10 bytes for writing, although actual written + /// bytes may be less. /// - The buffer will be advanced by actual written bytes. /// /// # Errors @@ -1818,7 +1830,8 @@ mod benches { use crate::ErrorInner; - /// Encode u64 little endian using `NumberCodec` and store position in extra variable. 
+ /// Encode u64 little endian using `NumberCodec` and store position in extra + /// variable. #[bench] fn bench_encode_u64_le_number_codec(b: &mut test::Bencher) { let mut buf: [u8; 10] = [0; 10]; @@ -1834,7 +1847,8 @@ mod benches { }); } - /// Encode u64 little endian using `byteorder::WriteBytesExt` over a `Cursor<&mut [u8]>`. + /// Encode u64 little endian using `byteorder::WriteBytesExt` over a + /// `Cursor<&mut [u8]>`. #[bench] fn bench_encode_u64_le_byteorder(b: &mut test::Bencher) { use byteorder::WriteBytesExt; @@ -1852,7 +1866,8 @@ mod benches { }); } - /// Encode u64 little endian using `NumberEncoder` over a `Cursor<&mut [u8]>`. + /// Encode u64 little endian using `NumberEncoder` over a `Cursor<&mut + /// [u8]>`. #[bench] fn bench_encode_u64_le_buffer_encoder_slice(b: &mut test::Bencher) { use super::NumberEncoder; @@ -1881,7 +1896,8 @@ mod benches { }); } - /// Decode u64 little endian using `NumberCodec` and store position in extra variable. + /// Decode u64 little endian using `NumberCodec` and store position in extra + /// variable. #[bench] fn bench_decode_u64_le_number_codec(b: &mut test::Bencher) { let buf: [u8; 10] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]; @@ -1894,7 +1910,8 @@ mod benches { }); } - /// Decode u64 little endian using `NumberCodec` and store position via slice index. + /// Decode u64 little endian using `NumberCodec` and store position via + /// slice index. #[bench] fn bench_decode_u64_le_number_codec_over_slice(b: &mut test::Bencher) { let buf: Vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 0]; @@ -1907,7 +1924,8 @@ mod benches { }); } - /// Decode u64 little endian using `byteorder::ReadBytesExt` over a `Cursor<&[u8]>`. + /// Decode u64 little endian using `byteorder::ReadBytesExt` over a + /// `Cursor<&[u8]>`. 
#[bench] fn bench_decode_u64_le_byteorder(b: &mut test::Bencher) { use byteorder::ReadBytesExt; diff --git a/components/collections/Cargo.toml b/components/collections/Cargo.toml index a94cb0216cf..dca0afbc2c8 100644 --- a/components/collections/Cargo.toml +++ b/components/collections/Cargo.toml @@ -6,4 +6,4 @@ publish = false [dependencies] fxhash = "0.2.1" -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } diff --git a/components/concurrency_manager/Cargo.toml b/components/concurrency_manager/Cargo.toml index b6e382d7f14..b391c1d239a 100644 --- a/components/concurrency_manager/Cargo.toml +++ b/components/concurrency_manager/Cargo.toml @@ -5,24 +5,19 @@ publish = false version = "0.0.1" [dependencies] +crossbeam-skiplist = "0.1" fail = "0.5" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } parking_lot = "0.12" -tikv_util = { path = "../tikv_util", default-features = false } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["macros", "sync", "time"] } -txn_types = { path = "../txn_types", default-features = false } - -# FIXME: switch to the crates.io version after crossbeam-skiplist is released -[dependencies.crossbeam-skiplist] -git = "https://github.com/tikv/crossbeam.git" -branch = "tikv-5.0" -package = "crossbeam-skiplist" +txn_types = { workspace = true } [dev-dependencies] criterion = "0.3" futures = "0.3" rand = "0.8.3" -tikv_alloc = { path = "../tikv_alloc", features = ["jemalloc"] } +tikv_alloc = { workspace = true, features = ["jemalloc"] } [[bench]] name = "lock_table" diff --git a/components/concurrency_manager/benches/lock_table.rs b/components/concurrency_manager/benches/lock_table.rs index f2d4a9b92c9..52c9bea960a 100644 --- a/components/concurrency_manager/benches/lock_table.rs +++ b/components/concurrency_manager/benches/lock_table.rs @@ -1,7 +1,6 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
#![feature(test)] -#![feature(bench_black_box)] use std::{borrow::Cow, hint::black_box, mem::forget}; diff --git a/components/concurrency_manager/src/key_handle.rs b/components/concurrency_manager/src/key_handle.rs index f34b29b0f37..c7aebbc49e0 100644 --- a/components/concurrency_manager/src/key_handle.rs +++ b/components/concurrency_manager/src/key_handle.rs @@ -39,7 +39,7 @@ impl KeyHandle { } pub fn with_lock(&self, f: impl FnOnce(&Option) -> T) -> T { - f(&*self.lock_store.lock()) + f(&self.lock_store.lock()) } /// Set the LockTable that the KeyHandle is in. @@ -80,7 +80,7 @@ impl KeyHandleGuard { } pub fn with_lock(&self, f: impl FnOnce(&mut Option) -> T) -> T { - f(&mut *self.handle.lock_store.lock()) + f(&mut self.handle.lock_store.lock()) } pub(crate) fn handle(&self) -> &Arc { diff --git a/components/concurrency_manager/src/lib.rs b/components/concurrency_manager/src/lib.rs index 7865f43fc78..342f2139e08 100644 --- a/components/concurrency_manager/src/lib.rs +++ b/components/concurrency_manager/src/lib.rs @@ -58,8 +58,8 @@ impl ConcurrencyManager { } } - /// Acquires a mutex of the key and returns an RAII guard. When the guard goes - /// out of scope, the mutex will be unlocked. + /// Acquires a mutex of the key and returns an RAII guard. When the guard + /// goes out of scope, the mutex will be unlocked. /// /// The guard can be used to store Lock in the table. The stored lock /// is visible to `read_key_check` and `read_range_check`. @@ -67,8 +67,8 @@ impl ConcurrencyManager { self.lock_table.lock_key(key).await } - /// Acquires mutexes of the keys and returns the RAII guards. The order of the - /// guards is the same with the given keys. + /// Acquires mutexes of the keys and returns the RAII guards. The order of + /// the guards is the same with the given keys. /// /// The guards can be used to store Lock in the table. The stored lock /// is visible to `read_key_check` and `read_range_check`. 
@@ -137,7 +137,8 @@ mod tests { let concurrency_manager = ConcurrencyManager::new(1.into()); let keys: Vec<_> = [b"c", b"a", b"b"] .iter() - .map(|k| Key::from_raw(*k)) + .copied() + .map(|k| Key::from_raw(k)) .collect(); let guards = concurrency_manager.lock_keys(keys.iter()).await; for (key, guard) in keys.iter().zip(&guards) { @@ -181,8 +182,9 @@ mod tests { vec![20, 40, 30], vec![30, 20, 40], ]; - let keys: Vec<_> = vec![b"a", b"b", b"c"] - .into_iter() + let keys: Vec<_> = [b"a", b"b", b"c"] + .iter() + .copied() .map(|k| Key::from_raw(k)) .collect(); diff --git a/components/concurrency_manager/src/lock_table.rs b/components/concurrency_manager/src/lock_table.rs index 2b9e87f8f39..ad013a863a1 100644 --- a/components/concurrency_manager/src/lock_table.rs +++ b/components/concurrency_manager/src/lock_table.rs @@ -33,12 +33,13 @@ impl LockTable { let entry = self.0.get_or_insert(key.clone(), weak); if entry.value().ptr_eq(&weak2) { // If the weak ptr returned by `get_or_insert` equals to the one we inserted, - // `guard` refers to the KeyHandle in the lock table. Now, we can bind the handle - // to the table. + // `guard` refers to the KeyHandle in the lock table. Now, we can bind the + // handle to the table. - // SAFETY: The `table` field in `KeyHandle` is only accessed through the `set_table` - // or the `drop` method. It's impossible to have a concurrent `drop` here and `set_table` - // is only called here. So there is no concurrent access to the `table` field in `KeyHandle`. + // SAFETY: The `table` field in `KeyHandle` is only accessed through the + // `set_table` or the `drop` method. It's impossible to have a concurrent `drop` + // here and `set_table` is only called here. So there is no concurrent access to + // the `table` field in `KeyHandle`. 
unsafe { guard.handle().set_table(self.clone()); } @@ -56,7 +57,7 @@ impl LockTable { ) -> Result<(), E> { if let Some(lock_ref) = self.get(key) { return lock_ref.with_lock(|lock| { - if let Some(lock) = &*lock { + if let Some(lock) = lock { return check_fn(lock); } Ok(()) @@ -157,9 +158,9 @@ mod test { assert_eq!(counter.load(Ordering::SeqCst), 100); } - fn ts_check(lock: &Lock, ts: u64) -> Result<(), Lock> { + fn ts_check(lock: &Lock, ts: u64) -> Result<(), Box> { if lock.ts.into_inner() < ts { - Err(lock.clone()) + Err(Box::new(lock.clone())) } else { Ok(()) } @@ -171,7 +172,7 @@ mod test { let key_k = Key::from_raw(b"k"); // no lock found - assert!(lock_table.check_key(&key_k, |_| Err(())).is_ok()); + lock_table.check_key(&key_k, |_| Err(())).unwrap(); let lock = Lock::new( LockType::Lock, @@ -189,10 +190,13 @@ mod test { }); // lock passes check_fn - assert!(lock_table.check_key(&key_k, |l| ts_check(l, 5)).is_ok()); + lock_table.check_key(&key_k, |l| ts_check(l, 5)).unwrap(); // lock does not pass check_fn - assert_eq!(lock_table.check_key(&key_k, |l| ts_check(l, 20)), Err(lock)); + assert_eq!( + lock_table.check_key(&key_k, |l| ts_check(l, 20)), + Err(Box::new(lock)) + ); } #[tokio::test] @@ -230,33 +234,29 @@ mod test { }); // no lock found - assert!( - lock_table - .check_range( - Some(&Key::from_raw(b"m")), - Some(&Key::from_raw(b"n")), - |_, _| Err(()) - ) - .is_ok() - ); + lock_table + .check_range( + Some(&Key::from_raw(b"m")), + Some(&Key::from_raw(b"n")), + |_, _| Err(()), + ) + .unwrap(); // lock passes check_fn - assert!( - lock_table - .check_range(None, Some(&Key::from_raw(b"z")), |_, l| ts_check(l, 5)) - .is_ok() - ); + lock_table + .check_range(None, Some(&Key::from_raw(b"z")), |_, l| ts_check(l, 5)) + .unwrap(); // first lock does not pass check_fn assert_eq!( lock_table.check_range(Some(&Key::from_raw(b"a")), None, |_, l| ts_check(l, 25)), - Err(lock_k) + Err(Box::new(lock_k)) ); // first lock passes check_fn but the second does not 
assert_eq!( lock_table.check_range(None, None, |_, l| ts_check(l, 15)), - Err(lock_l) + Err(Box::new(lock_l)) ); } diff --git a/components/concurrency_manager/tests/memory_usage.rs b/components/concurrency_manager/tests/memory_usage.rs index b3b62ab5849..34ce9986a61 100644 --- a/components/concurrency_manager/tests/memory_usage.rs +++ b/components/concurrency_manager/tests/memory_usage.rs @@ -11,7 +11,8 @@ use rand::prelude::*; use txn_types::{Key, Lock, LockType}; // This test is heavy so we shouldn't run it daily. -// Run it with the following command (recommending release mode) and see the printed stats: +// Run it with the following command (recommending release mode) and see the +// printed stats: // // ``` // cargo test --package concurrency_manager --test memory_usage --features jemalloc --release -- test_memory_usage --exact --ignored --nocapture diff --git a/components/coprocessor_plugin_api/src/allocator.rs b/components/coprocessor_plugin_api/src/allocator.rs index 7d7140b6170..d8c2ab5062f 100644 --- a/components/coprocessor_plugin_api/src/allocator.rs +++ b/components/coprocessor_plugin_api/src/allocator.rs @@ -9,8 +9,8 @@ type DeallocFn = unsafe fn(*mut u8, Layout); /// Used to initialize the plugin's allocator. /// -/// A `HostAllocatorPtr` contains the relevant pointers to initialize the allocator of -/// to plugin. It will be passed from TiKV to the plugin. +/// A `HostAllocatorPtr` contains the relevant pointers to initialize the +/// allocator of to plugin. It will be passed from TiKV to the plugin. #[repr(C)] pub struct HostAllocatorPtr { pub alloc_fn: AllocFn, @@ -26,8 +26,9 @@ pub struct HostAllocator { impl HostAllocator { /// Creates a new [`HostAllocator`]. /// - /// The internal function pointers are initially `None`, so any attempt to allocate memory - /// before a call to [`set_allocator()`] will result in a panic. 
+ /// The internal function pointers are initially `None`, so any attempt to + /// allocate memory before a call to [`set_allocator()`] will result in + /// a panic. pub const fn new() -> Self { HostAllocator { alloc_fn: Atomic::new(None), @@ -35,9 +36,10 @@ impl HostAllocator { } } - /// Updates the function pointers of the [`HostAllocator`] to the given [`HostAllocatorPtr`]. - /// This function needs to be called before _any_ allocation with this allocator is performed, - /// because otherwise the [`HostAllocator`] is in an invalid state. + /// Updates the function pointers of the [`HostAllocator`] to the given + /// [`HostAllocatorPtr`]. This function needs to be called before _any_ + /// allocation with this allocator is performed, because otherwise the + /// [`HostAllocator`] is in an invalid state. pub fn set_allocator(&self, allocator: HostAllocatorPtr) { self.alloc_fn .store(Some(allocator.alloc_fn), Ordering::SeqCst); diff --git a/components/coprocessor_plugin_api/src/errors.rs b/components/coprocessor_plugin_api/src/errors.rs index 7085fa98edd..78961d60df8 100644 --- a/components/coprocessor_plugin_api/src/errors.rs +++ b/components/coprocessor_plugin_api/src/errors.rs @@ -9,9 +9,10 @@ pub type PluginResult = std::result::Result; /// Error returned by operations on [`RawStorage`]. /// -/// If a plugin wants to return a custom error, e.g. an error in the business logic, the plugin should -/// return an appropriately encoded error in [`RawResponse`]; in other words, plugins are responsible -/// for their error handling by themselves. +/// If a plugin wants to return a custom error, e.g. an error in the business +/// logic, the plugin should return an appropriately encoded error in +/// [`RawResponse`]; in other words, plugins are responsible for their error +/// handling by themselves. 
#[derive(Debug)] pub enum PluginError { KeyNotInRegion { @@ -23,11 +24,12 @@ pub enum PluginError { Timeout(Duration), Canceled, - /// Errors that can not be handled by a coprocessor plugin but should instead be returned to the - /// client. + /// Errors that can not be handled by a coprocessor plugin but should + /// instead be returned to the client. /// - /// If such an error appears, plugins can run some cleanup code and return early from the - /// request. The error will be passed to the client and the client might retry the request. + /// If such an error appears, plugins can run some cleanup code and return + /// early from the request. The error will be passed to the client and + /// the client might retry the request. Other(String, Box), } diff --git a/components/coprocessor_plugin_api/src/lib.rs b/components/coprocessor_plugin_api/src/lib.rs index 6e90ef83d2a..7f05840c072 100644 --- a/components/coprocessor_plugin_api/src/lib.rs +++ b/components/coprocessor_plugin_api/src/lib.rs @@ -1,26 +1,30 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -#![feature(const_fn_fn_ptr_basics)] -//! This crate contains some necessary types and traits for implementing a custom coprocessor plugin -//! for TiKV. +//! This crate contains some necessary types and traits for implementing a +//! custom coprocessor plugin for TiKV. //! -//! Most notably, if you want to write a custom plugin, your plugin needs to implement the -//! [`CoprocessorPlugin`] trait. The plugin then needs to be compiled to a `dylib`. +//! Most notably, if you want to write a custom plugin, your plugin needs to +//! implement the [`CoprocessorPlugin`] trait. The plugin then needs to be +//! compiled to a `dylib`. //! -//! > Note: Only `dylib` is supported, and not `cdylib` or `staticlib`, because the latter two are -//! > not able to use TiKV's allocator. See also the documentation in [`std::alloc`]. +//! > Note: Only `dylib` is supported, and not `cdylib` or `staticlib`, because +//! 
> the latter two are +//! > not able to use TiKV's allocator. See also the documentation in +//! > [`std::alloc`]. //! -//! In order to make your plugin callable, you need to declare a constructor with the -//! [`declare_plugin`] macro. +//! In order to make your plugin callable, you need to declare a constructor +//! with the [`declare_plugin`] macro. //! -//! A plugin can interact with the underlying storage via the [`RawStorage`] trait. +//! A plugin can interact with the underlying storage via the [`RawStorage`] +//! trait. //! //! # Example //! //! ```no_run -//! use coprocessor_plugin_api::*; //! use std::ops::Range; //! +//! use coprocessor_plugin_api::*; +//! //! #[derive(Default)] //! struct MyPlugin; //! diff --git a/components/coprocessor_plugin_api/src/plugin_api.rs b/components/coprocessor_plugin_api/src/plugin_api.rs index 31f87f3c822..f31c3f9bab2 100644 --- a/components/coprocessor_plugin_api/src/plugin_api.rs +++ b/components/coprocessor_plugin_api/src/plugin_api.rs @@ -7,31 +7,32 @@ use crate::PluginResult; /// Raw bytes of the request payload from the client to the coprocessor. pub type RawRequest = Vec; -/// The response from the coprocessor encoded as raw bytes that are sent back to the client. +/// The response from the coprocessor encoded as raw bytes that are sent back to +/// the client. pub type RawResponse = Vec; /// A plugin that allows users to execute arbitrary code on TiKV nodes. /// -/// If you want to implement a custom coprocessor plugin for TiKV, your plugin needs to implement -/// the [`CoprocessorPlugin`] trait. +/// If you want to implement a custom coprocessor plugin for TiKV, your plugin +/// needs to implement the [`CoprocessorPlugin`] trait. /// -/// Plugins can run setup code in their constructor and teardown code by implementing -/// [`std::ops::Drop`]. +/// Plugins can run setup code in their constructor and teardown code by +/// implementing [`std::ops::Drop`]. 
pub trait CoprocessorPlugin: Send + Sync { /// Handles a request to the coprocessor. /// - /// The data in the `request` parameter is exactly the same data that was passed with the - /// `RawCoprocessorRequest` in the `data` field. Each plugin is responsible to properly decode - /// the raw bytes by itself. - /// The same is true for the return parameter of this function. Upon successful completion, the - /// function should return a properly encoded result as raw bytes which is then sent back to - /// the client. + /// The data in the `request` parameter is exactly the same data that was + /// passed with the `RawCoprocessorRequest` in the `data` field. Each + /// plugin is responsible to properly decode the raw bytes by itself. + /// The same is true for the return parameter of this function. Upon + /// successful completion, the function should return a properly encoded + /// result as raw bytes which is then sent back to the client. /// - /// Most of the time, it's a good idea to use Protobuf for encoding/decoding, but in general you - /// can also send raw bytes. + /// Most of the time, it's a good idea to use Protobuf for + /// encoding/decoding, but in general you can also send raw bytes. /// - /// Plugins can read and write data from the underlying [`RawStorage`] via the `storage` - /// parameter. + /// Plugins can read and write data from the underlying [`RawStorage`] via + /// the `storage` parameter. fn on_raw_coprocessor_request( &self, ranges: Vec>, diff --git a/components/coprocessor_plugin_api/src/storage_api.rs b/components/coprocessor_plugin_api/src/storage_api.rs index 3adfa7c4a7e..08c09ca4a48 100644 --- a/components/coprocessor_plugin_api/src/storage_api.rs +++ b/components/coprocessor_plugin_api/src/storage_api.rs @@ -15,38 +15,44 @@ pub type KvPair = (Key, Value); /// Storage access for coprocessor plugins. /// -/// [`RawStorage`] allows coprocessor plugins to interact with TiKV storage on a low level. 
+/// [`RawStorage`] allows coprocessor plugins to interact with TiKV storage on a +/// low level. /// /// Batch operations should be preferred due to their better performance. #[async_trait(?Send)] pub trait RawStorage { - /// Retrieves the value for a given key from the storage on the current node. - /// Returns [`Option::None`] if the key is not present in the database. + /// Retrieves the value for a given key from the storage on the current + /// node. Returns [`Option::None`] if the key is not present in the + /// database. async fn get(&self, key: Key) -> PluginResult>; - /// Same as [`RawStorage::get()`], but retrieves values for multiple keys at once. + /// Same as [`RawStorage::get()`], but retrieves values for multiple keys at + /// once. async fn batch_get(&self, keys: Vec) -> PluginResult>; - /// Same as [`RawStorage::get()`], but accepts a `key_range` such that values for keys in - /// `[key_range.start, key_range.end)` are retrieved. + /// Same as [`RawStorage::get()`], but accepts a `key_range` such that + /// values for keys in `[key_range.start, key_range.end)` are retrieved. /// The upper bound of the `key_range` is exclusive. async fn scan(&self, key_range: Range) -> PluginResult>; /// Inserts a new key-value pair into the storage on the current node. async fn put(&self, key: Key, value: Value) -> PluginResult<()>; - /// Same as [`RawStorage::put()`], but inserts multiple key-value pairs at once. + /// Same as [`RawStorage::put()`], but inserts multiple key-value pairs at + /// once. async fn batch_put(&self, kv_pairs: Vec) -> PluginResult<()>; - /// Deletes a key-value pair from the storage on the current node given a `key`. - /// Returns [`Result::Ok]` if the key was successfully deleted. + /// Deletes a key-value pair from the storage on the current node given a + /// `key`. Returns [`Result::Ok]` if the key was successfully deleted. 
async fn delete(&self, key: Key) -> PluginResult<()>; - /// Same as [`RawStorage::delete()`], but deletes multiple key-value pairs at once. + /// Same as [`RawStorage::delete()`], but deletes multiple key-value pairs + /// at once. async fn batch_delete(&self, keys: Vec) -> PluginResult<()>; - /// Same as [`RawStorage::delete()`], but deletes multiple key-values pairs at once - /// given a `key_range`. All records with keys in `[key_range.start, key_range.end)` - /// will be deleted. The upper bound of the `key_range` is exclusive. + /// Same as [`RawStorage::delete()`], but deletes multiple key-values pairs + /// at once given a `key_range`. All records with keys in + /// `[key_range.start, key_range.end)` will be deleted. The upper bound + /// of the `key_range` is exclusive. async fn delete_range(&self, key_range: Range) -> PluginResult<()>; } diff --git a/components/coprocessor_plugin_api/src/util.rs b/components/coprocessor_plugin_api/src/util.rs index fd15a26a1c8..606082c0c4e 100644 --- a/components/coprocessor_plugin_api/src/util.rs +++ b/components/coprocessor_plugin_api/src/util.rs @@ -2,33 +2,40 @@ use super::{allocator::HostAllocatorPtr, plugin_api::CoprocessorPlugin}; -/// Name of the exported constructor with signature [`PluginConstructorSignature`] for the plugin. +/// Name of the exported constructor with signature +/// [`PluginConstructorSignature`] for the plugin. pub static PLUGIN_CONSTRUCTOR_SYMBOL: &[u8] = b"_plugin_create"; -/// Name of the exported function with signature [`PluginGetBuildInfoSignature`] to get build -/// information about the plugin. +/// Name of the exported function with signature [`PluginGetBuildInfoSignature`] +/// to get build information about the plugin. pub static PLUGIN_GET_BUILD_INFO_SYMBOL: &[u8] = b"_plugin_get_build_info"; -/// Name of the exported function with signature [`PluginGetPluginInfoSignature`] to get some -/// information about the plugin. 
+/// Name of the exported function with signature +/// [`PluginGetPluginInfoSignature`] to get some information about the plugin. pub static PLUGIN_GET_PLUGIN_INFO_SYMBOL: &[u8] = b"_plugin_get_plugin_info"; -/// Type signature of the exported function with symbol [`PLUGIN_CONSTRUCTOR_SYMBOL`]. +/// Type signature of the exported function with symbol +/// [`PLUGIN_CONSTRUCTOR_SYMBOL`]. pub type PluginConstructorSignature = unsafe fn(host_allocator: HostAllocatorPtr) -> *mut dyn CoprocessorPlugin; -/// Type signature of the exported function with symbol [`PLUGIN_GET_BUILD_INFO_SYMBOL`]. +/// Type signature of the exported function with symbol +/// [`PLUGIN_GET_BUILD_INFO_SYMBOL`]. pub type PluginGetBuildInfoSignature = extern "C" fn() -> BuildInfo; -/// Type signature of the exported function with symbol [`PLUGIN_GET_PLUGIN_INFO_SYMBOL`]. +/// Type signature of the exported function with symbol +/// [`PLUGIN_GET_PLUGIN_INFO_SYMBOL`]. pub type PluginGetPluginInfoSignature = extern "C" fn() -> PluginInfo; -/// Automatically collected build information about the plugin that is exposed from the library. +/// Automatically collected build information about the plugin that is exposed +/// from the library. /// -/// Will be automatically created when using [`declare_plugin!(...)`](declare_plugin) and will be -/// used by TiKV when a plugin is loaded to determine whether there are compilation mismatches. +/// Will be automatically created when using +/// [`declare_plugin!(...)`](declare_plugin) and will be used by TiKV when a +/// plugin is loaded to determine whether there are compilation mismatches. #[repr(C)] -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq)] pub struct BuildInfo { - /// Version of the [`coprocessor_plugin_api`](crate) crate that was used to compile this plugin. + /// Version of the [`coprocessor_plugin_api`](crate) crate that was used to + /// compile this plugin. 
pub api_version: &'static str, /// Target triple for which platform this plugin was compiled. pub target: &'static str, @@ -48,7 +55,7 @@ impl BuildInfo { /// Information about the plugin, like its name and version. #[repr(C)] -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq)] pub struct PluginInfo { /// The name of the plugin. pub name: &'static str, @@ -59,11 +66,15 @@ pub struct PluginInfo { /// Declare a plugin for the library so that it can be loaded by TiKV. /// /// The macro has three different versions: -/// * `declare_plugin!(plugin_name, plugin_version, plugin_ctor)` which gives you full control. -/// * `declare_plugin!(plugin_name, plugin_ctor)` automatically fetches the version from `Cargo.toml`. -/// * `declare_plugin!(plugin_ctor)` automatically fetches plugin name and version from `Cargo.toml`. +/// * `declare_plugin!(plugin_name, plugin_version, plugin_ctor)` which gives +/// you full control. +/// * `declare_plugin!(plugin_name, plugin_ctor)` automatically fetches the +/// version from `Cargo.toml`. +/// * `declare_plugin!(plugin_ctor)` automatically fetches plugin name and +/// version from `Cargo.toml`. /// -/// The types of `plugin_name` and `plugin_version` have to be `&'static str` literals. +/// The types of `plugin_name` and `plugin_version` have to be `&'static str` +/// literals. /// /// # Notes /// This works by automatically generating an `extern "C"` function with a @@ -119,8 +130,8 @@ macro_rules! declare_plugin { /// Transforms the name of a package into the name of the compiled library. /// -/// The result of the function can be used to correctly locate build artifacts of `dylib` on -/// different platforms. +/// The result of the function can be used to correctly locate build artifacts +/// of `dylib` on different platforms. 
/// /// The name of the `dylib` is /// * `lib.so` on Linux diff --git a/components/encryption/Cargo.toml b/components/encryption/Cargo.toml index 80ad86b3b75..deac60223a7 100644 --- a/components/encryption/Cargo.toml +++ b/components/encryption/Cargo.toml @@ -14,32 +14,33 @@ bytes = "1.0" crc32fast = "1.2" crossbeam = "0.8" derive_more = "0.99.3" -engine_traits = { path = "../engine_traits", default-features = false } -error_code = { path = "../error_code", default-features = false } +engine_traits = { workspace = true } +error_code = { workspace = true } fail = "0.5" -file_system = { path = "../file_system", default-features = false } +file_system = { workspace = true } futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["std", "io"] } hex = "0.4.2" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" -online_config = { path = "../online_config" } +online_config = { workspace = true } openssl = "0.10" prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } rand = "0.8" serde = "1.0" serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +slog = { workspace = true } # better to not use slog-global, but pass in the logger -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog-global = { workspace = true } thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time", "rt"] } +walkdir = "2" [dev-dependencies] matches = "0.1.8" tempfile = "3.1" -test_util = { path = "../test_util", default-features = false } +test_util = { workspace = true } toml = "0.5" diff --git 
a/components/encryption/export/Cargo.toml b/components/encryption/export/Cargo.toml index 2fe0b0cb55a..164ea312e5d 100644 --- a/components/encryption/export/Cargo.toml +++ b/components/encryption/export/Cargo.toml @@ -12,20 +12,20 @@ cloud-azure = [] [dependencies] async-trait = "0.1" -aws = { path = "../../cloud/aws", optional = true, default-features = false } -cloud = { path = "../../cloud/", default-features = false } +aws = { workspace = true, optional = true } +cloud = { workspace = true } derive_more = "0.99.3" -encryption = { path = "../", default-features = false } -error_code = { path = "../../error_code", default-features = false } -file_system = { path = "../../file_system", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +encryption = { workspace = true } +error_code = { workspace = true } +file_system = { workspace = true } +kvproto = { workspace = true } openssl = "0.10" protobuf = { version = "2.8", features = ["bytes"] } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +slog = { workspace = true } # better to not use slog-global, but pass in the logger -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../../tikv_util", default-features = false } +slog-global = { workspace = true } +tikv_util = { workspace = true } [dev-dependencies] rust-ini = "0.14.0" -structopt = "0.3" \ No newline at end of file +structopt = "0.3" diff --git a/components/encryption/export/examples/ecli.rs b/components/encryption/export/examples/ecli.rs index d9d2bcb8098..ed2247cc77c 100644 --- a/components/encryption/export/examples/ecli.rs +++ b/components/encryption/export/examples/ecli.rs @@ -3,7 +3,7 @@ use std::io::{Read, Write}; pub use cloud::kms::Config as CloudConfig; -#[cfg(feature = "aws")] +#[cfg(feature = "cloud-aws")] use encryption_export::{create_cloud_backend, KmsConfig}; 
use encryption_export::{Backend, Error, Result}; use file_system::{File, OpenOptions}; diff --git a/components/encryption/export/src/lib.rs b/components/encryption/export/src/lib.rs index 5b84a4a0c34..be86db83082 100644 --- a/components/encryption/export/src/lib.rs +++ b/components/encryption/export/src/lib.rs @@ -14,8 +14,9 @@ use derive_more::Deref; #[cfg(feature = "cloud-aws")] pub use encryption::KmsBackend; pub use encryption::{ - encryption_method_from_db_encryption_method, Backend, DataKeyManager, DataKeyManagerArgs, - DecrypterReader, EncryptionConfig, Error, FileConfig, Iv, KmsConfig, MasterKeyConfig, Result, + clean_up_dir, clean_up_trash, from_engine_encryption_method, trash_dir_all, Backend, + DataKeyManager, DataKeyManagerArgs, DecrypterReader, EncryptionConfig, Error, FileConfig, Iv, + KmsConfig, MasterKeyConfig, Result, }; use encryption::{ DataKeyPair, EncryptedKey, FileBackend, KmsProvider, PlainKey, PlaintextBackend, @@ -82,7 +83,8 @@ fn create_backend_inner(config: &MasterKeyConfig) -> Result> { }) } -// CloudKMS adapts the KmsProvider definition from the cloud crate to that of the encryption crate +// CloudKMS adapts the KmsProvider definition from the cloud crate to that of +// the encryption crate #[derive(Debug, Deref)] struct CloudKms(Box); diff --git a/components/encryption/src/config.rs b/components/encryption/src/config.rs index 8cb779f1cdc..3fff9064f58 100644 --- a/components/encryption/src/config.rs +++ b/components/encryption/src/config.rs @@ -39,14 +39,14 @@ impl Default for EncryptionConfig { } } -#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq)] #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct FileConfig { pub path: String, } -#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, Eq, OnlineConfig)] +#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, OnlineConfig)] #[serde(default)] 
#[serde(rename_all = "kebab-case")] pub struct KmsConfig { @@ -68,7 +68,7 @@ impl KmsConfig { } } -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] #[serde(rename_all = "kebab-case", tag = "type")] pub enum MasterKeyConfig { // Store encryption metadata as plaintext. Data still get encrypted. Not allowed to use if @@ -111,6 +111,7 @@ mod encryption_method_serde { const AES128_CTR: &str = "aes128-ctr"; const AES192_CTR: &str = "aes192-ctr"; const AES256_CTR: &str = "aes256-ctr"; + const SM4_CTR: &str = "sm4-ctr"; #[allow(clippy::trivially_copy_pass_by_ref)] pub fn serialize(method: &EncryptionMethod, serializer: S) -> Result @@ -123,6 +124,7 @@ mod encryption_method_serde { EncryptionMethod::Aes128Ctr => serializer.serialize_str(AES128_CTR), EncryptionMethod::Aes192Ctr => serializer.serialize_str(AES192_CTR), EncryptionMethod::Aes256Ctr => serializer.serialize_str(AES256_CTR), + EncryptionMethod::Sm4Ctr => serializer.serialize_str(SM4_CTR), } } @@ -149,6 +151,7 @@ mod encryption_method_serde { AES128_CTR => Ok(EncryptionMethod::Aes128Ctr), AES192_CTR => Ok(EncryptionMethod::Aes192Ctr), AES256_CTR => Ok(EncryptionMethod::Aes256Ctr), + SM4_CTR => Ok(EncryptionMethod::Sm4Ctr), _ => Err(E::invalid_value(Unexpected::Str(value), &self)), } } diff --git a/components/encryption/src/crypter.rs b/components/encryption/src/crypter.rs index c17560d4a38..7379b8a32a3 100644 --- a/components/encryption/src/crypter.rs +++ b/components/encryption/src/crypter.rs @@ -2,7 +2,7 @@ use byteorder::{BigEndian, ByteOrder}; use derive_more::Deref; -use engine_traits::EncryptionMethod as DBEncryptionMethod; +use engine_traits::EncryptionMethod as EtEncryptionMethod; use kvproto::encryptionpb::EncryptionMethod; use openssl::symm::{self, Cipher as OCipher}; use rand::{rngs::OsRng, RngCore}; @@ -10,36 +10,35 @@ use tikv_util::{box_err, impl_display_as_debug}; use crate::{Error, Result}; -pub fn 
encryption_method_to_db_encryption_method(method: EncryptionMethod) -> DBEncryptionMethod { +pub fn to_engine_encryption_method(method: EncryptionMethod) -> EtEncryptionMethod { match method { - EncryptionMethod::Plaintext => DBEncryptionMethod::Plaintext, - EncryptionMethod::Aes128Ctr => DBEncryptionMethod::Aes128Ctr, - EncryptionMethod::Aes192Ctr => DBEncryptionMethod::Aes192Ctr, - EncryptionMethod::Aes256Ctr => DBEncryptionMethod::Aes256Ctr, - EncryptionMethod::Unknown => DBEncryptionMethod::Unknown, + EncryptionMethod::Plaintext => EtEncryptionMethod::Plaintext, + EncryptionMethod::Aes128Ctr => EtEncryptionMethod::Aes128Ctr, + EncryptionMethod::Aes192Ctr => EtEncryptionMethod::Aes192Ctr, + EncryptionMethod::Aes256Ctr => EtEncryptionMethod::Aes256Ctr, + EncryptionMethod::Sm4Ctr => EtEncryptionMethod::Sm4Ctr, + EncryptionMethod::Unknown => EtEncryptionMethod::Unknown, } } -pub fn encryption_method_from_db_encryption_method(method: DBEncryptionMethod) -> EncryptionMethod { +pub fn from_engine_encryption_method(method: EtEncryptionMethod) -> EncryptionMethod { match method { - DBEncryptionMethod::Plaintext => EncryptionMethod::Plaintext, - DBEncryptionMethod::Aes128Ctr => EncryptionMethod::Aes128Ctr, - DBEncryptionMethod::Aes192Ctr => EncryptionMethod::Aes192Ctr, - DBEncryptionMethod::Aes256Ctr => EncryptionMethod::Aes256Ctr, - DBEncryptionMethod::Unknown => EncryptionMethod::Unknown, + EtEncryptionMethod::Plaintext => EncryptionMethod::Plaintext, + EtEncryptionMethod::Aes128Ctr => EncryptionMethod::Aes128Ctr, + EtEncryptionMethod::Aes192Ctr => EncryptionMethod::Aes192Ctr, + EtEncryptionMethod::Aes256Ctr => EncryptionMethod::Aes256Ctr, + EtEncryptionMethod::Sm4Ctr => EncryptionMethod::Sm4Ctr, + EtEncryptionMethod::Unknown => EncryptionMethod::Unknown, } } -pub fn compat(method: EncryptionMethod) -> EncryptionMethod { - method -} - pub fn get_method_key_length(method: EncryptionMethod) -> usize { match method { EncryptionMethod::Plaintext => 0, 
EncryptionMethod::Aes128Ctr => 16, EncryptionMethod::Aes192Ctr => 24, EncryptionMethod::Aes256Ctr => 32, + EncryptionMethod::Sm4Ctr => 16, unknown => panic!("bad EncryptionMethod {:?}", unknown), } } @@ -53,6 +52,7 @@ const CTR_IV_16: usize = 16; pub enum Iv { Gcm([u8; GCM_IV_12]), Ctr([u8; CTR_IV_16]), + Empty, } impl Iv { @@ -91,6 +91,7 @@ impl Iv { match self { Iv::Ctr(iv) => iv, Iv::Gcm(iv) => iv, + Iv::Empty => &[], } } @@ -102,6 +103,7 @@ impl Iv { Ok(()) } Iv::Gcm(_) => Err(box_err!("offset addition is not supported for GCM mode")), + Iv::Empty => Err(box_err!("empty Iv")), } } } @@ -147,7 +149,7 @@ impl<'k> AesGcmCrypter<'k> { cipher, &self.key.0, Some(self.iv.as_slice()), - &[], /* AAD */ + &[], // AAD pt, &mut tag.0, )?; @@ -160,7 +162,7 @@ impl<'k> AesGcmCrypter<'k> { cipher, &self.key.0, Some(self.iv.as_slice()), - &[], /* AAD */ + &[], // AAD ct, &tag.0, )?; @@ -273,7 +275,7 @@ mod tests { let crypter = AesGcmCrypter::new(&key, iv); let (ciphertext, gcm_tag) = crypter.encrypt(&pt).unwrap(); assert_eq!(ciphertext, ct, "{}", hex::encode(&ciphertext)); - assert_eq!(gcm_tag.0.to_vec(), tag, "{}", hex::encode(&gcm_tag.0)); + assert_eq!(gcm_tag.0.to_vec(), tag, "{}", hex::encode(gcm_tag.0)); let plaintext = crypter.decrypt(&ct, gcm_tag).unwrap(); assert_eq!(plaintext, pt, "{}", hex::encode(&plaintext)); diff --git a/components/encryption/src/encrypted_file/header.rs b/components/encryption/src/encrypted_file/header.rs index 1456f451f62..420b3076adb 100644 --- a/components/encryption/src/encrypted_file/header.rs +++ b/components/encryption/src/encrypted_file/header.rs @@ -7,7 +7,7 @@ use tikv_util::box_err; use crate::Result; -#[derive(Clone, Copy, PartialEq, Eq, Debug)] +#[derive(Clone, Copy, PartialEq, Debug)] pub enum Version { // The content only contains the encrypted part. 
V1 = 1, @@ -39,7 +39,7 @@ impl Version { /// | | Reserved (3 bytes) /// | Version (1 bytes) /// ``` -#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Debug, PartialEq, Clone)] pub struct Header { version: Version, crc32: u32, diff --git a/components/encryption/src/encrypted_file/mod.rs b/components/encryption/src/encrypted_file/mod.rs index e52cba85afc..9c76b857c70 100644 --- a/components/encryption/src/encrypted_file/mod.rs +++ b/components/encryption/src/encrypted_file/mod.rs @@ -34,8 +34,8 @@ impl<'a> EncryptedFile<'a> { EncryptedFile { base, name } } - /// Read and decrypt the file. Caller need to handle the NotFound io error in case file not - /// exists. + /// Read and decrypt the file. Caller need to handle the NotFound io error + /// in case file not exists. pub fn read(&self, master_key: &dyn Backend) -> Result> { let start = Instant::now(); let res = OpenOptions::new() @@ -64,7 +64,7 @@ impl<'a> EncryptedFile<'a> { let start = Instant::now(); // Write to a tmp file. // TODO what if a tmp file already exists? - let origin_path = self.base.join(&self.name); + let origin_path = self.base.join(self.name); let mut tmp_path = origin_path.clone(); tmp_path.set_extension(format!("{}.{}", thread_rng().next_u64(), TMP_FILE_SUFFIX)); let mut tmp_file = OpenOptions::new() @@ -92,7 +92,7 @@ impl<'a> EncryptedFile<'a> { // Replace old file with the tmp file aomticlly. 
rename(tmp_path, origin_path)?; - let base_dir = File::open(&self.base)?; + let base_dir = File::open(self.base)?; base_dir.sync_all()?; ENCRYPT_DECRPTION_FILE_HISTOGRAM @@ -127,7 +127,6 @@ mod tests { let content = b"test content"; file.write(content, &PlaintextBackend::default()).unwrap(); - drop(file); let file = EncryptedFile::new(tmp.path(), "encrypted"); assert_eq!(file.read(&PlaintextBackend::default()).unwrap(), content); diff --git a/components/encryption/src/file_dict_file.rs b/components/encryption/src/file_dict_file.rs index e2dedfe534e..cfa945a5cd7 100644 --- a/components/encryption/src/file_dict_file.rs +++ b/components/encryption/src/file_dict_file.rs @@ -120,7 +120,8 @@ impl FileDictionaryFile { self.base.join(&self.name) } - /// Rewrite the log file to reduce file size and reduce the time of next recovery. + /// Rewrite the log file to reduce file size and reduce the time of next + /// recovery. fn rewrite(&mut self) -> Result<()> { let file_dict_bytes = self.file_dict.write_to_bytes()?; if self.enable_log { @@ -216,10 +217,11 @@ impl FileDictionaryFile { Ok(file_dict) } - /// Append an insert operation to the log file. + /// Append an insert operation to the log file. The record is guaranteed to + /// be persisted if `sync` is set. /// /// Warning: `self.write(file_dict)` must be called before. 
- pub fn insert(&mut self, name: &str, info: &FileInfo) -> Result<()> { + pub fn insert(&mut self, name: &str, info: &FileInfo, sync: bool) -> Result<()> { self.file_dict.files.insert(name.to_owned(), info.clone()); if self.enable_log { let file = self.append_file.as_mut().unwrap(); @@ -230,12 +232,16 @@ impl FileDictionaryFile { let truncate_num: usize = truncate_num.map_or(0, |c| c.parse().unwrap()); bytes.truncate(truncate_num); file.write_all(&bytes)?; - file.sync_all()?; + if sync { + file.sync_all()?; + } Ok(()) }); file.write_all(&bytes)?; - file.sync_all()?; + if sync { + file.sync_all()?; + } self.file_size += bytes.len(); self.check_compact()?; @@ -249,13 +255,15 @@ impl FileDictionaryFile { /// Append a remove operation to the log file. /// /// Warning: `self.write(file_dict)` must be called before. - pub fn remove(&mut self, name: &str) -> Result<()> { + pub fn remove(&mut self, name: &str, sync: bool) -> Result<()> { self.file_dict.files.remove(name); if self.enable_log { let file = self.append_file.as_mut().unwrap(); let bytes = Self::convert_record_to_bytes(name, LogRecord::Remove)?; file.write_all(&bytes)?; - file.sync_all()?; + if sync { + file.sync_all()?; + } self.removed += 1; self.file_size += bytes.len(); @@ -267,6 +275,13 @@ impl FileDictionaryFile { Ok(()) } + pub fn sync(&mut self) -> Result<()> { + if self.enable_log { + self.append_file.as_mut().unwrap().sync_all()?; + } + Ok(()) + } + /// This function needs to be called after each append operation to check /// if compact is needed. 
fn check_compact(&mut self) -> Result<()> { @@ -389,7 +404,7 @@ mod tests { use kvproto::encryptionpb::EncryptionMethod; use super::*; - use crate::{crypter::compat, encrypted_file::EncryptedFile, Error}; + use crate::{encrypted_file::EncryptedFile, Error}; fn test_file_dict_file_normal(enable_log: bool) { let tempdir = tempfile::tempdir().unwrap(); @@ -397,7 +412,7 @@ mod tests { tempdir.path(), "test_file_dict_file", enable_log, - 2, /*file_rewrite_threshold*/ + 2, // file_rewrite_threshold ) .unwrap(); let info1 = create_file_info(1, EncryptionMethod::Aes256Ctr); @@ -406,9 +421,9 @@ mod tests { let info4 = create_file_info(4, EncryptionMethod::Aes128Ctr); let info5 = create_file_info(3, EncryptionMethod::Aes128Ctr); - file_dict_file.insert("info1", &info1).unwrap(); - file_dict_file.insert("info2", &info2).unwrap(); - file_dict_file.insert("info3", &info3).unwrap(); + file_dict_file.insert("info1", &info1, true).unwrap(); + file_dict_file.insert("info2", &info2, true).unwrap(); + file_dict_file.insert("info3", &info3, true).unwrap(); let file_dict = file_dict_file.recovery().unwrap(); @@ -417,9 +432,9 @@ mod tests { assert_eq!(*file_dict.files.get("info3").unwrap(), info3); assert_eq!(file_dict.files.len(), 3); - file_dict_file.remove("info2").unwrap(); - file_dict_file.remove("info1").unwrap(); - file_dict_file.insert("info2", &info4).unwrap(); + file_dict_file.remove("info2", true).unwrap(); + file_dict_file.remove("info1", true).unwrap(); + file_dict_file.insert("info2", &info4, true).unwrap(); let file_dict = file_dict_file.recovery().unwrap(); assert_eq!(file_dict.files.get("info1"), None); @@ -427,8 +442,8 @@ mod tests { assert_eq!(*file_dict.files.get("info3").unwrap(), info3); assert_eq!(file_dict.files.len(), 2); - file_dict_file.insert("info5", &info5).unwrap(); - file_dict_file.remove("info3").unwrap(); + file_dict_file.insert("info5", &info5, true).unwrap(); + file_dict_file.remove("info3", true).unwrap(); let file_dict = 
file_dict_file.recovery().unwrap(); assert_eq!(file_dict.files.get("info1"), None); @@ -440,12 +455,12 @@ mod tests { #[test] fn test_file_dict_file_normal_v1() { - test_file_dict_file_normal(false /*enable_log*/); + test_file_dict_file_normal(false /* enable_log */); } #[test] fn test_file_dict_file_normal_v2() { - test_file_dict_file_normal(true /*enable_log*/); + test_file_dict_file_normal(true /* enable_log */); } fn test_file_dict_file_existed(enable_log: bool) { @@ -454,19 +469,19 @@ mod tests { tempdir.path(), "test_file_dict_file", enable_log, - 2, /*file_rewrite_threshold*/ + 2, // file_rewrite_threshold ) .unwrap(); let info = create_file_info(1, EncryptionMethod::Aes256Ctr); - file_dict_file.insert("info", &info).unwrap(); + file_dict_file.insert("info", &info, true).unwrap(); let (_, file_dict) = FileDictionaryFile::open( tempdir.path(), "test_file_dict_file", - true, /*enable_log*/ - 2, /*file_rewrite_threshold*/ - false, /*skip_rewrite*/ + true, // enable_log + 2, // file_rewrite_threshold + false, // skip_rewrite ) .unwrap(); assert_eq!(*file_dict.files.get("info").unwrap(), info); @@ -474,12 +489,12 @@ mod tests { #[test] fn test_file_dict_file_existed_v1() { - test_file_dict_file_existed(false /*enable_log*/); + test_file_dict_file_existed(false /* enable_log */); } #[test] fn test_file_dict_file_existed_v2() { - test_file_dict_file_existed(true /*enable_log*/); + test_file_dict_file_existed(true /* enable_log */); } fn test_file_dict_file_not_existed(enable_log: bool) { @@ -488,20 +503,20 @@ mod tests { tempdir.path(), "test_file_dict_file", enable_log, - 2, /*file_rewrite_threshold*/ - false, /*skip_rewrite*/ + 2, // file_rewrite_threshold + false, // skip_rewrite ); assert!(matches!(ret, Err(Error::Io(_)))); } #[test] fn test_file_dict_file_not_existed_v1() { - test_file_dict_file_not_existed(false /*enable_log*/); + test_file_dict_file_not_existed(false /* enable_log */); } #[test] fn test_file_dict_file_not_existed_v2() { - 
test_file_dict_file_not_existed(true /*enable_log*/); + test_file_dict_file_not_existed(true /* enable_log */); } #[test] @@ -524,9 +539,9 @@ mod tests { let (_, file_dict_read) = FileDictionaryFile::open( tempdir.path(), "test_file_dict_file", - true, /*enable_log*/ - 2, /*file_rewrite_threshold*/ - false, /*skip_rewrite*/ + true, // enable_log + 2, // file_rewrite_threshold + false, // skip_rewrite ) .unwrap(); assert_eq!(file_dict, file_dict_read); @@ -544,19 +559,19 @@ mod tests { let mut file_dict = FileDictionaryFile::new( tempdir.path(), "test_file_dict_file", - true, /*enable_log*/ - 1000, /*file_rewrite_threshold*/ + true, // enable_log + 1000, // file_rewrite_threshold ) .unwrap(); - file_dict.insert("f1", &info1).unwrap(); - file_dict.insert("f2", &info2).unwrap(); - file_dict.insert("f3", &info3).unwrap(); + file_dict.insert("f1", &info1, true).unwrap(); + file_dict.insert("f2", &info2, true).unwrap(); + file_dict.insert("f3", &info3, true).unwrap(); - file_dict.insert("f4", &info4).unwrap(); - file_dict.remove("f3").unwrap(); + file_dict.insert("f4", &info4, true).unwrap(); + file_dict.remove("f3", true).unwrap(); - file_dict.remove("f2").unwrap(); + file_dict.remove("f2", true).unwrap(); } // Try open as v1 file. Should fail. 
{ @@ -571,9 +586,9 @@ mod tests { let (_, file_dict) = FileDictionaryFile::open( tempdir.path(), "test_file_dict_file", - true, /*enable_log*/ - 1000, /*file_rewrite_threshold*/ - true, /*skip_rewrite*/ + true, // enable_log + 1000, // file_rewrite_threshold + true, // skip_rewrite ) .unwrap(); assert_eq!(*file_dict.files.get("f1").unwrap(), info1); @@ -586,9 +601,9 @@ mod tests { let (_, file_dict) = FileDictionaryFile::open( tempdir.path(), "test_file_dict_file", - false, /*enable_log*/ - 1000, /*file_rewrite_threshold*/ - false, /*skip_rewrite*/ + false, // enable_log + 1000, // file_rewrite_threshold + false, // skip_rewrite ) .unwrap(); assert_eq!(*file_dict.files.get("f1").unwrap(), info1); @@ -599,10 +614,9 @@ mod tests { // Try open as v1 file. Should success. { let file_dict_file = EncryptedFile::new(tempdir.path(), "test_file_dict_file"); - let file_bytes = file_dict_file.read(&PlaintextBackend::default()); - assert!(file_bytes.is_ok()); + let file_bytes = file_dict_file.read(&PlaintextBackend::default()).unwrap(); let mut file_dict = FileDictionary::default(); - file_dict.merge_from_bytes(&file_bytes.unwrap()).unwrap(); + file_dict.merge_from_bytes(&file_bytes).unwrap(); assert_eq!(*file_dict.files.get("f1").unwrap(), info1); assert_eq!(file_dict.files.get("f2"), None); assert_eq!(file_dict.files.get("f3"), None); @@ -613,7 +627,7 @@ mod tests { fn create_file_info(id: u64, method: EncryptionMethod) -> FileInfo { FileInfo { key_id: id, - method: compat(method), + method, ..Default::default() } } diff --git a/components/encryption/src/io.rs b/components/encryption/src/io.rs index 6f7d28f61b8..e02aafabe88 100644 --- a/components/encryption/src/io.rs +++ b/components/encryption/src/io.rs @@ -377,6 +377,7 @@ pub fn create_aes_ctr_crypter( EncryptionMethod::Aes128Ctr => OCipher::aes_128_ctr(), EncryptionMethod::Aes192Ctr => OCipher::aes_192_ctr(), EncryptionMethod::Aes256Ctr => OCipher::aes_256_ctr(), + EncryptionMethod::Sm4Ctr => OCipher::sm4_ctr(), }; let 
crypter = OCrypter::new(cipher, mode, key, Some(iv.as_slice()))?; Ok((cipher, crypter)) @@ -408,7 +409,8 @@ impl CrypterCore { } fn reset_buffer(&mut self, size: usize) { - // OCrypter require the output buffer to have block_size extra bytes, or it will panic. + // OCrypter require the output buffer to have block_size extra bytes, or it will + // panic. self.buffer.resize(size + self.block_size, 0); } @@ -435,9 +437,10 @@ impl CrypterCore { Ok(()) } - /// For simplicity, the following implementation rely on the fact that OpenSSL always - /// return exact same size as input in CTR mode. If it is not true in the future, or we - /// want to support other counter modes, this code needs to be updated. + /// For simplicity, the following implementation rely on the fact that + /// OpenSSL always return exact same size as input in CTR mode. If it is + /// not true in the future, or we want to support other counter modes, + /// this code needs to be updated. pub fn do_crypter_in_place(&mut self, buf: &mut [u8]) -> IoResult<()> { if self.crypter.is_none() { self.reset_crypter(0)?; @@ -525,6 +528,7 @@ mod tests { EncryptionMethod::Aes128Ctr, EncryptionMethod::Aes192Ctr, EncryptionMethod::Aes256Ctr, + EncryptionMethod::Sm4Ctr, ]; let ivs = [ Iv::new_ctr(), @@ -593,6 +597,7 @@ mod tests { EncryptionMethod::Aes128Ctr, EncryptionMethod::Aes192Ctr, EncryptionMethod::Aes256Ctr, + EncryptionMethod::Sm4Ctr, ]; let mut plaintext = vec![0; 10240]; OsRng.fill_bytes(&mut plaintext); @@ -628,6 +633,7 @@ mod tests { EncryptionMethod::Aes128Ctr, EncryptionMethod::Aes192Ctr, EncryptionMethod::Aes256Ctr, + EncryptionMethod::Sm4Ctr, ]; let mut plaintext = vec![0; 10240]; OsRng.fill_bytes(&mut plaintext); @@ -688,9 +694,8 @@ mod tests { buf: &mut [u8], ) -> Poll> { let len = min(self.read_maxsize_once, buf.len()); - let r = self.cursor.read(&mut buf[..len]); - assert!(r.is_ok()); - Poll::Ready(IoResult::Ok(r.unwrap())) + let r = self.cursor.read(&mut buf[..len]).unwrap(); + 
Poll::Ready(IoResult::Ok(r)) } } @@ -700,6 +705,7 @@ mod tests { EncryptionMethod::Aes128Ctr, EncryptionMethod::Aes192Ctr, EncryptionMethod::Aes256Ctr, + EncryptionMethod::Sm4Ctr, ]; let iv = Iv::new_ctr(); let mut plain_text = vec![0; 10240]; @@ -720,11 +726,10 @@ mod tests { let mut encrypt_read_len = 0; loop { - let s = encrypt_reader + let read_len = encrypt_reader .read(&mut encrypt_text[encrypt_read_len..]) - .await; - assert!(s.is_ok()); - let read_len = s.unwrap(); + .await + .unwrap(); if read_len == 0 { break; } @@ -750,11 +755,10 @@ mod tests { .unwrap(); loop { - let s = decrypt_reader + let read_len = decrypt_reader .read(&mut decrypt_text[decrypt_read_len..]) - .await; - assert!(s.is_ok()); - let read_len = s.unwrap(); + .await + .unwrap(); if read_len == 0 { break; } diff --git a/components/encryption/src/lib.rs b/components/encryption/src/lib.rs index e6498e5d3ab..c16142eb30b 100644 --- a/components/encryption/src/lib.rs +++ b/components/encryption/src/lib.rs @@ -10,12 +10,13 @@ mod manager; mod master_key; mod metrics; +use std::{io::ErrorKind, path::Path}; + pub use self::{ config::*, crypter::{ - compat, encryption_method_from_db_encryption_method, - encryption_method_to_db_encryption_method, verify_encryption_config, AesGcmCrypter, Iv, - PlainKey, + from_engine_encryption_method, to_engine_encryption_method, verify_encryption_config, + AesGcmCrypter, Iv, PlainKey, }, encrypted_file::EncryptedFile, errors::{Error, Result, RetryCodedError}, @@ -28,3 +29,120 @@ pub use self::{ Backend, DataKeyPair, EncryptedKey, FileBackend, KmsBackend, KmsProvider, PlaintextBackend, }, }; + +const TRASH_PREFIX: &str = "TRASH-"; + +/// Remove a directory. +/// +/// Rename it before actually removal. 
+#[inline] +pub fn trash_dir_all( + path: impl AsRef<Path>, + key_manager: Option<&DataKeyManager>, +) -> std::io::Result<()> { + let path = path.as_ref(); + let name = match path.file_name() { + Some(n) => n, + None => { + return Err(std::io::Error::new( + ErrorKind::InvalidInput, + "path is invalid", + )); + } + }; + let trash_path = path.with_file_name(format!("{}{}", TRASH_PREFIX, name.to_string_lossy())); + if let Err(e) = file_system::rename(path, &trash_path) { + if e.kind() == ErrorKind::NotFound { + return Ok(()); + } + return Err(e); + } else if let Some(m) = key_manager { + m.remove_dir(path, Some(&trash_path))?; + } + file_system::remove_dir_all(trash_path) +} + +/// When using `trash_dir_all`, it's possible the directory is marked as trash +/// but not being actually deleted after a restart. This function can be used +/// to resume all those removals in the given directory. +#[inline] +pub fn clean_up_trash( + path: impl AsRef<Path>, + key_manager: Option<&DataKeyManager>, +) -> std::io::Result<()> { + for e in file_system::read_dir(path)? { + let e = e?; + let os_fname = e.file_name(); + let fname = os_fname.to_str().unwrap(); + if let Some(original) = fname.strip_prefix(TRASH_PREFIX) { + let original = e.path().with_file_name(original); + if let Some(m) = &key_manager { + m.remove_dir(&original, Some(&e.path()))?; + } + file_system::remove_dir_all(e.path())?; + } + } + Ok(()) +} + +/// Removes all directories with the given prefix. +#[inline] +pub fn clean_up_dir( + path: impl AsRef<Path>, + prefix: &str, + key_manager: Option<&DataKeyManager>, +) -> std::io::Result<()> { + for e in file_system::read_dir(path)?
{ + let e = e?; + let fname = e.file_name().to_str().unwrap().to_owned(); + if fname.starts_with(prefix) { + if let Some(m) = &key_manager { + m.remove_dir(&e.path(), None)?; + } + file_system::remove_dir_all(e.path())?; + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use tempfile::Builder; + + use super::*; + + #[test] + fn test_trash_dir_all() { + let tmp_dir = Builder::new() + .prefix("test_reserve_space_for_recover") + .tempdir() + .unwrap(); + let data_path = tmp_dir.path(); + let sub_dir0 = data_path.join("sub_dir0"); + let trash_sub_dir0 = data_path.join(format!("{}sub_dir0", TRASH_PREFIX)); + file_system::create_dir_all(&sub_dir0).unwrap(); + assert!(sub_dir0.exists()); + + trash_dir_all(&sub_dir0, None).unwrap(); + assert!(!sub_dir0.exists()); + assert!(!trash_sub_dir0.exists()); + + file_system::create_dir_all(&sub_dir0).unwrap(); + file_system::create_dir_all(&trash_sub_dir0).unwrap(); + trash_dir_all(&sub_dir0, None).unwrap(); + assert!(!sub_dir0.exists()); + assert!(!trash_sub_dir0.exists()); + + clean_up_trash(data_path, None).unwrap(); + + file_system::create_dir_all(&trash_sub_dir0).unwrap(); + assert!(trash_sub_dir0.exists()); + clean_up_trash(data_path, None).unwrap(); + assert!(!trash_sub_dir0.exists()); + + file_system::create_dir_all(&sub_dir0).unwrap(); + assert!(sub_dir0.exists()); + clean_up_dir(data_path, "sub", None).unwrap(); + assert!(!sub_dir0.exists()); + } +} diff --git a/components/encryption/src/manager/mod.rs b/components/encryption/src/manager/mod.rs index 0535cae16f1..be7008a33ae 100644 --- a/components/encryption/src/manager/mod.rs +++ b/components/encryption/src/manager/mod.rs @@ -1,7 +1,8 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - io::{Error as IoError, ErrorKind, Result as IoResult}, + collections::hash_map::Entry, + io::{self, Error as IoError, ErrorKind, Result as IoResult}, path::{Path, PathBuf}, sync::{ atomic::{AtomicU64, Ordering}, @@ -12,16 +13,19 @@ use std::{ }; use crossbeam::channel::{self, select, tick}; -use engine_traits::{EncryptionKeyManager, FileEncryptionInfo}; +use engine_traits::{ + EncryptionKeyManager, EncryptionMethod as EtEncryptionMethod, FileEncryptionInfo, +}; use fail::fail_point; use file_system::File; use kvproto::encryptionpb::{DataKey, EncryptionMethod, FileDictionary, FileInfo, KeyDictionary}; use protobuf::Message; -use tikv_util::{box_err, debug, error, info, thd_name, warn}; +use tikv_util::{box_err, debug, error, info, sys::thread::StdThreadBuildWrapper, thd_name, warn}; +use tokio::sync::oneshot; use crate::{ config::EncryptionConfig, - crypter::{self, compat, Iv}, + crypter::{self, Iv}, encrypted_file::EncryptedFile, file_dict_file::FileDictionaryFile, io::{DecrypterReader, EncrypterWriter}, @@ -33,6 +37,7 @@ use crate::{ const KEY_DICT_NAME: &str = "key.dict"; const FILE_DICT_NAME: &str = "file.dict"; const ROTATE_CHECK_PERIOD: u64 = 600; // 10min +const GENERATE_DATA_KEY_LIMIT: usize = 10; struct Dicts { // Maps data file paths to key id and metadata. This file is stored as plaintext. 
@@ -190,13 +195,17 @@ impl Dicts { dict.files.get(fname).cloned() } - fn new_file(&self, fname: &str, method: EncryptionMethod) -> Result { + fn new_file(&self, fname: &str, method: EncryptionMethod, sync: bool) -> Result { let mut file_dict_file = self.file_dict_file.lock().unwrap(); - let iv = Iv::new_ctr(); + let iv = if method != EncryptionMethod::Plaintext { + Iv::new_ctr() + } else { + Iv::Empty + }; let file = FileInfo { iv: iv.as_slice().to_vec(), key_id: self.current_key_id.load(Ordering::SeqCst), - method: compat(method), + method, ..Default::default() }; let file_num = { @@ -205,7 +214,7 @@ impl Dicts { file_dict.files.len() as _ }; - file_dict_file.insert(fname, &file)?; + file_dict_file.insert(fname, &file, sync)?; ENCRYPTION_FILE_NUM_GAUGE.set(file_num); if method != EncryptionMethod::Plaintext { @@ -221,7 +230,7 @@ impl Dicts { // If the file does not exist, return Ok(()) // In either case the intent that the file not exist is achieved. - fn delete_file(&self, fname: &str) -> Result<()> { + fn delete_file(&self, fname: &str, sync: bool) -> Result<()> { let mut file_dict_file = self.file_dict_file.lock().unwrap(); let (file, file_num) = { let mut file_dict = self.file_dict.lock().unwrap(); @@ -239,9 +248,9 @@ impl Dicts { } }; - file_dict_file.remove(fname)?; + file_dict_file.remove(fname, sync)?; ENCRYPTION_FILE_NUM_GAUGE.set(file_num); - if file.method != compat(EncryptionMethod::Plaintext) { + if file.method != EncryptionMethod::Plaintext { debug!("delete encrypted file"; "fname" => fname); } else { debug!("delete plaintext file"; "fname" => fname); @@ -249,7 +258,7 @@ impl Dicts { Ok(()) } - fn link_file(&self, src_fname: &str, dst_fname: &str) -> Result> { + fn link_file(&self, src_fname: &str, dst_fname: &str, sync: bool) -> Result> { let mut file_dict_file = self.file_dict_file.lock().unwrap(); let (method, file, file_num) = { let mut file_dict = self.file_dict.lock().unwrap(); @@ -261,19 +270,19 @@ impl Dicts { return Ok(None); } }; - // When 
an encrypted file exists in the file system, the file_dict must have info about - // this file. But the opposite is not true, this is because the actual file operation - // and file_dict operation are not atomic. + // When an encrypted file exists in the file system, the file_dict must have + // info about this file. But the opposite is not true, this is because the + // actual file operation and file_dict operation are not atomic. check_stale_file_exist(dst_fname, &mut file_dict, &mut file_dict_file)?; let method = file.method; file_dict.files.insert(dst_fname.to_owned(), file.clone()); let file_num = file_dict.files.len() as _; (method, file, file_num) }; - file_dict_file.insert(dst_fname, &file)?; + file_dict_file.insert(dst_fname, &file, sync)?; ENCRYPTION_FILE_NUM_GAUGE.set(file_num); - if method != compat(EncryptionMethod::Plaintext) { + if method != EncryptionMethod::Plaintext { info!("link encrypted file"; "src" => src_fname, "dst" => dst_fname); } else { info!("link plaintext file"; "src" => src_fname, "dst" => dst_fname); @@ -281,11 +290,15 @@ impl Dicts { Ok(Some(())) } - fn rotate_key(&self, key_id: u64, key: DataKey, master_key: &dyn Backend) -> Result<()> { + fn rotate_key(&self, key_id: u64, key: DataKey, master_key: &dyn Backend) -> Result { info!("encryption: rotate data key."; "key_id" => key_id); { let mut key_dict = self.key_dict.lock().unwrap(); - key_dict.keys.insert(key_id, key); + match key_dict.keys.entry(key_id) { + // key id collides + Entry::Occupied(_) => return Ok(false), + Entry::Vacant(e) => e.insert(key), + }; key_dict.current_key_id = key_id; }; @@ -293,7 +306,7 @@ impl Dicts { self.save_key_dict(master_key)?; // Update current data key id. self.current_key_id.store(key_id, Ordering::SeqCst); - Ok(()) + Ok(true) } fn maybe_rotate_data_key( @@ -310,7 +323,7 @@ impl Dicts { // Generate a new data key if // 1. encryption method is not the same, or // 2. the current data key was exposed and the new master key is secure. 
- if compat(method) == key.method && !(key.was_exposed && master_key.is_secure()) { + if method == key.method && !(key.was_exposed && master_key.is_secure()) { let creation_time = UNIX_EPOCH + Duration::from_secs(key.creation_time); match now.duration_since(creation_time) { Ok(duration) => { @@ -331,15 +344,32 @@ impl Dicts { let duration = now.duration_since(UNIX_EPOCH).unwrap(); let creation_time = duration.as_secs(); - let (key_id, key) = generate_data_key(method); - let data_key = DataKey { - key, - method: compat(method), - creation_time, - was_exposed: false, - ..Default::default() - }; - self.rotate_key(key_id, data_key, master_key) + // Generate new data key. + for _ in 0..GENERATE_DATA_KEY_LIMIT { + let (key_id, key) = generate_data_key(method); + if key_id == 0 { + // 0 is invalid + continue; + } + let data_key = DataKey { + key, + method, + creation_time, + was_exposed: false, + ..Default::default() + }; + + let ok = self.rotate_key(key_id, data_key, master_key)?; + if !ok { + // key id collides, retry + continue; + } + return Ok(()); + } + Err(box_err!( + "key id collides {} times!", + GENERATE_DATA_KEY_LIMIT + )) } } @@ -359,17 +389,22 @@ fn check_stale_file_exist( "Clean stale file information in file dictionary: {:?}", fname ); - file_dict_file.remove(fname)?; + file_dict_file.remove(fname, true)?; let _ = file_dict.files.remove(fname); } Ok(()) } +enum RotateTask { + Terminate, + Save(oneshot::Sender<()>), +} + fn run_background_rotate_work( dict: Arc, method: EncryptionMethod, master_key: &dyn Backend, - terminal_recv: channel::Receiver<()>, + rx: channel::Receiver, ) { let check_period = std::cmp::min( Duration::from_secs(ROTATE_CHECK_PERIOD), @@ -383,9 +418,17 @@ fn run_background_rotate_work( dict.maybe_rotate_data_key(method, master_key) .expect("Rotating key operation encountered error in the background worker"); }, - recv(terminal_recv) -> _ => { - info!("Key rotate worker has been cancelled."); - break + recv(rx) -> r => { + match r { + 
Err(_) | Ok(RotateTask::Terminate) => { + info!("Key rotate worker has been cancelled."); + return; + } + Ok(RotateTask::Save(tx)) => { + dict.save_key_dict(master_key).expect("Saving key dict encountered error in the background worker"); + tx.send(()).unwrap(); + } + } }, } } @@ -404,7 +447,7 @@ fn generate_data_key(method: EncryptionMethod) -> (u64, Vec) { pub struct DataKeyManager { dicts: Arc, method: EncryptionMethod, - rotate_terminal: channel::Sender<()>, + rotate_tx: channel::Sender, background_worker: Option>, } @@ -464,6 +507,24 @@ impl DataKeyManager { Ok(Some(Self::from_dicts(dicts, args.method, master_key)?)) } + /// Will block file operation for a considerable amount of time. Only used + /// for debugging purpose. + pub fn retain_encrypted_files(&self, f: impl Fn(&str) -> bool) { + let mut dict = self.dicts.file_dict.lock().unwrap(); + let mut file_dict_file = self.dicts.file_dict_file.lock().unwrap(); + dict.files.retain(|fname, info| { + if info.method != EncryptionMethod::Plaintext { + let retain = f(fname); + if !retain { + file_dict_file.remove(fname, true).unwrap(); + } + retain + } else { + false + } + }); + } + fn load_dicts(master_key: &dyn Backend, args: &DataKeyManagerArgs) -> Result { if args.method != EncryptionMethod::Plaintext && !master_key.is_secure() { return Err(box_err!( @@ -474,7 +535,7 @@ impl DataKeyManager { Dicts::open( &args.dict_path, args.rotation_period, - &*master_key, + master_key, args.enable_file_dictionary_log, args.file_dictionary_rewrite_threshold, ), @@ -540,7 +601,7 @@ impl DataKeyManager { )) })?; // Rewrite key_dict after replace master key. 
- dicts.save_key_dict(&*master_key)?; + dicts.save_key_dict(master_key)?; info!("encryption: persisted result after replace master key."); Ok(dicts) @@ -554,10 +615,10 @@ impl DataKeyManager { dicts.maybe_rotate_data_key(method, &*master_key)?; let dicts = Arc::new(dicts); let dict_clone = dicts.clone(); - let (rotate_terminal, rx) = channel::bounded(1); + let (rotate_tx, rx) = channel::bounded(1); let background_worker = std::thread::Builder::new() .name(thd_name!("enc:key")) - .spawn(move || { + .spawn_wrapper(move || { run_background_rotate_work(dict_clone, method, &*master_key, rx); })?; @@ -566,14 +627,14 @@ impl DataKeyManager { Ok(DataKeyManager { dicts, method, - rotate_terminal, + rotate_tx, background_worker: Some(background_worker), }) } pub fn create_file_for_write>(&self, path: P) -> Result> { let file_writer = File::create(&path)?; - self.open_file_with_writer(path, file_writer, true /*create*/) + self.open_file_with_writer(path, file_writer, true /* create */) } pub fn open_file_with_writer, W: std::io::Write>( @@ -595,9 +656,14 @@ impl DataKeyManager { }; EncrypterWriter::new( writer, - crypter::encryption_method_from_db_encryption_method(file.method), + crypter::from_engine_encryption_method(file.method), &file.key, - Iv::from_slice(&file.iv)?, + if file.method == EtEncryptionMethod::Plaintext { + debug_assert!(file.iv.is_empty()); + Iv::Empty + } else { + Iv::from_slice(&file.iv)? + }, ) } @@ -620,9 +686,14 @@ impl DataKeyManager { let file = self.get_file(fname)?; DecrypterReader::new( reader, - crypter::encryption_method_from_db_encryption_method(file.method), + crypter::from_engine_encryption_method(file.method), &file.key, - Iv::from_slice(&file.iv)?, + if file.method == EtEncryptionMethod::Plaintext { + debug_assert!(file.iv.is_empty()); + Iv::Empty + } else { + Iv::from_slice(&file.iv)? 
+ }, ) } @@ -654,9 +725,9 @@ impl DataKeyManager { let (_, file_dict) = FileDictionaryFile::open( dict_path, FILE_DICT_NAME, - true, /*enable_file_dictionary_log*/ + true, // enable_file_dictionary_log 1, - true, /*skip_rewrite*/ + true, // skip_rewrite )?; if let Some(file_path) = file_path { if let Some(info) = file_dict.files.get(file_path) { @@ -693,16 +764,65 @@ impl DataKeyManager { }; let encrypted_file = FileEncryptionInfo { key, - method: crypter::encryption_method_to_db_encryption_method(method), + method: crypter::to_engine_encryption_method(method), iv, }; Ok(Some(encrypted_file)) } + + /// Removes data keys under the directory `logical`. If `physical` is + /// present, it means the `logical` directory is already physically renamed + /// to `physical`. + /// There are two uses of this function: + /// + /// (1) without `physical`: `remove_dir` is called before + /// `fs::remove_dir_all`. User must guarantee that this directory won't be + /// read again even if the removal fails or panics. + /// + /// (2) with `physical`: Use `fs::rename` to rename the directory to trash. + /// Then `remove_dir` with `physical` set to the trash directory name. + /// Finally remove the trash directory. This is the safest way to delete a + /// directory.
+ pub fn remove_dir(&self, logical: &Path, physical: Option<&Path>) -> IoResult<()> { + let scan = physical.unwrap_or(logical); + debug_assert!(scan.is_dir()); + if !scan.exists() { + return Ok(()); + } + let mut iter = walkdir::WalkDir::new(scan).into_iter().peekable(); + while let Some(e) = iter.next() { + let e = e?; + if e.path_is_symlink() { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("unexpected symbolic link: {}", e.path().display()), + )); + } + let fname = e.path().to_str().unwrap(); + let sync = iter.peek().is_none(); + if let Some(p) = physical { + let sub = fname + .strip_prefix(p.to_str().unwrap()) + .unwrap() + .trim_start_matches('/'); + self.dicts + .delete_file(logical.join(sub).to_str().unwrap(), sync)?; + } else { + self.dicts.delete_file(fname, sync)?; + } + } + Ok(()) + } + + /// Return which method this manager is using. + pub fn encryption_method(&self) -> engine_traits::EncryptionMethod { + crypter::to_engine_encryption_method(self.method) + } } impl Drop for DataKeyManager { fn drop(&mut self) { - if let Err(e) = self.rotate_terminal.send(()) { + if let Err(e) = self.rotate_tx.send(RotateTask::Terminate) { info!("failed to terminate background rotation, are we shutting down?"; "err" => %e); } if let Some(Err(e)) = self.background_worker.take().map(|w| w.join()) { @@ -720,10 +840,10 @@ impl EncryptionKeyManager for DataKeyManager { // Return Plaintext if file is not found // RocksDB requires this let file = FileInfo::default(); - let method = compat(EncryptionMethod::Plaintext); + let method = EncryptionMethod::Plaintext; Ok(FileEncryptionInfo { key: vec![], - method: crypter::encryption_method_to_db_encryption_method(method), + method: crypter::to_engine_encryption_method(method), iv: file.iv, }) } @@ -734,10 +854,10 @@ impl EncryptionKeyManager for DataKeyManager { fn new_file(&self, fname: &str) -> IoResult { let (_, data_key) = self.dicts.current_data_key(); let key = data_key.get_key().to_owned(); - let file = 
self.dicts.new_file(fname, self.method)?; + let file = self.dicts.new_file(fname, self.method, true)?; let encrypted_file = FileEncryptionInfo { key, - method: crypter::encryption_method_to_db_encryption_method(file.method), + method: crypter::to_engine_encryption_method(file.method), iv: file.get_iv().to_owned(), }; Ok(encrypted_file) @@ -747,19 +867,169 @@ impl EncryptionKeyManager for DataKeyManager { fail_point!("key_manager_fails_before_delete_file", |_| IoResult::Err( std::io::ErrorKind::Other.into() )); - self.dicts.delete_file(fname)?; + // `RemoveDir` is not managed, but RocksDB may use `RenameFile` on a directory, + // which internally calls `LinkFile` and `DeleteFile`. + let path = Path::new(fname); + if path.is_dir() { + let mut iter = walkdir::WalkDir::new(path).into_iter().peekable(); + while let Some(e) = iter.next() { + self.dicts + .delete_file(e?.path().to_str().unwrap(), iter.peek().is_none())?; + } + } else { + self.dicts.delete_file(fname, true)?; + } Ok(()) } fn link_file(&self, src_fname: &str, dst_fname: &str) -> IoResult<()> { - self.dicts.link_file(src_fname, dst_fname)?; + let src_path = Path::new(src_fname); + let dst_path = Path::new(dst_fname); + if src_path.is_dir() { + let mut iter = walkdir::WalkDir::new(src_path) + .into_iter() + .filter(|e| e.as_ref().map_or(true, |e| !e.path().is_dir())) + .peekable(); + while let Some(e) = iter.next() { + let e = e?; + if e.path_is_symlink() { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("unexpected symbolic link: {}", e.path().display()), + )); + } + let sub_path = e.path().strip_prefix(src_path).unwrap(); + let src = e.path().to_str().unwrap(); + let dst_path = dst_path.join(sub_path); + let dst = dst_path.to_str().unwrap(); + self.dicts.link_file(src, dst, iter.peek().is_none())?; + } + } else { + self.dicts.link_file(src_fname, dst_fname, true)?; + } + Ok(()) + } +} + +/// An RAII-style importer of data keys. 
It automatically creates data key that +/// doesn't exist locally. It synchronizes log file in batch. It automatically +/// reverts changes if caller aborts. +pub struct DataKeyImporter<'a> { + manager: &'a DataKeyManager, + // Added file names. + file_additions: Vec, + // Added key ids. + key_additions: Vec, + committed: bool, +} + +#[allow(dead_code)] +impl<'a> DataKeyImporter<'a> { + pub fn new(manager: &'a DataKeyManager) -> Self { + Self { + manager, + file_additions: Vec::new(), + key_additions: Vec::new(), + committed: false, + } + } + + pub fn add(&mut self, fname: &str, iv: Vec, new_key: DataKey) -> Result<()> { + let method = new_key.method; + let mut key_id = None; + { + let mut key_dict = self.manager.dicts.key_dict.lock().unwrap(); + for (id, data_key) in &key_dict.keys { + if data_key.key == new_key.key { + key_id = Some(*id); + } + } + if key_id.is_none() { + for _ in 0..GENERATE_DATA_KEY_LIMIT { + // Match `generate_data_key`. + use rand::{rngs::OsRng, RngCore}; + let id = OsRng.next_u64(); + if let Entry::Vacant(e) = key_dict.keys.entry(id) { + key_id = Some(id); + e.insert(new_key); + self.key_additions.push(id); + break; + } + } + if key_id.is_none() { + return Err(box_err!( + "key id collides {} times!", + GENERATE_DATA_KEY_LIMIT + )); + } + } + } + + let file = FileInfo { + iv, + key_id: key_id.unwrap(), + method, + ..Default::default() + }; + let mut file_dict_file = self.manager.dicts.file_dict_file.lock().unwrap(); + let file_num = { + let mut file_dict = self.manager.dicts.file_dict.lock().unwrap(); + if let Entry::Vacant(e) = file_dict.files.entry(fname.to_owned()) { + e.insert(file.clone()); + } else { + return Err(box_err!("file name collides with existing file: {}", fname)); + } + file_dict.files.len() as _ + }; + file_dict_file.insert(fname, &file, false)?; + self.file_additions.push(fname.to_owned()); + ENCRYPTION_FILE_NUM_GAUGE.set(file_num); Ok(()) } + + pub fn commit(mut self) -> Result<()> { + let (tx, rx) = oneshot::channel(); + 
if !self.key_additions.is_empty() { + self.manager.rotate_tx.send(RotateTask::Save(tx)).unwrap(); + rx.blocking_recv().unwrap(); + } + if !self.file_additions.is_empty() { + self.manager.dicts.file_dict_file.lock().unwrap().sync()?; + } + self.committed = true; + Ok(()) + } + + pub fn rollback(&mut self) -> Result<()> { + assert!(!self.committed); + let mut iter = self.file_additions.drain(..).peekable(); + while let Some(f) = iter.next() { + self.manager.dicts.delete_file(&f, iter.peek().is_none())?; + } + for key_id in self.key_additions.drain(..) { + let mut key_dict = self.manager.dicts.key_dict.lock().unwrap(); + key_dict.keys.remove(&key_id); + } + let (tx, rx) = oneshot::channel(); + self.manager.rotate_tx.send(RotateTask::Save(tx)).unwrap(); + rx.blocking_recv().unwrap(); + Ok(()) + } +} + +impl<'a> Drop for DataKeyImporter<'a> { + fn drop(&mut self) { + if !self.committed { + if let Err(e) = self.rollback() { + warn!("failed to rollback imported data keys"; "err" => ?e); + } + } + } } #[cfg(test)] mod tests { - use engine_traits::EncryptionMethod as DBEncryptionMethod; + use engine_traits::EncryptionMethod as EtEncryptionMethod; use file_system::{remove_file, File}; use matches::assert_matches; use tempfile::TempDir; @@ -776,7 +1046,7 @@ mod tests { } fn new_mock_backend() -> Box { - Box::new(MockBackend::default()) + Box::::default() } fn new_key_manager_def( @@ -790,7 +1060,7 @@ mod tests { } match DataKeyManager::new_previous_loaded( master_backend, - Box::new(MockBackend::default()), + Box::::default(), args, ) { Ok(None) => panic!("expected encryption"), @@ -805,7 +1075,7 @@ mod tests { rotation_period: Duration::from_secs(60), enable_file_dictionary_log: true, file_dictionary_rewrite_threshold: 2, - dict_path: tmp_dir.path().as_os_str().to_str().unwrap().to_string(), + dict_path: tmp_dir.path().to_str().unwrap().to_string(), } } @@ -882,7 +1152,7 @@ mod tests { let foo3 = manager.get_file("foo").unwrap(); assert_eq!(foo1, foo3); let bar = 
manager.new_file("bar").unwrap(); - assert_eq!(bar.method, DBEncryptionMethod::Plaintext); + assert_eq!(bar.method, EtEncryptionMethod::Plaintext); } // When enabling encryption, using insecure master key is not allowed. @@ -893,7 +1163,7 @@ mod tests { let manager = new_key_manager( &tmp_dir, Some(EncryptionMethod::Aes256Ctr), - Box::new(PlaintextBackend::default()), + Box::::default(), new_mock_backend() as Box, ); manager.err().unwrap(); @@ -1262,13 +1532,283 @@ mod tests { encrypt_fail: false, ..MockBackend::default() }); - let previous = Box::new(PlaintextBackend::default()) as Box; + let previous = Box::::default() as Box; let result = new_key_manager(&tmp_dir, None, wrong_key, previous); - // When the master key is invalid, the key manager left a empty file dict and return errors. + // When the master key is invalid, the key manager left a empty file dict and + // return errors. assert!(result.is_err()); - let previous = Box::new(PlaintextBackend::default()) as Box; - let result = new_key_manager(&tmp_dir, None, right_key, previous); - assert!(result.is_ok()); + let previous = Box::::default() as Box; + new_key_manager(&tmp_dir, None, right_key, previous).unwrap(); + } + + #[test] + fn test_plaintext_encrypter_writer() { + use std::io::{Read, Write}; + + let _guard = LOCK_FOR_GAUGE.lock().unwrap(); + let (key_path, _tmp_key_dir) = create_key_file("key"); + let master_key_backend = + Box::new(FileBackend::new(key_path.as_path()).unwrap()) as Box; + let tmp_dir = tempfile::TempDir::new().unwrap(); + let previous = new_mock_backend() as Box; + let manager = new_key_manager(&tmp_dir, None, master_key_backend, previous).unwrap(); + let path = tmp_dir.path().join("nonencyrpted"); + let content = "I'm exposed.".to_string(); + { + let raw = File::create(&path).unwrap(); + let mut f = manager + .open_file_with_writer(&path, raw, false /* create */) + .unwrap(); + f.write_all(content.as_bytes()).unwrap(); + f.sync_all().unwrap(); + } + { + let mut buffer = 
String::new(); + let mut f = File::open(&path).unwrap(); + assert_eq!(f.read_to_string(&mut buffer).unwrap(), content.len()); + assert_eq!(buffer, content); + } + { + let mut buffer = String::new(); + let mut f = manager.open_file_for_read(&path).unwrap(); + assert_eq!(f.read_to_string(&mut buffer).unwrap(), content.len()); + assert_eq!(buffer, content); + } + } + + fn generate_mock_file>(dkm: Option<&DataKeyManager>, path: P, content: &String) { + use std::io::Write; + match dkm { + Some(manager) => { + // Encryption enabled. Use DataKeyManager to manage file. + let mut f = manager.create_file_for_write(&path).unwrap(); + f.write_all(content.as_bytes()).unwrap(); + f.sync_all().unwrap(); + } + None => { + // Encryption disabled. Write content in plaintext. + let mut f = File::create(&path).unwrap(); + f.write_all(content.as_bytes()).unwrap(); + f.sync_all().unwrap(); + } + } + } + + fn check_mock_file_content>( + dkm: Option<&DataKeyManager>, + path: P, + expected: &String, + ) { + use std::io::Read; + + match dkm { + Some(manager) => { + let mut buffer = String::new(); + let mut f = manager.open_file_for_read(&path).unwrap(); + assert_eq!(f.read_to_string(&mut buffer).unwrap(), expected.len()); + assert_eq!(buffer, expected.to_string()); + } + None => { + let mut buffer = String::new(); + let mut f = File::open(&path).unwrap(); + assert_eq!(f.read_to_string(&mut buffer).unwrap(), expected.len()); + assert_eq!(buffer, expected.to_string()); + } + } + } + + fn test_change_method(from: EncryptionMethod, to: EncryptionMethod) { + if from == to { + return; + } + + let generate_file_name = |method| format!("{:?}", method); + let generate_file_content = |method| format!("Encrypted with {:?}", method); + let tmp_dir = tempfile::TempDir::new().unwrap(); + let (key_path, _tmp_key_dir) = create_key_file("key"); + let master_key_backend = + Box::new(FileBackend::new(key_path.as_path()).unwrap()) as Box; + let previous = new_mock_backend() as Box; + let path_to_file1 = 
tmp_dir.path().join(generate_file_name(from)); + let content1 = generate_file_content(from); + + if from == EncryptionMethod::Plaintext { + // encryption not enabled. + let mut args = def_data_key_args(&tmp_dir); + args.method = EncryptionMethod::Plaintext; + let manager = + DataKeyManager::new(master_key_backend, Box::new(move || Ok(previous)), args) + .unwrap(); + assert!(manager.is_none()); + generate_mock_file(None, &path_to_file1, &content1); + check_mock_file_content(None, &path_to_file1, &content1); + } else { + let manager = + new_key_manager(&tmp_dir, Some(from), master_key_backend, previous).unwrap(); + + generate_mock_file(Some(&manager), &path_to_file1, &content1); + check_mock_file_content(Some(&manager), &path_to_file1, &content1); + // Close old manager + drop(manager); + } + + // re-open with new encryption/plaintext algorithm. + let master_key_backend = + Box::new(FileBackend::new(key_path.as_path()).unwrap()) as Box; + let previous = new_mock_backend() as Box; + let manager = new_key_manager(&tmp_dir, Some(to), master_key_backend, previous).unwrap(); + let path_to_file2 = tmp_dir.path().join(generate_file_name(to)); + + let content2 = generate_file_content(to); + generate_mock_file(Some(&manager), &path_to_file2, &content2); + check_mock_file_content(Some(&manager), &path_to_file2, &content2); + // check old file content + check_mock_file_content(Some(&manager), &path_to_file1, &content1); + } + + #[test] + fn test_encryption_algorithm_switch() { + let _guard = LOCK_FOR_GAUGE.lock().unwrap(); + + let method_list = [ + EncryptionMethod::Plaintext, + EncryptionMethod::Aes128Ctr, + EncryptionMethod::Aes192Ctr, + EncryptionMethod::Aes256Ctr, + EncryptionMethod::Sm4Ctr, + ]; + for from in method_list { + for to in method_list { + test_change_method(from, to) + } + } + } + + #[test] + fn test_rename_dir() { + let _guard = LOCK_FOR_GAUGE.lock().unwrap(); + let tmp_dir = tempfile::TempDir::new().unwrap(); + let manager = new_key_manager_def(&tmp_dir, 
Some(EncryptionMethod::Aes192Ctr)).unwrap(); + let subdir = tmp_dir.path().join("foo"); + std::fs::create_dir(&subdir).unwrap(); + let file_a = manager + .new_file(subdir.join("a").to_str().unwrap()) + .unwrap(); + File::create(subdir.join("a")).unwrap(); + let file_b = manager + .new_file(subdir.join("b").to_str().unwrap()) + .unwrap(); + File::create(subdir.join("b")).unwrap(); + + let dstdir = tmp_dir.path().join("bar"); + manager + .link_file(subdir.to_str().unwrap(), dstdir.to_str().unwrap()) + .unwrap(); + manager.delete_file(subdir.to_str().unwrap()).unwrap(); + + assert_eq!( + manager + .get_file(dstdir.join("a").to_str().unwrap()) + .unwrap(), + file_a + ); + assert_eq!( + manager + .get_file_exists(subdir.join("a").to_str().unwrap()) + .unwrap(), + None + ); + + assert_eq!( + manager + .get_file(dstdir.join("b").to_str().unwrap()) + .unwrap(), + file_b + ); + assert_eq!( + manager + .get_file_exists(subdir.join("b").to_str().unwrap()) + .unwrap(), + None + ); + } + + #[test] + fn test_import_keys() { + let _guard = LOCK_FOR_GAUGE.lock().unwrap(); + let tmp_dir = tempfile::TempDir::new().unwrap(); + let manager = new_key_manager_def(&tmp_dir, Some(EncryptionMethod::Aes192Ctr)).unwrap(); + + let mut importer = DataKeyImporter::new(&manager); + let file0 = manager.new_file("0").unwrap(); + + // conflict + importer + .add("0", file0.iv.clone(), DataKey::default()) + .unwrap_err(); + // same key + importer + .add( + "1", + file0.iv.clone(), + DataKey { + key: file0.key.clone(), + method: EncryptionMethod::Aes192Ctr, + ..Default::default() + }, + ) + .unwrap(); + // different key + let (_, key2) = generate_data_key(EncryptionMethod::Aes192Ctr); + importer + .add( + "2", + Iv::new_ctr().as_slice().to_owned(), + DataKey { + key: key2.clone(), + method: EncryptionMethod::Aes192Ctr, + ..Default::default() + }, + ) + .unwrap(); + + assert_eq!(manager.get_file("0").unwrap(), file0); + assert_eq!(manager.get_file("1").unwrap(), file0); + 
assert_eq!(manager.get_file("2").unwrap().key, key2); + + drop(importer); + assert_eq!(manager.get_file_exists("1").unwrap(), None); + assert_eq!(manager.get_file_exists("2").unwrap(), None); + + let mut importer = DataKeyImporter::new(&manager); + // same key + importer + .add( + "1", + file0.iv.clone(), + DataKey { + key: file0.key.clone(), + method: EncryptionMethod::Aes192Ctr, + ..Default::default() + }, + ) + .unwrap(); + // different key + importer + .add( + "2", + Iv::new_ctr().as_slice().to_owned(), + DataKey { + key: key2.clone(), + method: EncryptionMethod::Aes192Ctr, + ..Default::default() + }, + ) + .unwrap(); + // importer is dropped here. + importer.commit().unwrap(); + assert_eq!(manager.get_file("1").unwrap(), file0); + assert_eq!(manager.get_file("2").unwrap().key, key2); } } diff --git a/components/encryption/src/master_key/kms.rs b/components/encryption/src/master_key/kms.rs index 601c982a961..8520e7a0cbe 100644 --- a/components/encryption/src/master_key/kms.rs +++ b/components/encryption/src/master_key/kms.rs @@ -8,6 +8,7 @@ use kvproto::encryptionpb::EncryptedContent; use tikv_util::{ box_err, error, stream::{retry, with_timeout}, + sys::thread::ThreadBuildWrapper, }; use tokio::runtime::{Builder, Runtime}; @@ -81,6 +82,8 @@ impl KmsBackend { Builder::new_current_thread() .thread_name("kms-runtime") .enable_all() + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) .build()?, ); @@ -121,17 +124,17 @@ impl KmsBackend { Ok(content) } - // On decrypt failure, the rule is to return WrongMasterKey error in case it is possible that - // a wrong master key has been used, or other error otherwise. + // On decrypt failure, the rule is to return WrongMasterKey error in case it is + // possible that a wrong master key has been used, or other error otherwise. 
fn decrypt_content(&self, content: &EncryptedContent) -> Result> { let vendor_name = self.kms_provider.name(); match content.metadata.get(MetadataKey::KmsVendor.as_str()) { Some(val) if val.as_slice() == vendor_name.as_bytes() => (), None => { return Err( - // If vender is missing in metadata, it could be the encrypted content is invalid - // or corrupted, but it is also possible that the content is encrypted using the - // FileBackend. Return WrongMasterKey anyway. + // If vender is missing in metadata, it could be the encrypted content is + // invalid or corrupted, but it is also possible that the content is encrypted + // using the FileBackend. Return WrongMasterKey anyway. Error::WrongMasterKey(box_err!("missing KMS vendor")), ); } diff --git a/components/encryption/src/master_key/mem.rs b/components/encryption/src/master_key/mem.rs index 92453dac5f2..8e65b85fff6 100644 --- a/components/encryption/src/master_key/mem.rs +++ b/components/encryption/src/master_key/mem.rs @@ -38,24 +38,25 @@ impl MemAesGcmBackend { Ok(content) } - // On decrypt failure, the rule is to return WrongMasterKey error in case it is possible that - // a wrong master key has been used, or other error otherwise. + // On decrypt failure, the rule is to return WrongMasterKey error in case it is + // possible that a wrong master key has been used, or other error otherwise. pub fn decrypt_content(&self, content: &EncryptedContent) -> Result> { let method = content .get_metadata() .get(MetadataKey::Method.as_str()) .ok_or_else(|| { - // Missing method in metadata. The metadata of the encrypted content is invalid or - // corrupted. + // Missing method in metadata. The metadata of the encrypted content is invalid + // or corrupted. Error::Other(box_err!( "metadata {} not found", MetadataKey::Method.as_str() )) })?; if method.as_slice() != MetadataMethod::Aes256Gcm.as_slice() { - // Currently we only support aes256-gcm. 
A different method could mean the encrypted - // content is written by a future version of TiKV, and we don't know how to handle it. - // Fail immediately instead of fallback to previous key. + // Currently we only support aes256-gcm. A different method could mean the + // encrypted content is written by a future version of TiKV, and we + // don't know how to handle it. Fail immediately instead of fallback + // to previous key. return Err(Error::Other(box_err!( "encryption method mismatch, expected {:?} vs actual {:?}", MetadataMethod::Aes256Gcm.as_slice(), @@ -75,7 +76,8 @@ impl MemAesGcmBackend { .get_metadata() .get(MetadataKey::AesGcmTag.as_str()) .ok_or_else(|| { - // Tag is missing. The metadata of the encrypted content is invalid or corrupted. + // Tag is missing. The metadata of the encrypted content is invalid or + // corrupted. Error::Other(box_err!("gcm tag not found")) })?; let gcm_tag = AesGcmTag::from(tag.as_slice()); diff --git a/components/encryption/src/master_key/metadata.rs b/components/encryption/src/master_key/metadata.rs index 8537a2416e3..38518cf0b34 100644 --- a/components/encryption/src/master_key/metadata.rs +++ b/components/encryption/src/master_key/metadata.rs @@ -1,6 +1,6 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Hash, PartialEq)] pub enum MetadataKey { Method, Iv, @@ -27,7 +27,7 @@ impl MetadataKey { } } -#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Hash, PartialEq)] pub enum MetadataMethod { Plaintext, Aes256Gcm, diff --git a/components/encryption/src/master_key/mod.rs b/components/encryption/src/master_key/mod.rs index f975e1de7b9..59578a2bcf0 100644 --- a/components/encryption/src/master_key/mod.rs +++ b/components/encryption/src/master_key/mod.rs @@ -106,8 +106,9 @@ pub mod tests { } impl MockBackend { - // Callers are responsible for enabling tracking on the MockBackend by calling this function - // This names the backend instance, allowiing later fine-grained recall + // Callers are responsible for enabling tracking on the MockBackend by calling + // this function This names the backend instance, allowing later fine-grained + // recall pub fn track(&mut self, name: String) { let track = make_track(&name); self.track = track.clone(); diff --git a/components/engine_panic/Cargo.toml b/components/engine_panic/Cargo.toml index 36f9b92ec24..ec77e2b715f 100644 --- a/components/engine_panic/Cargo.toml +++ b/components/engine_panic/Cargo.toml @@ -5,11 +5,15 @@ description = "An example TiKV storage engine that does nothing but panic" edition = "2018" publish = false +[features] +testexport = [] + [dependencies] -engine_traits = { path = "../engine_traits", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +engine_traits = { workspace = true } +kvproto = { workspace = true } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } # FIXME: Remove this dep from the engine_traits interface -tikv_util = { path = "../tikv_util", default-features = false } -txn_types = { path = "../txn_types", default-features = false } 
+tikv_util = { workspace = true } +tracker = { workspace = true } +txn_types = { workspace = true } diff --git a/components/engine_panic/src/cf_names.rs b/components/engine_panic/src/cf_names.rs index 8697634586b..ee71210f229 100644 --- a/components/engine_panic/src/cf_names.rs +++ b/components/engine_panic/src/cf_names.rs @@ -1,10 +1,10 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::CFNamesExt; +use engine_traits::CfNamesExt; use crate::engine::PanicEngine; -impl CFNamesExt for PanicEngine { +impl CfNamesExt for PanicEngine { fn cf_names(&self) -> Vec<&str> { panic!() } diff --git a/components/engine_panic/src/cf_options.rs b/components/engine_panic/src/cf_options.rs index 918185b8183..cd4f7ee82d5 100644 --- a/components/engine_panic/src/cf_options.rs +++ b/components/engine_panic/src/cf_options.rs @@ -1,13 +1,13 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::{CFOptionsExt, ColumnFamilyOptions, Result, SstPartitionerFactory}; +use engine_traits::{CfOptions, CfOptionsExt, Result, SstPartitionerFactory}; -use crate::{db_options::PanicTitanDBOptions, engine::PanicEngine}; +use crate::{db_options::PanicTitanDbOptions, engine::PanicEngine}; -impl CFOptionsExt for PanicEngine { - type ColumnFamilyOptions = PanicColumnFamilyOptions; +impl CfOptionsExt for PanicEngine { + type CfOptions = PanicCfOptions; - fn get_options_cf(&self, cf: &str) -> Result { + fn get_options_cf(&self, cf: &str) -> Result { panic!() } fn set_options_cf(&self, cf: &str, options: &[(&str, &str)]) -> Result<()> { @@ -15,10 +15,10 @@ impl CFOptionsExt for PanicEngine { } } -pub struct PanicColumnFamilyOptions; +pub struct PanicCfOptions; -impl ColumnFamilyOptions for PanicColumnFamilyOptions { - type TitanDBOptions = PanicTitanDBOptions; +impl CfOptions for PanicCfOptions { + type TitanCfOptions = PanicTitanDbOptions; fn new() -> Self { panic!() @@ -26,10 +26,10 @@ impl ColumnFamilyOptions for 
PanicColumnFamilyOptions { fn get_max_write_buffer_number(&self) -> u32 { panic!() } - fn get_level_zero_slowdown_writes_trigger(&self) -> u32 { + fn get_level_zero_slowdown_writes_trigger(&self) -> i32 { panic!() } - fn get_level_zero_stop_writes_trigger(&self) -> u32 { + fn get_level_zero_stop_writes_trigger(&self) -> i32 { panic!() } fn set_level_zero_file_num_compaction_trigger(&mut self, v: i32) { @@ -44,10 +44,10 @@ impl ColumnFamilyOptions for PanicColumnFamilyOptions { fn get_block_cache_capacity(&self) -> u64 { panic!() } - fn set_block_cache_capacity(&self, capacity: u64) -> std::result::Result<(), String> { + fn set_block_cache_capacity(&self, capacity: u64) -> Result<()> { panic!() } - fn set_titandb_options(&mut self, opts: &Self::TitanDBOptions) { + fn set_titan_cf_options(&mut self, opts: &Self::TitanCfOptions) { panic!() } fn get_target_file_size_base(&self) -> u64 { diff --git a/components/engine_panic/src/checkpoint.rs b/components/engine_panic/src/checkpoint.rs new file mode 100644 index 00000000000..bed49c8e55b --- /dev/null +++ b/components/engine_panic/src/checkpoint.rs @@ -0,0 +1,33 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use core::panic; +use std::path::Path; + +use engine_traits::{Checkpointable, Checkpointer, Result}; + +use crate::PanicEngine; + +pub struct PanicCheckpointer {} + +impl Checkpointable for PanicEngine { + type Checkpointer = PanicCheckpointer; + + fn new_checkpointer(&self) -> Result { + panic!() + } + + fn merge(&self, dbs: &[&Self]) -> Result<()> { + panic!() + } +} + +impl Checkpointer for PanicCheckpointer { + fn create_at( + &mut self, + db_out_dir: &Path, + titan_out_dir: Option<&Path>, + log_size_for_flush: u64, + ) -> Result<()> { + panic!() + } +} diff --git a/components/engine_panic/src/compact.rs b/components/engine_panic/src/compact.rs index f1e78d57010..988bec790de 100644 --- a/components/engine_panic/src/compact.rs +++ b/components/engine_panic/src/compact.rs @@ -13,7 +13,7 @@ impl CompactExt for PanicEngine { panic!() } - fn compact_range( + fn compact_range_cf( &self, cf: &str, start_key: Option<&[u8]>, @@ -24,15 +24,6 @@ impl CompactExt for PanicEngine { panic!() } - fn compact_files_in_range( - &self, - start: Option<&[u8]>, - end: Option<&[u8]>, - output_level: Option, - ) -> Result<()> { - panic!() - } - fn compact_files_in_range_cf( &self, cf: &str, diff --git a/components/engine_panic/src/db_options.rs b/components/engine_panic/src/db_options.rs index f28741ce4c2..c081a5c1d12 100644 --- a/components/engine_panic/src/db_options.rs +++ b/components/engine_panic/src/db_options.rs @@ -1,13 +1,13 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::{DBOptions, DBOptionsExt, Result, TitanDBOptions}; +use engine_traits::{DbOptions, DbOptionsExt, Result, TitanCfOptions}; use crate::engine::PanicEngine; -impl DBOptionsExt for PanicEngine { - type DBOptions = PanicDBOptions; +impl DbOptionsExt for PanicEngine { + type DbOptions = PanicDbOptions; - fn get_db_options(&self) -> Self::DBOptions { + fn get_db_options(&self) -> Self::DbOptions { panic!() } fn set_db_options(&self, options: &[(&str, &str)]) -> Result<()> { @@ -15,10 +15,10 @@ impl DBOptionsExt for PanicEngine { } } -pub struct PanicDBOptions; +pub struct PanicDbOptions; -impl DBOptions for PanicDBOptions { - type TitanDBOptions = PanicTitanDBOptions; +impl DbOptions for PanicDbOptions { + type TitanDbOptions = PanicTitanDbOptions; fn new() -> Self { panic!() @@ -44,14 +44,22 @@ impl DBOptions for PanicDBOptions { panic!() } - fn set_titandb_options(&mut self, opts: &Self::TitanDBOptions) { + fn set_flush_size(&mut self, f: usize) -> Result<()> { + panic!() + } + + fn set_flush_oldest_first(&mut self, f: bool) -> Result<()> { + panic!() + } + + fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { panic!() } } -pub struct PanicTitanDBOptions; +pub struct PanicTitanDbOptions; -impl TitanDBOptions for PanicTitanDBOptions { +impl TitanCfOptions for PanicTitanDbOptions { fn new() -> Self { panic!() } diff --git a/components/engine_panic/src/db_vector.rs b/components/engine_panic/src/db_vector.rs index 83d615dbc4c..3daf6dc9500 100644 --- a/components/engine_panic/src/db_vector.rs +++ b/components/engine_panic/src/db_vector.rs @@ -2,14 +2,14 @@ use std::ops::Deref; -use engine_traits::DBVector; +use engine_traits::DbVector; #[derive(Debug)] -pub struct PanicDBVector; +pub struct PanicDbVector; -impl DBVector for PanicDBVector {} +impl DbVector for PanicDbVector {} -impl Deref for PanicDBVector { +impl Deref for PanicDbVector { type Target = [u8]; fn deref(&self) -> &[u8] { @@ -17,7 +17,7 @@ impl Deref for PanicDBVector { } } 
-impl<'a> PartialEq<&'a [u8]> for PanicDBVector { +impl<'a> PartialEq<&'a [u8]> for PanicDbVector { fn eq(&self, rhs: &&[u8]) -> bool { **rhs == **self } diff --git a/components/engine_panic/src/engine.rs b/components/engine_panic/src/engine.rs index 33c7bc01541..d8faf8fee01 100644 --- a/components/engine_panic/src/engine.rs +++ b/components/engine_panic/src/engine.rs @@ -1,11 +1,11 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. use engine_traits::{ - IterOptions, Iterable, Iterator, KvEngine, Peekable, ReadOptions, Result, SeekKey, SyncMutable, + IterOptions, Iterable, Iterator, KvEngine, Peekable, ReadOptions, Result, SyncMutable, WriteOptions, }; -use crate::{db_vector::PanicDBVector, snapshot::PanicSnapshot, write_batch::PanicWriteBatch}; +use crate::{db_vector::PanicDbVector, snapshot::PanicSnapshot, write_batch::PanicWriteBatch}; #[derive(Clone, Debug)] pub struct PanicEngine; @@ -22,12 +22,16 @@ impl KvEngine for PanicEngine { fn bad_downcast(&self) -> &T { panic!() } + #[cfg(any(test, feature = "testexport"))] + fn inner_refcount(&self) -> usize { + panic!() + } } impl Peekable for PanicEngine { - type DBVector = PanicDBVector; + type DbVector = PanicDbVector; - fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { + fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { panic!() } fn get_value_cf_opt( @@ -35,7 +39,7 @@ impl Peekable for PanicEngine { opts: &ReadOptions, cf: &str, key: &[u8], - ) -> Result> { + ) -> Result> { panic!() } } @@ -65,10 +69,7 @@ impl SyncMutable for PanicEngine { impl Iterable for PanicEngine { type Iterator = PanicEngineIterator; - fn iterator_opt(&self, opts: IterOptions) -> Result { - panic!() - } - fn iterator_cf_opt(&self, cf: &str, opts: IterOptions) -> Result { + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { panic!() } } @@ -76,10 +77,18 @@ impl Iterable for PanicEngine { pub struct PanicEngineIterator; impl Iterator for PanicEngineIterator { - fn 
seek(&mut self, key: SeekKey<'_>) -> Result { + fn seek(&mut self, key: &[u8]) -> Result { + panic!() + } + fn seek_for_prev(&mut self, key: &[u8]) -> Result { panic!() } - fn seek_for_prev(&mut self, key: SeekKey<'_>) -> Result { + + fn seek_to_first(&mut self) -> Result { + panic!() + } + + fn seek_to_last(&mut self) -> Result { panic!() } diff --git a/components/engine_panic/src/lib.rs b/components/engine_panic/src/lib.rs index 761b31af1d8..93555f5ba5f 100644 --- a/components/engine_panic/src/lib.rs +++ b/components/engine_panic/src/lib.rs @@ -45,5 +45,6 @@ pub mod flow_control_factors; pub use crate::flow_control_factors::*; pub mod table_properties; pub use crate::table_properties::*; +pub mod checkpoint; mod raft_engine; diff --git a/components/engine_panic/src/misc.rs b/components/engine_panic/src/misc.rs index 9a5cc310fc3..5603bf43c77 100644 --- a/components/engine_panic/src/misc.rs +++ b/components/engine_panic/src/misc.rs @@ -1,15 +1,33 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::{DeleteStrategy, MiscExt, Range, Result}; +use engine_traits::{DeleteStrategy, MiscExt, Range, Result, StatisticsReporter}; use crate::engine::PanicEngine; +pub struct PanicReporter; + +impl StatisticsReporter for PanicReporter { + fn new(name: &str) -> Self { + panic!() + } + + fn collect(&mut self, engine: &PanicEngine) { + panic!() + } + + fn flush(&mut self) { + panic!() + } +} + impl MiscExt for PanicEngine { - fn flush(&self, sync: bool) -> Result<()> { + type StatisticsReporter = PanicReporter; + + fn flush_cfs(&self, cfs: &[&str], wait: bool) -> Result<()> { panic!() } - fn flush_cf(&self, cf: &str, sync: bool) -> Result<()> { + fn flush_cf(&self, cf: &str, wait: bool) -> Result<()> { panic!() } @@ -30,11 +48,11 @@ impl MiscExt for PanicEngine { panic!() } - fn get_engine_used_size(&self) -> Result { + fn get_sst_key_ranges(&self, cf: &str, level: usize) -> Result, Vec)>> { panic!() } - fn roughly_cleanup_ranges(&self, ranges: &[(Vec, Vec)]) -> Result<()> { + fn get_engine_used_size(&self) -> Result { panic!() } @@ -46,10 +64,22 @@ impl MiscExt for PanicEngine { panic!() } + fn pause_background_work(&self) -> Result<()> { + panic!() + } + + fn continue_background_work(&self) -> Result<()> { + panic!() + } + fn exists(path: &str) -> bool { panic!() } + fn locked(path: &str) -> Result { + panic!() + } + fn dump_stats(&self) -> Result { panic!() } @@ -66,6 +96,10 @@ impl MiscExt for PanicEngine { panic!() } + fn get_num_keys(&self) -> Result { + panic!() + } + fn get_range_entries_and_versions( &self, cf: &str, diff --git a/components/engine_panic/src/perf_context.rs b/components/engine_panic/src/perf_context.rs index 654ac01a629..27bdd1ac066 100644 --- a/components/engine_panic/src/perf_context.rs +++ b/components/engine_panic/src/perf_context.rs @@ -1,13 +1,14 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
use engine_traits::{PerfContext, PerfContextExt, PerfContextKind, PerfLevel}; +use tracker::TrackerToken; use crate::engine::PanicEngine; impl PerfContextExt for PanicEngine { type PerfContext = PanicPerfContext; - fn get_perf_context(&self, level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { + fn get_perf_context(level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { panic!() } } @@ -19,7 +20,7 @@ impl PerfContext for PanicPerfContext { panic!() } - fn report_metrics(&mut self) { + fn report_metrics(&mut self, _: &[TrackerToken]) { panic!() } } diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index 9842e1100ed..c0539c1edd5 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -1,7 +1,12 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. use engine_traits::{Error, RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch, Result}; -use kvproto::raft_serverpb::RaftLocalState; +use kvproto::{ + metapb::Region, + raft_serverpb::{ + RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent, StoreRecoverState, + }, +}; use raft::eraftpb::Entry; use crate::{engine::PanicEngine, write_batch::PanicWriteBatch}; @@ -29,6 +34,46 @@ impl RaftEngineReadOnly for PanicEngine { fn get_all_entries_to(&self, region_id: u64, buf: &mut Vec) -> Result<()> { panic!() } + + fn is_empty(&self) -> Result { + panic!() + } + + fn get_store_ident(&self) -> Result> { + panic!() + } + + fn get_prepare_bootstrap_region(&self) -> Result> { + panic!() + } + + fn get_region_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result> { + panic!() + } + + fn get_apply_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result> { + panic!() + } + + fn get_flushed_index(&self, raft_group_id: u64, cf: &str) -> Result> { + panic!() + } + + fn get_dirty_mark(&self, raft_group_id: u64, tablet_index: u64) -> Result { + panic!() + } 
+ + fn get_recover_state(&self) -> Result> { + panic!() + } } impl RaftEngineDebug for PanicEngine { @@ -79,49 +124,59 @@ impl RaftEngine for PanicEngine { panic!() } - fn append(&self, raft_group_id: u64, entries: Vec) -> Result { + fn gc(&self, raft_group_id: u64, from: u64, to: u64, batch: &mut Self::LogBatch) -> Result<()> { panic!() } - fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { + fn delete_all_but_one_states_before( + &self, + raft_group_id: u64, + apply_index: u64, + batch: &mut Self::LogBatch, + ) -> Result<()> { panic!() } - fn gc(&self, raft_group_id: u64, mut from: u64, to: u64) -> Result { + fn need_manual_purge(&self) -> bool { panic!() } - fn purge_expired_files(&self) -> Result> { + fn manual_purge(&self) -> Result> { panic!() } - fn has_builtin_entry_cache(&self) -> bool { + fn flush_metrics(&self, instance: &str) { panic!() } - fn flush_metrics(&self, instance: &str) { + fn dump_stats(&self) -> Result { panic!() } - fn reset_statistics(&self) { + fn get_engine_size(&self) -> Result { panic!() } - fn dump_stats(&self) -> Result { + fn get_engine_path(&self) -> &str { panic!() } - fn get_engine_size(&self) -> Result { + fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> + where + F: FnMut(u64) -> std::result::Result<(), E>, + E: From, + { panic!() } } impl RaftLogBatch for PanicWriteBatch { - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { - panic!() - } - - fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64) { + fn append( + &mut self, + raft_group_id: u64, + overwrite_to: Option, + entries: Vec, + ) -> Result<()> { panic!() } @@ -140,4 +195,52 @@ impl RaftLogBatch for PanicWriteBatch { fn merge(&mut self, _: Self) -> Result<()> { panic!() } + + fn put_store_ident(&mut self, ident: &StoreIdent) -> Result<()> { + panic!() + } + + fn put_prepare_bootstrap_region(&mut self, region: &Region) -> Result<()> { + panic!() + } + + fn 
remove_prepare_bootstrap_region(&mut self) -> Result<()> { + panic!() + } + + fn put_region_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RegionLocalState, + ) -> Result<()> { + panic!() + } + + fn put_apply_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RaftApplyState, + ) -> Result<()> { + panic!() + } + + fn put_flushed_index( + &mut self, + raft_group_id: u64, + cf: &str, + tablet_index: u64, + apply_index: u64, + ) -> Result<()> { + panic!() + } + + fn put_dirty_mark(&mut self, raft_group_id: u64, tablet_index: u64, dirty: bool) -> Result<()> { + panic!() + } + + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { + panic!() + } } diff --git a/components/engine_panic/src/snapshot.rs b/components/engine_panic/src/snapshot.rs index c65dc560326..f6cda5312cb 100644 --- a/components/engine_panic/src/snapshot.rs +++ b/components/engine_panic/src/snapshot.rs @@ -3,32 +3,29 @@ use std::ops::Deref; use engine_traits::{ - IterOptions, Iterable, Iterator, Peekable, ReadOptions, Result, SeekKey, Snapshot, + CfNamesExt, IterOptions, Iterable, Iterator, Peekable, ReadOptions, Result, Snapshot, }; -use crate::{db_vector::PanicDBVector, engine::PanicEngine}; +use crate::{db_vector::PanicDbVector, engine::PanicEngine}; #[derive(Clone, Debug)] pub struct PanicSnapshot; -impl Snapshot for PanicSnapshot { - fn cf_names(&self) -> Vec<&str> { - panic!() - } -} +impl Snapshot for PanicSnapshot {} impl Peekable for PanicSnapshot { - type DBVector = PanicDBVector; + type DbVector = PanicDbVector; - fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { + fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { panic!() } + fn get_value_cf_opt( &self, opts: &ReadOptions, cf: &str, key: &[u8], - ) -> Result> { + ) -> Result> { panic!() } } @@ -36,10 +33,13 @@ impl Peekable for PanicSnapshot { impl Iterable for PanicSnapshot { type Iterator = PanicSnapshotIterator; - fn iterator_opt(&self, 
opts: IterOptions) -> Result { + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { panic!() } - fn iterator_cf_opt(&self, cf: &str, opts: IterOptions) -> Result { +} + +impl CfNamesExt for PanicSnapshot { + fn cf_names(&self) -> Vec<&str> { panic!() } } @@ -47,10 +47,18 @@ impl Iterable for PanicSnapshot { pub struct PanicSnapshotIterator; impl Iterator for PanicSnapshotIterator { - fn seek(&mut self, key: SeekKey<'_>) -> Result { + fn seek(&mut self, key: &[u8]) -> Result { + panic!() + } + fn seek_for_prev(&mut self, key: &[u8]) -> Result { panic!() } - fn seek_for_prev(&mut self, key: SeekKey<'_>) -> Result { + + fn seek_to_first(&mut self) -> Result { + panic!() + } + + fn seek_to_last(&mut self) -> Result { panic!() } diff --git a/components/engine_panic/src/sst.rs b/components/engine_panic/src/sst.rs index 64aa5666fe1..a0f1479604c 100644 --- a/components/engine_panic/src/sst.rs +++ b/components/engine_panic/src/sst.rs @@ -1,9 +1,9 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::path::PathBuf; +use std::{marker::PhantomData, path::PathBuf}; use engine_traits::{ - CfName, ExternalSstFileInfo, IterOptions, Iterable, Iterator, Result, SeekKey, + CfName, ExternalSstFileInfo, IterOptions, Iterable, Iterator, RefIterable, Result, SstCompressionType, SstExt, SstReader, SstWriter, SstWriterBuilder, }; @@ -24,29 +24,33 @@ impl SstReader for PanicSstReader { fn verify_checksum(&self) -> Result<()> { panic!() } - fn iter(&self) -> Self::Iterator { +} + +impl RefIterable for PanicSstReader { + type Iterator<'a> = PanicSstReaderIterator<'a>; + + fn iter(&self, opts: IterOptions) -> Result> { panic!() } } -impl Iterable for PanicSstReader { - type Iterator = PanicSstReaderIterator; +pub struct PanicSstReaderIterator<'a> { + _phantom: PhantomData<&'a ()>, +} - fn iterator_opt(&self, opts: IterOptions) -> Result { +impl Iterator for PanicSstReaderIterator<'_> { + fn seek(&mut self, key: &[u8]) -> Result { panic!() } - fn iterator_cf_opt(&self, cf: &str, opts: IterOptions) -> Result { + fn seek_for_prev(&mut self, key: &[u8]) -> Result { panic!() } -} - -pub struct PanicSstReaderIterator; -impl Iterator for PanicSstReaderIterator { - fn seek(&mut self, key: SeekKey<'_>) -> Result { + fn seek_to_first(&mut self) -> Result { panic!() } - fn seek_for_prev(&mut self, key: SeekKey<'_>) -> Result { + + fn seek_to_last(&mut self) -> Result { panic!() } diff --git a/components/engine_panic/src/write_batch.rs b/components/engine_panic/src/write_batch.rs index d2dc866ca31..5c7b1a30922 100644 --- a/components/engine_panic/src/write_batch.rs +++ b/components/engine_panic/src/write_batch.rs @@ -20,7 +20,7 @@ impl WriteBatchExt for PanicEngine { pub struct PanicWriteBatch; impl WriteBatch for PanicWriteBatch { - fn write_opt(&self, _: &WriteOptions) -> Result<()> { + fn write_opt(&mut self, _: &WriteOptions) -> Result { panic!() } diff --git a/components/engine_rocks/Cargo.toml b/components/engine_rocks/Cargo.toml index 7d1a90d7afe..d31ed947520 100644 --- 
a/components/engine_rocks/Cargo.toml +++ b/components/engine_rocks/Cargo.toml @@ -5,10 +5,12 @@ edition = "2018" publish = false [features] +trace-lifetime = [] jemalloc = ["rocksdb/jemalloc"] portable = ["rocksdb/portable"] sse = ["rocksdb/sse"] failpoints = ["fail/failpoints"] +testexport = [] # Disables runtime checks of invariants required by RocksDB that are redundant # with assertions inside RocksDB itself. This makes it possible to test those @@ -23,20 +25,20 @@ failpoints = ["fail/failpoints"] nortcheck = [] [dependencies] -api_version = { path = "../api_version", default-features = false } -case_macros = { path = "../case_macros" } -collections = { path = "../collections", default-features = false } +api_version = { workspace = true } +case_macros = { workspace = true } +collections = { workspace = true } derive_more = "0.99.3" -encryption = { path = "../encryption", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } +encryption = { workspace = true } +engine_traits = { workspace = true } fail = "0.5" -file_system = { path = "../file_system", default-features = false } -keys = { path = "../keys", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +file_system = { workspace = true } +keys = { workspace = true } +kvproto = { workspace = true } lazy_static = "1.4.0" -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } num_cpus = "1" -online_config = { path = "../online_config" } +online_config = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = "2" @@ -44,14 +46,15 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code regex = "1" serde = "1.0" serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = 
"d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } slog_derive = "0.2" tempfile = "3.0" -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } time = "0.1" -txn_types = { path = "../txn_types", default-features = false } +tracker = { workspace = true } +txn_types = { workspace = true } [dependencies.rocksdb] git = "https://github.com/tikv/rust-rocksdb.git" diff --git a/components/engine_rocks/src/cf_names.rs b/components/engine_rocks/src/cf_names.rs index b45a3960328..3b2512d0def 100644 --- a/components/engine_rocks/src/cf_names.rs +++ b/components/engine_rocks/src/cf_names.rs @@ -1,10 +1,10 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::CFNamesExt; +use engine_traits::CfNamesExt; use crate::engine::RocksEngine; -impl CFNamesExt for RocksEngine { +impl CfNamesExt for RocksEngine { fn cf_names(&self) -> Vec<&str> { self.as_inner().cf_names() } diff --git a/components/engine_rocks/src/cf_options.rs b/components/engine_rocks/src/cf_options.rs index 49ba840bc00..f2cc46d7a30 100644 --- a/components/engine_rocks/src/cf_options.rs +++ b/components/engine_rocks/src/cf_options.rs @@ -1,20 +1,22 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::{CFOptionsExt, ColumnFamilyOptions, Result, SstPartitionerFactory}; -use rocksdb::ColumnFamilyOptions as RawCFOptions; +use std::ops::{Deref, DerefMut}; + +use engine_traits::{CfOptions, CfOptionsExt, Result, SstPartitionerFactory}; +use rocksdb::ColumnFamilyOptions as RawCfOptions; use tikv_util::box_err; use crate::{ - db_options::RocksTitanDBOptions, engine::RocksEngine, + db_options::RocksTitanDbOptions, engine::RocksEngine, r2e, sst_partitioner::RocksSstPartitionerFactory, util, }; -impl CFOptionsExt for RocksEngine { - type ColumnFamilyOptions = RocksColumnFamilyOptions; +impl CfOptionsExt for RocksEngine { + type CfOptions = RocksCfOptions; - fn get_options_cf(&self, cf: &str) -> Result { + fn get_options_cf(&self, cf: &str) -> Result { let handle = util::get_cf_handle(self.as_inner(), cf)?; - Ok(RocksColumnFamilyOptions::from_raw( + Ok(RocksCfOptions::from_raw( self.as_inner().get_options_cf(handle), )) } @@ -27,40 +29,52 @@ impl CFOptionsExt for RocksEngine { } } -#[derive(Clone)] -pub struct RocksColumnFamilyOptions(RawCFOptions); +#[derive(Default, Clone)] +pub struct RocksCfOptions(RawCfOptions); -impl RocksColumnFamilyOptions { - pub fn from_raw(raw: RawCFOptions) -> RocksColumnFamilyOptions { - RocksColumnFamilyOptions(raw) +impl RocksCfOptions { + pub fn from_raw(raw: RawCfOptions) -> RocksCfOptions { + RocksCfOptions(raw) } - pub fn into_raw(self) -> RawCFOptions { + pub fn into_raw(self) -> RawCfOptions { self.0 } +} + +impl Deref for RocksCfOptions { + type Target = RawCfOptions; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.0 + } +} - pub fn as_raw_mut(&mut self) -> &mut RawCFOptions { +impl DerefMut for RocksCfOptions { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } } -impl ColumnFamilyOptions for RocksColumnFamilyOptions { - type TitanDBOptions = RocksTitanDBOptions; +impl CfOptions for RocksCfOptions { + type TitanCfOptions = RocksTitanDbOptions; fn new() -> Self { - 
RocksColumnFamilyOptions::from_raw(RawCFOptions::new()) + RocksCfOptions::from_raw(RawCfOptions::default()) } fn get_max_write_buffer_number(&self) -> u32 { self.0.get_max_write_buffer_number() } - fn get_level_zero_slowdown_writes_trigger(&self) -> u32 { - self.0.get_level_zero_slowdown_writes_trigger() + fn get_level_zero_slowdown_writes_trigger(&self) -> i32 { + self.0.get_level_zero_slowdown_writes_trigger() as i32 } - fn get_level_zero_stop_writes_trigger(&self) -> u32 { - self.0.get_level_zero_stop_writes_trigger() + fn get_level_zero_stop_writes_trigger(&self) -> i32 { + self.0.get_level_zero_stop_writes_trigger() as i32 } fn set_level_zero_file_num_compaction_trigger(&mut self, v: i32) { @@ -79,11 +93,11 @@ impl ColumnFamilyOptions for RocksColumnFamilyOptions { self.0.get_block_cache_capacity() } - fn set_block_cache_capacity(&self, capacity: u64) -> std::result::Result<(), String> { - self.0.set_block_cache_capacity(capacity) + fn set_block_cache_capacity(&self, capacity: u64) -> Result<()> { + self.0.set_block_cache_capacity(capacity).map_err(r2e) } - fn set_titandb_options(&mut self, opts: &Self::TitanDBOptions) { + fn set_titan_cf_options(&mut self, opts: &Self::TitanCfOptions) { self.0.set_titandb_options(opts.as_raw()) } diff --git a/components/engine_rocks/src/checkpoint.rs b/components/engine_rocks/src/checkpoint.rs new file mode 100644 index 00000000000..0f86aa29945 --- /dev/null +++ b/components/engine_rocks/src/checkpoint.rs @@ -0,0 +1,63 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::path::Path; + +use engine_traits::{Checkpointable, Checkpointer, Result}; + +use crate::{r2e, RocksEngine}; + +impl Checkpointable for RocksEngine { + type Checkpointer = RocksEngineCheckpointer; + + fn new_checkpointer(&self) -> Result { + match self.as_inner().new_checkpointer() { + Ok(pointer) => Ok(RocksEngineCheckpointer(pointer)), + Err(e) => Err(r2e(e)), + } + } + + fn merge(&self, dbs: &[&Self]) -> Result<()> { + let mut mopts = rocksdb::MergeInstanceOptions::default(); + mopts.merge_memtable = false; + mopts.allow_source_write = true; + let inner: Vec<_> = dbs.iter().map(|e| e.as_inner().as_ref()).collect(); + self.as_inner().merge_instances(&mopts, &inner).map_err(r2e) + } +} + +pub struct RocksEngineCheckpointer(rocksdb::Checkpointer); + +impl Checkpointer for RocksEngineCheckpointer { + fn create_at( + &mut self, + db_out_dir: &Path, + titan_out_dir: Option<&Path>, + log_size_for_flush: u64, + ) -> Result<()> { + self.0 + .create_at(db_out_dir, titan_out_dir, log_size_for_flush) + .map_err(|e| r2e(e)) + } +} + +#[cfg(test)] +mod tests { + use engine_traits::{Checkpointable, Checkpointer, Peekable, SyncMutable, ALL_CFS}; + use tempfile::tempdir; + + use crate::util::new_engine; + + #[test] + fn test_checkpoint() { + let dir = tempdir().unwrap(); + let path = dir.path().join("origin"); + let engine = new_engine(path.as_path().to_str().unwrap(), ALL_CFS).unwrap(); + engine.put(b"key", b"value").unwrap(); + + let mut check_pointer = engine.new_checkpointer().unwrap(); + let path2 = dir.path().join("checkpoint"); + check_pointer.create_at(path2.as_path(), None, 0).unwrap(); + let engine2 = new_engine(path2.as_path().to_str().unwrap(), ALL_CFS).unwrap(); + assert_eq!(engine2.get_value(b"key").unwrap().unwrap(), b"value"); + } +} diff --git a/components/engine_rocks/src/compact.rs b/components/engine_rocks/src/compact.rs index 05369015a1e..199b7d9f3be 100644 --- a/components/engine_rocks/src/compact.rs +++ b/components/engine_rocks/src/compact.rs 
@@ -2,10 +2,10 @@ use std::cmp; -use engine_traits::{CFNamesExt, CompactExt, Result}; +use engine_traits::{CfNamesExt, CompactExt, Result}; use rocksdb::{CompactOptions, CompactionOptions, DBCompressionType}; -use crate::{engine::RocksEngine, util}; +use crate::{engine::RocksEngine, r2e, util}; impl CompactExt for RocksEngine { type CompactedEvent = crate::compact_listener::RocksCompactedEvent; @@ -24,7 +24,7 @@ impl CompactExt for RocksEngine { Ok(false) } - fn compact_range( + fn compact_range_cf( &self, cf: &str, start_key: Option<&[u8]>, @@ -43,18 +43,6 @@ impl CompactExt for RocksEngine { Ok(()) } - fn compact_files_in_range( - &self, - start: Option<&[u8]>, - end: Option<&[u8]>, - output_level: Option, - ) -> Result<()> { - for cf_name in self.cf_names() { - self.compact_files_in_range_cf(cf_name, start, end, output_level)?; - } - Ok(()) - } - fn compact_files_in_range_cf( &self, cf: &str, @@ -130,23 +118,17 @@ impl CompactExt for RocksEngine { opts.set_max_subcompactions(max_subcompactions as i32); opts.set_output_file_size_limit(output_file_size_limit); - db.compact_files_cf(handle, &opts, &files, output_level)?; - Ok(()) + db.compact_files_cf(handle, &opts, &files, output_level) + .map_err(r2e) } } #[cfg(test)] mod tests { - use std::sync::Arc; - - use engine_traits::CompactExt; - use rocksdb::{ColumnFamilyOptions, Writable}; + use engine_traits::{CfNamesExt, CfOptionsExt, CompactExt, MiscExt, SyncMutable}; use tempfile::Builder; - use crate::{ - raw_util::{new_engine, CFOptions}, - Compat, - }; + use crate::{util, RocksCfOptions, RocksDbOptions}; #[test] fn test_compact_files_in_range() { @@ -155,29 +137,24 @@ mod tests { .tempdir() .unwrap(); - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = RocksCfOptions::default(); cf_opts.set_disable_auto_compactions(true); - let cfs_opts = vec![ - CFOptions::new("default", cf_opts.clone()), - CFOptions::new("test", cf_opts), - ]; - let db = new_engine( + let cfs_opts = vec![("default", 
cf_opts.clone()), ("test", cf_opts)]; + let db = util::new_engine_opt( temp_dir.path().to_str().unwrap(), - None, - &["default", "test"], - Some(cfs_opts), + RocksDbOptions::default(), + cfs_opts, ) .unwrap(); - let db = Arc::new(db); for cf_name in db.cf_names() { - let cf = db.cf_handle(cf_name).unwrap(); for i in 0..5 { - db.put_cf(cf, &[i], &[i]).unwrap(); - db.put_cf(cf, &[i + 1], &[i + 1]).unwrap(); - db.flush_cf(cf, true).unwrap(); + db.put_cf(cf_name, &[i], &[i]).unwrap(); + db.put_cf(cf_name, &[i + 1], &[i + 1]).unwrap(); + db.flush_cf(cf_name, true).unwrap(); } - let cf_meta = db.get_column_family_meta_data(cf); + let cf = util::get_cf_handle(db.as_inner(), cf_name).unwrap(); + let cf_meta = db.as_inner().get_column_family_meta_data(cf); let cf_levels = cf_meta.get_levels(); assert_eq!(cf_levels.first().unwrap().get_files().len(), 5); } @@ -187,13 +164,12 @@ mod tests { // # After // Level-0: [4-5] // Level-1: [0-4] - db.c() - .compact_files_in_range(None, Some(&[4]), Some(1)) + db.compact_files_in_range(None, Some(&[4]), Some(1)) .unwrap(); for cf_name in db.cf_names() { - let cf = db.cf_handle(cf_name).unwrap(); - let cf_meta = db.get_column_family_meta_data(cf); + let cf = util::get_cf_handle(db.as_inner(), cf_name).unwrap(); + let cf_meta = db.as_inner().get_column_family_meta_data(cf); let cf_levels = cf_meta.get_levels(); let level_0 = cf_levels[0].get_files(); assert_eq!(level_0.len(), 1); @@ -211,14 +187,13 @@ mod tests { // # After // Level-0: [4-5] // Level-N: [0-4] - db.c() - .compact_files_in_range(Some(&[2]), Some(&[4]), None) + db.compact_files_in_range(Some(&[2]), Some(&[4]), None) .unwrap(); for cf_name in db.cf_names() { - let cf = db.cf_handle(cf_name).unwrap(); - let cf_opts = db.get_options_cf(cf); - let cf_meta = db.get_column_family_meta_data(cf); + let cf = util::get_cf_handle(db.as_inner(), cf_name).unwrap(); + let cf_opts = db.get_options_cf(cf_name).unwrap(); + let cf_meta = db.as_inner().get_column_family_meta_data(cf); let 
cf_levels = cf_meta.get_levels(); let level_0 = cf_levels[0].get_files(); assert_eq!(level_0.len(), 1); @@ -229,26 +204,5 @@ mod tests { assert_eq!(level_n[0].get_smallestkey(), &[0]); assert_eq!(level_n[0].get_largestkey(), &[4]); } - - for cf_name in db.cf_names() { - let mut files = vec![]; - let cf = db.cf_handle(cf_name).unwrap(); - let cf_meta = db.get_column_family_meta_data(cf); - let cf_levels = cf_meta.get_levels(); - - for level in cf_levels.into_iter().rev() { - files.extend(level.get_files().iter().map(|f| f.get_name())); - } - - assert_eq!(files.len(), 2); - db.c() - .compact_files_cf(cf_name, files.clone(), Some(3), 0, true) - .unwrap(); - - let cf_meta = db.get_column_family_meta_data(cf); - let cf_levels = cf_meta.get_levels(); - assert_eq!(cf_levels[0].get_files().len(), 1); - assert_eq!(cf_levels[3].get_files().len(), 1); - } } } diff --git a/components/engine_rocks/src/compact_listener.rs b/components/engine_rocks/src/compact_listener.rs index 0affe70dd4b..e679410c8b9 100644 --- a/components/engine_rocks/src/compact_listener.rs +++ b/components/engine_rocks/src/compact_listener.rs @@ -7,6 +7,7 @@ use std::{ Bound::{Excluded, Included, Unbounded}, }, path::Path, + sync::Arc, }; use collections::hash_set_with_capacity; @@ -16,10 +17,7 @@ use rocksdb::{ }; use tikv_util::warn; -use crate::{ - properties::{RangeProperties, UserCollectedPropertiesDecoder}, - raw::EventListener, -}; +use crate::properties::{RangeProperties, UserCollectedPropertiesDecoder}; pub struct RocksCompactionJobInfo<'a>(&'a RawCompactionJobInfo); @@ -199,27 +197,36 @@ impl CompactedEvent for RocksCompactedEvent { } fn cf(&self) -> &str { - &*self.cf + &self.cf } } pub type Filter = fn(&RocksCompactionJobInfo<'_>) -> bool; +/// The trait for sending RocksCompactedEvent event +/// This is to workaround Box cannot be cloned +pub trait CompactedEventSender { + fn send(&self, event: RocksCompactedEvent); +} + pub struct CompactionListener { - ch: Box, + event_sender: Arc, filter: 
Option, } impl CompactionListener { pub fn new( - ch: Box, + event_sender: Arc, filter: Option, ) -> CompactionListener { - CompactionListener { ch, filter } + CompactionListener { + event_sender, + filter, + } } } -impl EventListener for CompactionListener { +impl rocksdb::EventListener for CompactionListener { fn on_compaction_completed(&self, info: &RawCompactionJobInfo) { let info = &RocksCompactionJobInfo::from_raw(info); if info.status().is_err() { @@ -288,7 +295,7 @@ impl EventListener for CompactionListener { return; } - (self.ch)(RocksCompactedEvent::new( + self.event_sender.send(RocksCompactedEvent::new( info, smallest_key.unwrap(), largest_key.unwrap(), diff --git a/components/engine_rocks/src/compat.rs b/components/engine_rocks/src/compat.rs deleted file mode 100644 index 96371fcf62b..00000000000 --- a/components/engine_rocks/src/compat.rs +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. - -use std::sync::Arc; - -use crate::{engine::RocksEngine, raw::DB}; - -/// A trait to enter the world of engine traits from a raw `Arc` -/// with as little syntax as possible. -/// -/// This will be used during the transition from RocksDB to the -/// `KvEngine` abstraction and then discarded. -pub trait Compat { - type Other; - - fn c(&self) -> &Self::Other; -} - -impl Compat for Arc { - type Other = RocksEngine; - - #[inline] - fn c(&self) -> &RocksEngine { - RocksEngine::from_ref(self) - } -} diff --git a/components/engine_rocks/src/config.rs b/components/engine_rocks/src/config.rs index 6442a5dab64..e121a1cea18 100644 --- a/components/engine_rocks/src/config.rs +++ b/components/engine_rocks/src/config.rs @@ -1,6 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::str::FromStr; +use std::{convert::TryFrom, str::FromStr}; use online_config::ConfigValue; use rocksdb::{ @@ -215,6 +215,120 @@ pub mod compression_type_serde { } } +pub mod checksum_serde { + use std::fmt; + + use rocksdb::ChecksumType; + use serde::{ + de::{Error, Unexpected, Visitor}, + Deserializer, Serializer, + }; + + pub fn serialize(t: &ChecksumType, serializer: S) -> Result + where + S: Serializer, + { + let name = match *t { + ChecksumType::NoChecksum => "no", + ChecksumType::CRC32c => "crc32c", + ChecksumType::XxHash => "xxhash", + ChecksumType::XxHash64 => "xxhash64", + ChecksumType::XXH3 => "xxh3", + }; + serializer.serialize_str(name) + } + + pub fn deserialize<'de, D>(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct StrVistor; + impl<'de> Visitor<'de> for StrVistor { + type Value = ChecksumType; + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(formatter, "a checksum type") + } + + fn visit_str(self, value: &str) -> Result + where + E: Error, + { + let str = match &*value.trim().to_lowercase() { + "no" => ChecksumType::NoChecksum, + "crc32c" => ChecksumType::CRC32c, + "xxhash" => ChecksumType::XxHash, + "xxhash64" => ChecksumType::XxHash64, + "xxh3" => ChecksumType::XXH3, + _ => { + return Err(E::invalid_value( + Unexpected::Other("invalid checksum type"), + &self, + )); + } + }; + Ok(str) + } + } + + deserializer.deserialize_str(StrVistor) + } +} + +pub mod prepopulate_block_cache_serde { + use std::fmt; + + use rocksdb::PrepopulateBlockCache; + use serde::{ + de::{Error, Unexpected, Visitor}, + Deserializer, Serializer, + }; + + pub fn serialize(t: &PrepopulateBlockCache, serializer: S) -> Result + where + S: Serializer, + { + let name = match *t { + PrepopulateBlockCache::Disabled => "disabled", + PrepopulateBlockCache::FlushOnly => "flush-only", + }; + serializer.serialize_str(name) + } + + pub fn deserialize<'de, D>(deserializer: D) -> Result + where + D: Deserializer<'de>, + { 
+ struct StrVistor; + impl<'de> Visitor<'de> for StrVistor { + type Value = PrepopulateBlockCache; + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(formatter, "a prepopulate block cache mode") + } + + fn visit_str(self, value: &str) -> Result + where + E: Error, + { + let str = match &*value.trim().to_lowercase() { + "disabled" => PrepopulateBlockCache::Disabled, + "flush-only" => PrepopulateBlockCache::FlushOnly, + _ => { + return Err(E::invalid_value( + Unexpected::Other("invalid prepopulate block cache mode"), + &self, + )); + } + }; + Ok(str) + } + } + + deserializer.deserialize_str(StrVistor) + } +} + #[derive(Copy, Clone, Debug, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "kebab-case")] pub enum BlobRunMode { @@ -225,21 +339,22 @@ pub enum BlobRunMode { impl From for ConfigValue { fn from(mode: BlobRunMode) -> ConfigValue { - ConfigValue::BlobRunMode(format!("k{:?}", mode)) + let str_value = match mode { + BlobRunMode::Normal => "normal", + BlobRunMode::ReadOnly => "read-only", + BlobRunMode::Fallback => "fallback", + }; + ConfigValue::String(str_value.into()) } } -impl From for BlobRunMode { - fn from(c: ConfigValue) -> BlobRunMode { - if let ConfigValue::BlobRunMode(s) = c { - match s.as_str() { - "kNormal" => BlobRunMode::Normal, - "kReadOnly" => BlobRunMode::ReadOnly, - "kFallback" => BlobRunMode::Fallback, - m => panic!("expect: kNormal, kReadOnly or kFallback, got: {:?}", m), - } +impl TryFrom for BlobRunMode { + type Error = String; + fn try_from(c: ConfigValue) -> Result { + if let ConfigValue::String(s) = c { + Self::from_str(&s) } else { - panic!("expect: ConfigValue::BlobRunMode, got: {:?}", c); + panic!("expect: ConfigValue::String, got: {:?}", c); } } } diff --git a/components/engine_rocks/src/db_options.rs b/components/engine_rocks/src/db_options.rs index 948ed469352..c9ef2cfda98 100644 --- a/components/engine_rocks/src/db_options.rs +++ b/components/engine_rocks/src/db_options.rs @@ -1,16 +1,18 
@@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::{DBOptions, DBOptionsExt, Result, TitanDBOptions}; +use std::ops::{Deref, DerefMut}; + +use engine_traits::{DbOptions, DbOptionsExt, Result, TitanCfOptions}; use rocksdb::{DBOptions as RawDBOptions, TitanDBOptions as RawTitanDBOptions}; use tikv_util::box_err; use crate::engine::RocksEngine; -impl DBOptionsExt for RocksEngine { - type DBOptions = RocksDBOptions; +impl DbOptionsExt for RocksEngine { + type DbOptions = RocksDbOptions; - fn get_db_options(&self) -> Self::DBOptions { - RocksDBOptions::from_raw(self.as_inner().get_db_options()) + fn get_db_options(&self) -> Self::DbOptions { + RocksDbOptions::from_raw(self.as_inner().get_db_options()) } fn set_db_options(&self, options: &[(&str, &str)]) -> Result<()> { self.as_inner() @@ -19,11 +21,12 @@ impl DBOptionsExt for RocksEngine { } } -pub struct RocksDBOptions(RawDBOptions); +#[derive(Default)] +pub struct RocksDbOptions(RawDBOptions); -impl RocksDBOptions { - pub fn from_raw(raw: RawDBOptions) -> RocksDBOptions { - RocksDBOptions(raw) +impl RocksDbOptions { + pub fn from_raw(raw: RawDBOptions) -> RocksDbOptions { + RocksDbOptions(raw) } pub fn into_raw(self) -> RawDBOptions { @@ -35,11 +38,27 @@ impl RocksDBOptions { } } -impl DBOptions for RocksDBOptions { - type TitanDBOptions = RocksTitanDBOptions; +impl Deref for RocksDbOptions { + type Target = RawDBOptions; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for RocksDbOptions { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl DbOptions for RocksDbOptions { + type TitanDbOptions = RocksTitanDbOptions; fn new() -> Self { - RocksDBOptions::from_raw(RawDBOptions::new()) + RocksDbOptions::from_raw(RawDBOptions::new()) } fn get_max_background_jobs(&self) -> i32 { @@ -47,35 +66,59 @@ impl DBOptions for RocksDBOptions { } fn get_rate_bytes_per_sec(&self) -> Option { - self.0.get_rate_bytes_per_sec() 
+ self.0.get_rate_limiter().map(|r| r.get_bytes_per_second()) } fn set_rate_bytes_per_sec(&mut self, rate_bytes_per_sec: i64) -> Result<()> { - self.0 - .set_rate_bytes_per_sec(rate_bytes_per_sec) - .map_err(|e| box_err!(e)) + if let Some(r) = self.0.get_rate_limiter() { + r.set_bytes_per_second(rate_bytes_per_sec); + } else { + return Err(box_err!("rate limiter not found")); + } + Ok(()) } fn get_rate_limiter_auto_tuned(&self) -> Option { - self.0.get_auto_tuned() + self.0.get_rate_limiter().map(|r| r.get_auto_tuned()) } fn set_rate_limiter_auto_tuned(&mut self, rate_limiter_auto_tuned: bool) -> Result<()> { - self.0 - .set_auto_tuned(rate_limiter_auto_tuned) - .map_err(|e| box_err!(e)) + if let Some(r) = self.0.get_rate_limiter() { + r.set_auto_tuned(rate_limiter_auto_tuned); + } else { + return Err(box_err!("rate limiter not found")); + } + Ok(()) } - fn set_titandb_options(&mut self, opts: &Self::TitanDBOptions) { + fn set_flush_size(&mut self, f: usize) -> Result<()> { + if let Some(m) = self.0.get_write_buffer_manager() { + m.set_flush_size(f); + } else { + return Err(box_err!("write buffer manager not found")); + } + Ok(()) + } + + fn set_flush_oldest_first(&mut self, f: bool) -> Result<()> { + if let Some(m) = self.0.get_write_buffer_manager() { + m.set_flush_oldest_first(f); + } else { + return Err(box_err!("write buffer manager not found")); + } + Ok(()) + } + + fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { self.0.set_titandb_options(opts.as_raw()) } } -pub struct RocksTitanDBOptions(RawTitanDBOptions); +pub struct RocksTitanDbOptions(RawTitanDBOptions); -impl RocksTitanDBOptions { - pub fn from_raw(raw: RawTitanDBOptions) -> RocksTitanDBOptions { - RocksTitanDBOptions(raw) +impl RocksTitanDbOptions { + pub fn from_raw(raw: RawTitanDBOptions) -> RocksTitanDbOptions { + RocksTitanDbOptions(raw) } pub fn as_raw(&self) -> &RawTitanDBOptions { @@ -83,9 +126,25 @@ impl RocksTitanDBOptions { } } -impl TitanDBOptions for RocksTitanDBOptions { 
+impl Deref for RocksTitanDbOptions { + type Target = RawTitanDBOptions; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for RocksTitanDbOptions { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl TitanCfOptions for RocksTitanDbOptions { fn new() -> Self { - RocksTitanDBOptions::from_raw(RawTitanDBOptions::new()) + RocksTitanDbOptions::from_raw(RawTitanDBOptions::new()) } fn set_min_blob_size(&mut self, size: u64) { diff --git a/components/engine_rocks/src/db_vector.rs b/components/engine_rocks/src/db_vector.rs index cf48bd8da0e..97fa65b7072 100644 --- a/components/engine_rocks/src/db_vector.rs +++ b/components/engine_rocks/src/db_vector.rs @@ -5,20 +5,20 @@ use std::{ ops::Deref, }; -use engine_traits::DBVector; +use engine_traits::DbVector; use rocksdb::DBVector as RawDBVector; -pub struct RocksDBVector(RawDBVector); +pub struct RocksDbVector(RawDBVector); -impl RocksDBVector { - pub fn from_raw(raw: RawDBVector) -> RocksDBVector { - RocksDBVector(raw) +impl RocksDbVector { + pub fn from_raw(raw: RawDBVector) -> RocksDbVector { + RocksDbVector(raw) } } -impl DBVector for RocksDBVector {} +impl DbVector for RocksDbVector {} -impl Deref for RocksDBVector { +impl Deref for RocksDbVector { type Target = [u8]; fn deref(&self) -> &[u8] { @@ -26,13 +26,13 @@ impl Deref for RocksDBVector { } } -impl Debug for RocksDBVector { +impl Debug for RocksDbVector { fn fmt(&self, formatter: &mut Formatter<'_>) -> fmt::Result { write!(formatter, "{:?}", &**self) } } -impl<'a> PartialEq<&'a [u8]> for RocksDBVector { +impl<'a> PartialEq<&'a [u8]> for RocksDbVector { fn eq(&self, rhs: &&[u8]) -> bool { **rhs == **self } diff --git a/components/engine_rocks/src/encryption.rs b/components/engine_rocks/src/encryption.rs index a8ec54673b3..3caf07a0276 100644 --- a/components/engine_rocks/src/encryption.rs +++ b/components/engine_rocks/src/encryption.rs @@ -9,19 +9,19 @@ use rocksdb::{ FileEncryptionInfo as 
DBFileEncryptionInfo, }; -use crate::raw::Env; +use crate::{r2e, raw::Env}; // Use engine::Env directly since Env is not abstracted. pub(crate) fn get_env( base_env: Option>, key_manager: Option>, -) -> std::result::Result, String> { +) -> engine_traits::Result> { let base_env = base_env.unwrap_or_else(|| Arc::new(Env::default())); if let Some(manager) = key_manager { - Ok(Arc::new(Env::new_key_managed_encrypted_env( - base_env, - WrappedEncryptionKeyManager { manager }, - )?)) + Ok(Arc::new( + Env::new_key_managed_encrypted_env(base_env, WrappedEncryptionKeyManager { manager }) + .map_err(r2e)?, + )) } else { Ok(base_env) } @@ -64,6 +64,7 @@ fn convert_encryption_method(input: EncryptionMethod) -> DBEncryptionMethod { EncryptionMethod::Aes128Ctr => DBEncryptionMethod::Aes128Ctr, EncryptionMethod::Aes192Ctr => DBEncryptionMethod::Aes192Ctr, EncryptionMethod::Aes256Ctr => DBEncryptionMethod::Aes256Ctr, + EncryptionMethod::Sm4Ctr => DBEncryptionMethod::Sm4Ctr, EncryptionMethod::Unknown => DBEncryptionMethod::Unknown, } } diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 32bd259f160..6c6231ca42f 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -1,44 +1,166 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{any::Any, fs, path::Path, sync::Arc}; +use std::{any::Any, sync::Arc}; -use engine_traits::{ - Error, IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SyncMutable, -}; +use engine_traits::{IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SyncMutable}; use rocksdb::{DBIterator, Writable, DB}; use crate::{ - db_vector::RocksDBVector, - options::RocksReadOptions, - rocks_metrics::{ - flush_engine_histogram_metrics, flush_engine_iostall_properties, flush_engine_properties, - flush_engine_ticker_metrics, - }, - rocks_metrics_defs::{ - ENGINE_HIST_TYPES, ENGINE_TICKER_TYPES, TITAN_ENGINE_HIST_TYPES, TITAN_ENGINE_TICKER_TYPES, - }, - util::get_cf_handle, + db_vector::RocksDbVector, options::RocksReadOptions, r2e, util::get_cf_handle, RocksEngineIterator, RocksSnapshot, }; +#[cfg(feature = "trace-lifetime")] +mod trace { + //! Trace tools for tablets. + //! + //! It's hard to know who is holding the rocksdb reference when trying to + //! debug why the tablet is not deleted. The module will record the + //! backtrace and thread name when the tablet is created or clone. So + //! after print all the backtrace, we can easily figure out who is + //! leaking the tablet. + //! + //! To use the feature, you need to compile tikv-server with + //! trace-tabelt-lifetime feature. For example, `env + //! ENABLE_FEATURES=trace-tablet-lifetime make release`. And then query the trace information by `curl http://ip:status_port/region/id?trace-tablet=1`. 
+ + use std::{ + backtrace::Backtrace, + collections::BTreeMap, + ops::Bound::Included, + sync::{ + atomic::{AtomicU64, Ordering}, + Mutex, + }, + }; + + use rocksdb::DB; + + static CNT: AtomicU64 = AtomicU64::new(0); + + fn inc_id() -> u64 { + CNT.fetch_add(1, Ordering::Relaxed) + } + + struct BacktraceInfo { + bt: Backtrace, + name: String, + } + + impl BacktraceInfo { + fn default() -> Self { + BacktraceInfo { + bt: Backtrace::force_capture(), + name: std::thread::current().name().unwrap_or("").to_string(), + } + } + } + + #[derive(PartialEq, PartialOrd, Eq, Ord, Clone, Copy, Default, Debug)] + struct TabletTraceKey { + region_id: u64, + suffix: u64, + addr: u64, + alloc_id: u64, + } + + lazy_static::lazy_static! { + static ref TABLET_TRACE: Mutex> = Mutex::new(BTreeMap::default()); + } + + pub fn list(id: u64) -> Vec { + let min = TabletTraceKey { + region_id: id, + suffix: 0, + addr: 0, + alloc_id: 0, + }; + let max = TabletTraceKey { + region_id: id, + suffix: u64::MAX, + addr: u64::MAX, + alloc_id: u64::MAX, + }; + let traces = TABLET_TRACE.lock().unwrap(); + traces + .range((Included(min), Included(max))) + .map(|(k, v)| { + format!( + "{}_{} {} {} {}", + k.region_id, k.suffix, k.addr, v.name, v.bt + ) + }) + .collect() + } + + #[derive(Debug)] + pub struct TabletTraceId(TabletTraceKey); + + impl TabletTraceId { + pub fn new(path: &str, db: &DB) -> Self { + let mut name = path.split('/'); + let name = name.next_back().unwrap(); + let parts: Vec<_> = name.split('_').collect(); + if parts.len() == 2 { + let id: u64 = parts[0].parse().unwrap(); + let suffix: u64 = parts[1].parse().unwrap(); + let bt = BacktraceInfo::default(); + let key = TabletTraceKey { + region_id: id, + suffix, + addr: db as *const _ as u64, + alloc_id: inc_id(), + }; + TABLET_TRACE.lock().unwrap().insert(key, bt); + Self(key) + } else { + Self(Default::default()) + } + } + } + + impl Clone for TabletTraceId { + fn clone(&self) -> Self { + if self.0.region_id != 0 { + let bt = 
BacktraceInfo::default(); + let mut key = self.0; + key.alloc_id = inc_id(); + TABLET_TRACE.lock().unwrap().insert(key, bt); + Self(key) + } else { + Self(self.0) + } + } + } + + impl Drop for TabletTraceId { + fn drop(&mut self) { + if self.0.region_id != 0 { + TABLET_TRACE.lock().unwrap().remove(&self.0); + } + } + } +} + #[derive(Clone, Debug)] pub struct RocksEngine { db: Arc, - shared_block_cache: bool, + support_multi_batch_write: bool, + #[cfg(feature = "trace-lifetime")] + _id: trace::TabletTraceId, } impl RocksEngine { - pub fn from_db(db: Arc) -> Self { + pub fn new(db: DB) -> RocksEngine { + let db = Arc::new(db); RocksEngine { + support_multi_batch_write: db.get_db_options().is_enable_multi_batch_write(), + #[cfg(feature = "trace-lifetime")] + _id: trace::TabletTraceId::new(db.path(), &db), db, - shared_block_cache: false, } } - pub fn from_ref(db: &Arc) -> &Self { - unsafe { &*(db as *const Arc as *const RocksEngine) } - } - pub fn as_inner(&self) -> &Arc { &self.db } @@ -47,20 +169,13 @@ impl RocksEngine { self.db.clone() } - pub fn exists(path: &str) -> bool { - let path = Path::new(path); - if !path.exists() || !path.is_dir() { - return false; - } - - // If path is not an empty directory, we say db exists. If path is not an empty directory - // but db has not been created, `DB::list_column_families` fails and we can clean up - // the directory by this indication. 
- fs::read_dir(&path).unwrap().next().is_some() + pub fn support_multi_batch_write(&self) -> bool { + self.support_multi_batch_write } - pub fn set_shared_block_cache(&mut self, enable: bool) { - self.shared_block_cache = enable; + #[cfg(feature = "trace-lifetime")] + pub fn trace(region_id: u64) -> Vec { + trace::list(region_id) } } @@ -72,56 +187,24 @@ impl KvEngine for RocksEngine { } fn sync(&self) -> Result<()> { - self.db.sync_wal().map_err(Error::Engine) - } - - fn flush_metrics(&self, instance: &str) { - for t in ENGINE_TICKER_TYPES { - let v = self.db.get_and_reset_statistics_ticker_count(*t); - flush_engine_ticker_metrics(*t, v, instance); - } - for t in ENGINE_HIST_TYPES { - if let Some(v) = self.db.get_statistics_histogram(*t) { - flush_engine_histogram_metrics(*t, v, instance); - } - } - if self.db.is_titan() { - for t in TITAN_ENGINE_TICKER_TYPES { - let v = self.db.get_and_reset_statistics_ticker_count(*t); - flush_engine_ticker_metrics(*t, v, instance); - } - for t in TITAN_ENGINE_HIST_TYPES { - if let Some(v) = self.db.get_statistics_histogram(*t) { - flush_engine_histogram_metrics(*t, v, instance); - } - } - } - flush_engine_properties(&self.db, instance, self.shared_block_cache); - flush_engine_iostall_properties(&self.db, instance); - } - - fn reset_statistics(&self) { - self.db.reset_statistics(); + self.db.sync_wal().map_err(r2e) } fn bad_downcast(&self) -> &T { let e: &dyn Any = &self.db; e.downcast_ref().expect("bad engine downcast") } + + #[cfg(any(test, feature = "testexport"))] + fn inner_refcount(&self) -> usize { + Arc::strong_count(&self.db) + } } impl Iterable for RocksEngine { type Iterator = RocksEngineIterator; - fn iterator_opt(&self, opts: IterOptions) -> Result { - let opt: RocksReadOptions = opts.into(); - Ok(RocksEngineIterator::from_raw(DBIterator::new( - self.db.clone(), - opt.into_raw(), - ))) - } - - fn iterator_cf_opt(&self, cf: &str, opts: IterOptions) -> Result { + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> 
Result { let handle = get_cf_handle(&self.db, cf)?; let opt: RocksReadOptions = opts.into(); Ok(RocksEngineIterator::from_raw(DBIterator::new_cf( @@ -133,12 +216,12 @@ impl Iterable for RocksEngine { } impl Peekable for RocksEngine { - type DBVector = RocksDBVector; + type DbVector = RocksDbVector; - fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { + fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { let opt: RocksReadOptions = opts.into(); - let v = self.db.get_opt(key, &opt.into_raw())?; - Ok(v.map(RocksDBVector::from_raw)) + let v = self.db.get_opt(key, &opt.into_raw()).map_err(r2e)?; + Ok(v.map(RocksDbVector::from_raw)) } fn get_value_cf_opt( @@ -146,64 +229,61 @@ impl Peekable for RocksEngine { opts: &ReadOptions, cf: &str, key: &[u8], - ) -> Result> { + ) -> Result> { let opt: RocksReadOptions = opts.into(); let handle = get_cf_handle(&self.db, cf)?; - let v = self.db.get_cf_opt(handle, key, &opt.into_raw())?; - Ok(v.map(RocksDBVector::from_raw)) + let v = self + .db + .get_cf_opt(handle, key, &opt.into_raw()) + .map_err(r2e)?; + Ok(v.map(RocksDbVector::from_raw)) } } impl SyncMutable for RocksEngine { fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { - self.db.put(key, value).map_err(Error::Engine) + self.db.put(key, value).map_err(r2e) } fn put_cf(&self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { let handle = get_cf_handle(&self.db, cf)?; - self.db.put_cf(handle, key, value).map_err(Error::Engine) + self.db.put_cf(handle, key, value).map_err(r2e) } fn delete(&self, key: &[u8]) -> Result<()> { - self.db.delete(key).map_err(Error::Engine) + self.db.delete(key).map_err(r2e) } fn delete_cf(&self, cf: &str, key: &[u8]) -> Result<()> { let handle = get_cf_handle(&self.db, cf)?; - self.db.delete_cf(handle, key).map_err(Error::Engine) + self.db.delete_cf(handle, key).map_err(r2e) } fn delete_range(&self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { - self.db - .delete_range(begin_key, end_key) - 
.map_err(Error::Engine) + self.db.delete_range(begin_key, end_key).map_err(r2e) } fn delete_range_cf(&self, cf: &str, begin_key: &[u8], end_key: &[u8]) -> Result<()> { let handle = get_cf_handle(&self.db, cf)?; self.db .delete_range_cf(handle, begin_key, end_key) - .map_err(Error::Engine) + .map_err(r2e) } } #[cfg(test)] mod tests { - use std::sync::Arc; - - use engine_traits::{Iterable, KvEngine, Peekable, SyncMutable}; + use engine_traits::{Iterable, KvEngine, Peekable, SyncMutable, CF_DEFAULT}; use kvproto::metapb::Region; use tempfile::Builder; - use crate::{raw_util, RocksEngine, RocksSnapshot}; + use crate::{util, RocksSnapshot}; #[test] fn test_base() { let path = Builder::new().prefix("var").tempdir().unwrap(); let cf = "cf"; - let engine = RocksEngine::from_db(Arc::new( - raw_util::new_engine(path.path().to_str().unwrap(), None, &[cf], None).unwrap(), - )); + let engine = util::new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT, cf]).unwrap(); let mut r = Region::default(); r.set_id(10); @@ -238,15 +318,13 @@ mod tests { fn test_peekable() { let path = Builder::new().prefix("var").tempdir().unwrap(); let cf = "cf"; - let engine = RocksEngine::from_db(Arc::new( - raw_util::new_engine(path.path().to_str().unwrap(), None, &[cf], None).unwrap(), - )); + let engine = util::new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT, cf]).unwrap(); engine.put(b"k1", b"v1").unwrap(); engine.put_cf(cf, b"k1", b"v2").unwrap(); assert_eq!(&*engine.get_value(b"k1").unwrap().unwrap(), b"v1"); - assert!(engine.get_value_cf("foo", b"k1").is_err()); + engine.get_value_cf("foo", b"k1").unwrap_err(); assert_eq!(&*engine.get_value_cf(cf, b"k1").unwrap().unwrap(), b"v2"); } @@ -254,9 +332,7 @@ mod tests { fn test_scan() { let path = Builder::new().prefix("var").tempdir().unwrap(); let cf = "cf"; - let engine = RocksEngine::from_db(Arc::new( - raw_util::new_engine(path.path().to_str().unwrap(), None, &[cf], None).unwrap(), - )); + let engine = 
util::new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT, cf]).unwrap(); engine.put(b"a1", b"v1").unwrap(); engine.put(b"a2", b"v2").unwrap(); @@ -265,7 +341,7 @@ mod tests { let mut data = vec![]; engine - .scan(b"", &[0xFF, 0xFF], false, |key, value| { + .scan(CF_DEFAULT, b"", &[0xFF, 0xFF], false, |key, value| { data.push((key.to_vec(), value.to_vec())); Ok(true) }) @@ -280,7 +356,7 @@ mod tests { data.clear(); engine - .scan_cf(cf, b"", &[0xFF, 0xFF], false, |key, value| { + .scan(cf, b"", &[0xFF, 0xFF], false, |key, value| { data.push((key.to_vec(), value.to_vec())); Ok(true) }) @@ -294,16 +370,16 @@ mod tests { ); data.clear(); - let pair = engine.seek(b"a1").unwrap().unwrap(); + let pair = engine.seek(CF_DEFAULT, b"a1").unwrap().unwrap(); assert_eq!(pair, (b"a1".to_vec(), b"v1".to_vec())); - assert!(engine.seek(b"a3").unwrap().is_none()); - let pair_cf = engine.seek_cf(cf, b"a1").unwrap().unwrap(); + assert!(engine.seek(CF_DEFAULT, b"a3").unwrap().is_none()); + let pair_cf = engine.seek(cf, b"a1").unwrap().unwrap(); assert_eq!(pair_cf, (b"a1".to_vec(), b"v1".to_vec())); - assert!(engine.seek_cf(cf, b"a3").unwrap().is_none()); + assert!(engine.seek(cf, b"a3").unwrap().is_none()); let mut index = 0; engine - .scan(b"", &[0xFF, 0xFF], false, |key, value| { + .scan(CF_DEFAULT, b"", &[0xFF, 0xFF], false, |key, value| { data.push((key.to_vec(), value.to_vec())); index += 1; Ok(index != 1) @@ -315,15 +391,15 @@ mod tests { let snap = RocksSnapshot::new(engine.get_sync_db()); engine.put(b"a3", b"v3").unwrap(); - assert!(engine.seek(b"a3").unwrap().is_some()); + assert!(engine.seek(CF_DEFAULT, b"a3").unwrap().is_some()); - let pair = snap.seek(b"a1").unwrap().unwrap(); + let pair = snap.seek(CF_DEFAULT, b"a1").unwrap().unwrap(); assert_eq!(pair, (b"a1".to_vec(), b"v1".to_vec())); - assert!(snap.seek(b"a3").unwrap().is_none()); + assert!(snap.seek(CF_DEFAULT, b"a3").unwrap().is_none()); data.clear(); - snap.scan(b"", &[0xFF, 0xFF], false, |key, value| { + 
snap.scan(CF_DEFAULT, b"", &[0xFF, 0xFF], false, |key, value| { data.push((key.to_vec(), value.to_vec())); Ok(true) }) diff --git a/components/engine_rocks/src/engine_iterator.rs b/components/engine_rocks/src/engine_iterator.rs index fcc10237510..de51b32c8f4 100644 --- a/components/engine_rocks/src/engine_iterator.rs +++ b/components/engine_rocks/src/engine_iterator.rs @@ -2,8 +2,10 @@ use std::sync::Arc; -use engine_traits::{self, Error, Result}; -use rocksdb::{DBIterator, SeekKey as RawSeekKey, DB}; +use engine_traits::{self, Result}; +use rocksdb::{DBIterator, DB}; + +use crate::r2e; // FIXME: Would prefer using &DB instead of Arc. As elsewhere in // this crate, it would require generic associated types. @@ -20,30 +22,38 @@ impl RocksEngineIterator { } impl engine_traits::Iterator for RocksEngineIterator { - fn seek(&mut self, key: engine_traits::SeekKey<'_>) -> Result { - let k: RocksSeekKey<'_> = key.into(); - self.0.seek(k.into_raw()).map_err(Error::Engine) + fn seek(&mut self, key: &[u8]) -> Result { + self.0.seek(rocksdb::SeekKey::Key(key)).map_err(r2e) + } + + fn seek_for_prev(&mut self, key: &[u8]) -> Result { + self.0 + .seek_for_prev(rocksdb::SeekKey::Key(key)) + .map_err(r2e) } - fn seek_for_prev(&mut self, key: engine_traits::SeekKey<'_>) -> Result { - let k: RocksSeekKey<'_> = key.into(); - self.0.seek_for_prev(k.into_raw()).map_err(Error::Engine) + fn seek_to_first(&mut self) -> Result { + self.0.seek(rocksdb::SeekKey::Start).map_err(r2e) + } + + fn seek_to_last(&mut self) -> Result { + self.0.seek(rocksdb::SeekKey::End).map_err(r2e) } fn prev(&mut self) -> Result { #[cfg(not(feature = "nortcheck"))] if !self.valid()? { - return Err(Error::Engine("Iterator invalid".to_string())); + return Err(r2e("Iterator invalid")); } - self.0.prev().map_err(Error::Engine) + self.0.prev().map_err(r2e) } fn next(&mut self) -> Result { #[cfg(not(feature = "nortcheck"))] if !self.valid()? 
{ - return Err(Error::Engine("Iterator invalid".to_string())); + return Err(r2e("Iterator invalid")); } - self.0.next().map_err(Error::Engine) + self.0.next().map_err(r2e) } fn key(&self) -> &[u8] { @@ -59,25 +69,6 @@ impl engine_traits::Iterator for RocksEngineIterator { } fn valid(&self) -> Result { - self.0.valid().map_err(Error::Engine) - } -} - -pub struct RocksSeekKey<'a>(RawSeekKey<'a>); - -impl<'a> RocksSeekKey<'a> { - pub fn into_raw(self) -> RawSeekKey<'a> { - self.0 - } -} - -impl<'a> From> for RocksSeekKey<'a> { - fn from(key: engine_traits::SeekKey<'a>) -> Self { - let k = match key { - engine_traits::SeekKey::Start => RawSeekKey::Start, - engine_traits::SeekKey::End => RawSeekKey::End, - engine_traits::SeekKey::Key(k) => RawSeekKey::Key(k), - }; - RocksSeekKey(k) + self.0.valid().map_err(r2e) } } diff --git a/components/engine_rocks/src/event_listener.rs b/components/engine_rocks/src/event_listener.rs index 86b8e4fdcae..1cbef379e3c 100644 --- a/components/engine_rocks/src/event_listener.rs +++ b/components/engine_rocks/src/event_listener.rs @@ -1,10 +1,11 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use file_system::{get_io_type, set_io_type, IOType}; +use engine_traits::PersistenceListener; +use file_system::{get_io_type, set_io_type, IoType}; use regex::Regex; use rocksdb::{ - CompactionJobInfo, DBBackgroundErrorReason, FlushJobInfo, IngestionInfo, MutableStatus, - SubcompactionJobInfo, WriteStallInfo, + CompactionJobInfo, DBBackgroundErrorReason, FlushJobInfo, IngestionInfo, MemTableInfo, + MutableStatus, SubcompactionJobInfo, WriteStallInfo, }; use tikv_util::{error, metrics::CRITICAL_ERROR, set_panic_mark, warn, worker::Scheduler}; @@ -32,23 +33,23 @@ impl RocksEventListener { impl rocksdb::EventListener for RocksEventListener { fn on_flush_begin(&self, _info: &FlushJobInfo) { - set_io_type(IOType::Flush); + set_io_type(IoType::Flush); } fn on_flush_completed(&self, info: &FlushJobInfo) { STORE_ENGINE_EVENT_COUNTER_VEC .with_label_values(&[&self.db_name, info.cf_name(), "flush"]) .inc(); - if get_io_type() == IOType::Flush { - set_io_type(IOType::Other); + if get_io_type() == IoType::Flush { + set_io_type(IoType::Other); } } fn on_compaction_begin(&self, info: &CompactionJobInfo) { if info.base_input_level() == 0 { - set_io_type(IOType::LevelZeroCompaction); + set_io_type(IoType::LevelZeroCompaction); } else { - set_io_type(IOType::Compaction); + set_io_type(IoType::Compaction); } } @@ -69,26 +70,26 @@ impl rocksdb::EventListener for RocksEventListener { &info.compaction_reason().to_string(), ]) .inc(); - if info.base_input_level() == 0 && get_io_type() == IOType::LevelZeroCompaction - || info.base_input_level() != 0 && get_io_type() == IOType::Compaction + if info.base_input_level() == 0 && get_io_type() == IoType::LevelZeroCompaction + || info.base_input_level() != 0 && get_io_type() == IoType::Compaction { - set_io_type(IOType::Other); + set_io_type(IoType::Other); } } fn on_subcompaction_begin(&self, info: &SubcompactionJobInfo) { if info.base_input_level() == 0 { - set_io_type(IOType::LevelZeroCompaction); + set_io_type(IoType::LevelZeroCompaction); 
} else { - set_io_type(IOType::Compaction); + set_io_type(IoType::Compaction); } } fn on_subcompaction_completed(&self, info: &SubcompactionJobInfo) { - if info.base_input_level() == 0 && get_io_type() == IOType::LevelZeroCompaction - || info.base_input_level() != 0 && get_io_type() == IOType::Compaction + if info.base_input_level() == 0 && get_io_type() == IoType::LevelZeroCompaction + || info.base_input_level() != 0 && get_io_type() == IoType::Compaction { - set_io_type(IOType::Other); + set_io_type(IoType::Other); } } @@ -162,8 +163,10 @@ impl rocksdb::EventListener for RocksEventListener { } // Here are some expected error examples: +// ```text // 1. Corruption: Sst file size mismatch: /qps/data/tikv-10014/db/000398.sst. Size recorded in manifest 6975, actual size 6959 // 2. Corruption: Bad table magic number: expected 9863518390377041911, found 759105309091689679 in /qps/data/tikv-10014/db/000021.sst +// ``` // // We assume that only the corruption sst file path is printed inside error. fn resolve_sst_filename_from_err(err: &str) -> Option { @@ -176,9 +179,54 @@ fn resolve_sst_filename_from_err(err: &str) -> Option { Some(filename) } +pub struct RocksPersistenceListener(PersistenceListener); + +impl RocksPersistenceListener { + pub fn new(listener: PersistenceListener) -> RocksPersistenceListener { + RocksPersistenceListener(listener) + } +} + +impl rocksdb::EventListener for RocksPersistenceListener { + fn on_memtable_sealed(&self, info: &MemTableInfo) { + // Note: first_seqno is effectively the smallest seqno of memtable. + // earliest_seqno has ambiguous semantics. 
+ self.0 + .on_memtable_sealed(info.cf_name().to_string(), info.first_seqno()); + } + + fn on_flush_completed(&self, job: &FlushJobInfo) { + let num = match job + .file_path() + .file_prefix() + .and_then(|n| n.to_str()) + .map(|n| n.parse()) + { + Some(Ok(n)) => n, + _ => { + slog_global::error!("failed to parse file number"; "path" => job.file_path().display()); + 0 + } + }; + self.0 + .on_flush_completed(job.cf_name(), job.largest_seqno(), num); + } +} + #[cfg(test)] mod tests { + use std::sync::{ + mpsc::{self, Sender}, + Arc, Mutex, + }; + + use engine_traits::{ + ApplyProgress, FlushState, MiscExt, StateStorage, SyncMutable, CF_DEFAULT, DATA_CFS, + }; + use tempfile::Builder; + use super::*; + use crate::{util, RocksCfOptions, RocksDbOptions}; #[test] fn test_resolve_sst_filename() { @@ -186,4 +234,139 @@ mod tests { let filename = resolve_sst_filename_from_err(err).unwrap(); assert_eq!(filename, "/000398.sst"); } + + type Record = (u64, u64, ApplyProgress); + + #[derive(Default)] + struct MemStorage { + records: Mutex>, + } + + impl StateStorage for MemStorage { + fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: ApplyProgress) { + self.records + .lock() + .unwrap() + .push((region_id, tablet_index, pr)); + } + } + + struct FlushTrack { + sealed: Mutex>, + block_flush: Arc>, + } + + impl rocksdb::EventListener for FlushTrack { + fn on_memtable_sealed(&self, _: &MemTableInfo) { + let _ = self.sealed.lock().unwrap().send(()); + } + + fn on_flush_begin(&self, _: &FlushJobInfo) { + drop(self.block_flush.lock().unwrap()) + } + } + + #[test] + fn test_persistence_listener() { + let temp_dir = Builder::new() + .prefix("test_persistence_listener") + .tempdir() + .unwrap(); + let (region_id, tablet_index) = (2, 3); + + let storage = Arc::new(MemStorage::default()); + let state = Arc::new(FlushState::new(0)); + let listener = + PersistenceListener::new(region_id, tablet_index, state.clone(), storage.clone()); + let mut db_opt = 
RocksDbOptions::default(); + db_opt.add_event_listener(RocksPersistenceListener::new(listener)); + let (tx, rx) = mpsc::channel(); + let block_flush = Arc::new(Mutex::new(())); + db_opt.add_event_listener(FlushTrack { + sealed: Mutex::new(tx), + block_flush: block_flush.clone(), + }); + + let mut cf_opts: Vec<_> = DATA_CFS + .iter() + .map(|cf| (*cf, RocksCfOptions::default())) + .collect(); + cf_opts[0].1.set_max_write_buffer_number(4); + cf_opts[0].1.set_min_write_buffer_number_to_merge(2); + cf_opts[0].1.set_write_buffer_size(1024); + cf_opts[0].1.set_disable_auto_compactions(true); + let db = util::new_engine_opt(temp_dir.path().to_str().unwrap(), db_opt, cf_opts).unwrap(); + db.flush_cf(CF_DEFAULT, true).unwrap(); + let sst_count = || { + std::fs::read_dir(temp_dir.path()) + .unwrap() + .filter(|p| { + let p = match p { + Ok(p) => p, + Err(_) => return false, + }; + p.path().extension().map_or(false, |ext| ext == "sst") + }) + .count() + }; + // Although flush is triggered, but there is nothing to flush. + assert_eq!(sst_count(), 0); + assert_eq!(storage.records.lock().unwrap().len(), 0); + + // Flush one key should work. + state.set_applied_index(2); + db.put_cf(CF_DEFAULT, b"k0", b"v0").unwrap(); + db.flush_cf(CF_DEFAULT, true).unwrap(); + assert_eq!(sst_count(), 1); + let record = storage.records.lock().unwrap().pop().unwrap(); + assert_eq!(storage.records.lock().unwrap().len(), 0); + assert_eq!(record.0, region_id); + assert_eq!(record.1, tablet_index); + assert_eq!(record.2.applied_index(), 2); + + // When puts and deletes are mixed, the puts may be deleted during flush. 
+ state.set_applied_index(3); + db.put_cf(CF_DEFAULT, b"k0", b"v0").unwrap(); + db.delete_cf(CF_DEFAULT, b"k0").unwrap(); + db.delete_cf(CF_DEFAULT, b"k1").unwrap(); + db.put_cf(CF_DEFAULT, b"k1", b"v1").unwrap(); + db.flush_cf(CF_DEFAULT, true).unwrap(); + assert_eq!(sst_count(), 2); + let record = storage.records.lock().unwrap().pop().unwrap(); + assert_eq!(storage.records.lock().unwrap().len(), 0); + assert_eq!(record.0, region_id); + assert_eq!(record.1, tablet_index); + assert_eq!(record.2.applied_index(), 3); + // Detail check of `FlushProgress` will be done in raftstore-v2 tests. + + // Drain all the events. + while rx.try_recv().is_ok() {} + state.set_applied_index(4); + let block = block_flush.lock(); + // Seal twice to trigger flush. Seal third to make a seqno conflict, in + // which case flush largest seqno will be equal to seal earliest seqno. + let mut key_count = 2; + for i in 0..3 { + while rx.try_recv().is_err() { + db.put(format!("k{key_count}").as_bytes(), &[0; 512]) + .unwrap(); + key_count += 1; + } + state.set_applied_index(5 + i); + } + drop(block); + // Memtable is seal before put, so there must be still one KV in memtable. + db.flush_cf(CF_DEFAULT, true).unwrap(); + rx.try_recv().unwrap(); + // There is 2 sst before this round, and then 4 are merged into 2, so there + // should be 4 ssts. + assert_eq!(sst_count(), 4); + let records = storage.records.lock().unwrap(); + // Although it seals 4 times, but only create 2 SSTs, so only 2 records. + assert_eq!(records.len(), 2); + // The indexes of two merged flush state are 4 and 5, so merged value is 5. + assert_eq!(records[0].2.applied_index(), 5); + // The last two flush state is 6 and 7. 
+ assert_eq!(records[1].2.applied_index(), 7); + } } diff --git a/components/engine_rocks/src/file_system.rs b/components/engine_rocks/src/file_system.rs index a9eebc161af..b470237f313 100644 --- a/components/engine_rocks/src/file_system.rs +++ b/components/engine_rocks/src/file_system.rs @@ -5,20 +5,23 @@ use std::sync::Arc; use engine_traits::{EngineFileSystemInspector, FileSystemInspector}; use rocksdb::FileSystemInspector as DBFileSystemInspector; -use crate::raw::Env; +use crate::{e2r, r2e, raw::Env}; // Use engine::Env directly since Env is not abstracted. pub(crate) fn get_env( base_env: Option>, - limiter: Option>, -) -> Result, String> { + limiter: Option>, +) -> engine_traits::Result> { let base_env = base_env.unwrap_or_else(|| Arc::new(Env::default())); - Ok(Arc::new(Env::new_file_system_inspected_env( - base_env, - WrappedFileSystemInspector { - inspector: EngineFileSystemInspector::from_limiter(limiter), - }, - )?)) + Ok(Arc::new( + Env::new_file_system_inspected_env( + base_env, + WrappedFileSystemInspector { + inspector: EngineFileSystemInspector::from_limiter(limiter), + }, + ) + .map_err(r2e)?, + )) } pub struct WrappedFileSystemInspector { @@ -27,11 +30,11 @@ pub struct WrappedFileSystemInspector { impl DBFileSystemInspector for WrappedFileSystemInspector { fn read(&self, len: usize) -> Result { - self.inspector.read(len) + self.inspector.read(len).map_err(e2r) } fn write(&self, len: usize) -> Result { - self.inspector.write(len) + self.inspector.write(len).map_err(e2r) } } @@ -39,38 +42,35 @@ impl DBFileSystemInspector for WrappedFileSystemInspecto mod tests { use std::sync::Arc; - use engine_traits::{CompactExt, CF_DEFAULT}; - use file_system::{IOOp, IORateLimiter, IORateLimiterStatistics, IOType}; + use engine_traits::{CompactExt, MiscExt, SyncMutable, CF_DEFAULT}; + use file_system::{IoOp, IoRateLimiter, IoRateLimiterStatistics, IoType}; use keys::data_key; - use rocksdb::{DBOptions, Writable, DB}; use tempfile::Builder; use super::*; use 
crate::{ - compat::Compat, - event_listener::RocksEventListener, - raw::{ColumnFamilyOptions, DBCompressionType}, - raw_util::{new_engine_opt, CFOptions}, + event_listener::RocksEventListener, raw::DBCompressionType, util::new_engine_opt, + RocksCfOptions, RocksDbOptions, RocksEngine, }; - fn new_test_db(dir: &str) -> (Arc, Arc) { - let limiter = Arc::new(IORateLimiter::new_for_test()); - let mut db_opts = DBOptions::new(); + fn new_test_db(dir: &str) -> (RocksEngine, Arc) { + let limiter = Arc::new(IoRateLimiter::new_for_test()); + let mut db_opts = RocksDbOptions::default(); db_opts.add_event_listener(RocksEventListener::new("test_db", None)); let env = get_env(None, Some(limiter.clone())).unwrap(); db_opts.set_env(env); - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = RocksCfOptions::default(); cf_opts.set_disable_auto_compactions(true); cf_opts.compression_per_level(&[DBCompressionType::No; 7]); - let db = Arc::new( - new_engine_opt(dir, db_opts, vec![CFOptions::new(CF_DEFAULT, cf_opts)]).unwrap(), - ); + let db = new_engine_opt(dir, db_opts, vec![(CF_DEFAULT, cf_opts)]).unwrap(); (db, limiter.statistics().unwrap()) } #[test] fn test_inspected_compact() { - let value_size = 1024; + // NOTICE: Specific to RocksDB version. 
+ let amplification_bytes = 2560; + let value_size = amplification_bytes * 2; let temp_dir = Builder::new() .prefix("test_inspected_compact") .tempdir() @@ -81,27 +81,33 @@ mod tests { db.put(&data_key(b"a1"), &value).unwrap(); db.put(&data_key(b"a2"), &value).unwrap(); - db.flush(true /*sync*/).unwrap(); - assert!(stats.fetch(IOType::Flush, IOOp::Write) > value_size * 2); - assert!(stats.fetch(IOType::Flush, IOOp::Write) < value_size * 3); + assert_eq!(stats.fetch(IoType::Flush, IoOp::Write), 0); + db.flush_cfs(&[], true /* wait */).unwrap(); + assert!(stats.fetch(IoType::Flush, IoOp::Write) > value_size * 2); + assert!(stats.fetch(IoType::Flush, IoOp::Write) < value_size * 2 + amplification_bytes); stats.reset(); db.put(&data_key(b"a2"), &value).unwrap(); db.put(&data_key(b"a3"), &value).unwrap(); - db.flush(true /*sync*/).unwrap(); - assert!(stats.fetch(IOType::Flush, IOOp::Write) > value_size * 2); - assert!(stats.fetch(IOType::Flush, IOOp::Write) < value_size * 3); + db.flush_cfs(&[], true /* wait */).unwrap(); + assert!(stats.fetch(IoType::Flush, IoOp::Write) > value_size * 2); + assert!(stats.fetch(IoType::Flush, IoOp::Write) < value_size * 2 + amplification_bytes); stats.reset(); - db.c() - .compact_range( - CF_DEFAULT, None, /*start_key*/ - None, /*end_key*/ - false, /*exclusive_manual*/ - 1, /*max_subcompactions*/ - ) - .unwrap(); - assert!(stats.fetch(IOType::LevelZeroCompaction, IOOp::Read) > value_size * 4); - assert!(stats.fetch(IOType::LevelZeroCompaction, IOOp::Read) < value_size * 5); - assert!(stats.fetch(IOType::LevelZeroCompaction, IOOp::Write) > value_size * 3); - assert!(stats.fetch(IOType::LevelZeroCompaction, IOOp::Write) < value_size * 4); + db.compact_range_cf( + CF_DEFAULT, None, // start_key + None, // end_key + false, // exclusive_manual + 1, // max_subcompactions + ) + .unwrap(); + assert!(stats.fetch(IoType::LevelZeroCompaction, IoOp::Read) > value_size * 4); + assert!( + stats.fetch(IoType::LevelZeroCompaction, IoOp::Read) + < 
value_size * 4 + amplification_bytes + ); + assert!(stats.fetch(IoType::LevelZeroCompaction, IoOp::Write) > value_size * 3); + assert!( + stats.fetch(IoType::LevelZeroCompaction, IoOp::Write) + < value_size * 3 + amplification_bytes + ); } } diff --git a/components/engine_rocks/src/flow_listener.rs b/components/engine_rocks/src/flow_listener.rs index 5d36c2b66e9..4a4f80cc46f 100644 --- a/components/engine_rocks/src/flow_listener.rs +++ b/components/engine_rocks/src/flow_listener.rs @@ -5,26 +5,54 @@ use std::sync::{mpsc::Sender, Arc, Mutex}; use collections::hash_set_with_capacity; use rocksdb::{CompactionJobInfo, EventListener, FlushJobInfo, IngestionInfo}; +#[derive(Clone)] pub enum FlowInfo { - L0(String, u64), - L0Intra(String, u64), - Flush(String, u64), - Compaction(String), - BeforeUnsafeDestroyRange, - AfterUnsafeDestroyRange, + L0(String, u64, u64), + L0Intra(String, u64, u64), + Flush(String, u64, u64), + Compaction(String, u64), + BeforeUnsafeDestroyRange(u64), + AfterUnsafeDestroyRange(u64), + Created(u64), + Destroyed(u64), } #[derive(Clone)] pub struct FlowListener { flow_info_sender: Arc>>, + region_id: u64, } impl FlowListener { pub fn new(flow_info_sender: Sender) -> Self { Self { flow_info_sender: Arc::new(Mutex::new(flow_info_sender)), + region_id: 0, + } + } + + pub fn clone_with(&self, region_id: u64) -> Self { + Self { + flow_info_sender: self.flow_info_sender.clone(), + region_id, } } + + pub fn on_created(&self) { + let _ = self + .flow_info_sender + .lock() + .unwrap() + .send(FlowInfo::Created(self.region_id)); + } + + pub fn on_destroyed(&self) { + let _ = self + .flow_info_sender + .lock() + .unwrap() + .send(FlowInfo::Destroyed(self.region_id)); + } } impl EventListener for FlowListener { @@ -32,11 +60,11 @@ impl EventListener for FlowListener { let mut total = 0; let p = info.table_properties(); total += p.data_size() + p.index_size() + p.filter_size(); - let _ = self - .flow_info_sender - .lock() - .unwrap() - 
.send(FlowInfo::Flush(info.cf_name().to_owned(), total)); + let _ = self.flow_info_sender.lock().unwrap().send(FlowInfo::Flush( + info.cf_name().to_owned(), + total, + self.region_id, + )); } fn on_external_file_ingested(&self, info: &IngestionInfo) { @@ -45,18 +73,21 @@ impl EventListener for FlowListener { let mut total = 0; let p = info.table_properties(); total += p.data_size() + p.index_size() + p.filter_size(); - let _ = self - .flow_info_sender - .lock() - .unwrap() - .send(FlowInfo::Flush(info.cf_name().to_owned(), total)); + let _ = self.flow_info_sender.lock().unwrap().send(FlowInfo::Flush( + info.cf_name().to_owned(), + total, + self.region_id, + )); } else { // ingestion may change the pending bytes. let _ = self .flow_info_sender .lock() .unwrap() - .send(FlowInfo::Compaction(info.cf_name().to_owned())); + .send(FlowInfo::Compaction( + info.cf_name().to_owned(), + self.region_id, + )); } } @@ -97,7 +128,11 @@ impl EventListener for FlowListener { .flow_info_sender .lock() .unwrap() - .send(FlowInfo::L0Intra(info.cf_name().to_owned(), diff)); + .send(FlowInfo::L0Intra( + info.cf_name().to_owned(), + diff, + self.region_id, + )); } else { let l0_input_file_at_input_level = info.input_file_count() - info.num_input_files_at_output_level(); @@ -116,11 +151,11 @@ impl EventListener for FlowListener { } } - let _ = self - .flow_info_sender - .lock() - .unwrap() - .send(FlowInfo::L0(info.cf_name().to_owned(), read_bytes)); + let _ = self.flow_info_sender.lock().unwrap().send(FlowInfo::L0( + info.cf_name().to_owned(), + read_bytes, + self.region_id, + )); } } @@ -128,6 +163,9 @@ impl EventListener for FlowListener { .flow_info_sender .lock() .unwrap() - .send(FlowInfo::Compaction(info.cf_name().to_owned())); + .send(FlowInfo::Compaction( + info.cf_name().to_owned(), + self.region_id, + )); } } diff --git a/components/engine_rocks/src/import.rs b/components/engine_rocks/src/import.rs index 1cfe24cb8e4..1aa65ec07fa 100644 --- 
a/components/engine_rocks/src/import.rs +++ b/components/engine_rocks/src/import.rs @@ -7,7 +7,7 @@ use rocksdb::{ set_external_sst_file_global_seq_no, IngestExternalFileOptions as RawIngestExternalFileOptions, }; -use crate::{engine::RocksEngine, util}; +use crate::{engine::RocksEngine, r2e, util}; impl ImportExt for RocksEngine { type IngestExternalFileOptions = RocksIngestExternalFileOptions; @@ -19,13 +19,14 @@ impl ImportExt for RocksEngine { opts.set_write_global_seqno(false); files.iter().try_for_each(|file| -> Result<()> { let f = File::open(file)?; - // Prior to v5.2.0, TiKV use `write_global_seqno=true` for ingestion. For backward - // compatibility, in case TiKV is retrying an ingestion job generated by older - // version, it needs to reset the global seqno to 0. - set_external_sst_file_global_seq_no(self.as_inner(), cf, file, 0)?; + // Prior to v5.2.0, TiKV use `write_global_seqno=true` for ingestion. For + // backward compatibility, in case TiKV is retrying an ingestion job + // generated by older version, it needs to reset the global seqno to + // 0. + set_external_sst_file_global_seq_no(self.as_inner(), cf, file, 0).map_err(r2e)?; f.sync_all() - .map_err(|e| format!("sync {}: {:?}", file, e))?; - Ok(()) + .map_err(|e| format!("sync {}: {:?}", file, e)) + .map_err(r2e) })?; // This is calling a specially optimized version of // ingest_external_file_cf. In cases where the memtable needs to be @@ -34,7 +35,8 @@ impl ImportExt for RocksEngine { // the manual memtable flush was taken. 
let _did_nonblocking_memtable_flush = self .as_inner() - .ingest_external_file_optimized(cf, &opts.0, files)?; + .ingest_external_file_optimized(cf, &opts.0, files) + .map_err(r2e)?; Ok(()) } } @@ -61,8 +63,6 @@ impl IngestExternalFileOptions for RocksIngestExternalFileOptions { #[cfg(test)] mod tests { - use std::sync::Arc; - use engine_traits::{ FlowControlFactorsExt, MiscExt, Mutable, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, ALL_CFS, CF_DEFAULT, @@ -70,12 +70,7 @@ mod tests { use tempfile::Builder; use super::*; - use crate::{ - engine::RocksEngine, - raw::{ColumnFamilyOptions, DBOptions}, - raw_util::{new_engine_opt, CFOptions}, - RocksSstWriterBuilder, - }; + use crate::{util::new_engine_opt, RocksCfOptions, RocksDbOptions, RocksSstWriterBuilder}; #[test] fn test_ingest_multiple_file() { @@ -90,14 +85,12 @@ mod tests { let cfs_opts = ALL_CFS .iter() .map(|cf| { - let mut opt = ColumnFamilyOptions::new(); + let mut opt = RocksCfOptions::default(); opt.set_force_consistency_checks(true); - CFOptions::new(cf, opt) + (*cf, opt) }) .collect(); - let db = new_engine_opt(path_str, DBOptions::new(), cfs_opts).unwrap(); - let db = Arc::new(db); - let db = RocksEngine::from_db(db); + let db = new_engine_opt(path_str, RocksDbOptions::default(), cfs_opts).unwrap(); let mut wb = db.write_batch(); for i in 1000..5000 { let v = i.to_string(); diff --git a/components/engine_rocks/src/lib.rs b/components/engine_rocks/src/lib.rs index 7cf4d948d0d..b5561b3de42 100644 --- a/components/engine_rocks/src/lib.rs +++ b/components/engine_rocks/src/lib.rs @@ -10,11 +10,15 @@ //! Because there are so many similarly named types across the TiKV codebase, //! and so much "import renaming", this crate consistently explicitly names type //! that implement a trait as `RocksTraitname`, to avoid the need for import -//! renaming and make it obvious what type any particular module is working with. +//! renaming and make it obvious what type any particular module is working +//! 
with. //! //! Please read the engine_trait crate docs before hacking. #![cfg_attr(test, feature(test))] +#![feature(let_chains)] +#![feature(option_get_or_insert_default)] +#![feature(path_file_prefix)] #[allow(unused_extern_crates)] extern crate tikv_alloc; @@ -26,6 +30,8 @@ mod cf_names; pub use crate::cf_names::*; mod cf_options; pub use crate::cf_options::*; +mod checkpoint; +pub use crate::checkpoint::*; mod compact; pub use crate::compact::*; mod db_options; @@ -47,6 +53,8 @@ mod sst; pub use crate::sst::*; mod sst_partitioner; pub use crate::sst_partitioner::*; +mod status; +pub use crate::status::*; mod table_properties; pub use crate::table_properties::*; mod write_batch; @@ -64,13 +72,9 @@ mod perf_context_metrics; mod engine_iterator; pub use crate::engine_iterator::*; -mod options; -pub mod raw_util; +pub mod options; pub mod util; -mod compat; -pub use compat::*; - mod compact_listener; pub use compact_listener::*; @@ -103,7 +107,10 @@ pub mod file_system; mod raft_engine; -pub use rocksdb::{set_perf_flags, set_perf_level, PerfContext, PerfFlag, PerfFlags, PerfLevel}; +pub use rocksdb::{ + set_perf_flags, set_perf_level, PerfContext, PerfFlag, PerfFlags, PerfLevel, + Statistics as RocksStatistics, +}; pub mod flow_control_factors; pub use flow_control_factors::*; @@ -112,8 +119,8 @@ pub mod raw; pub fn get_env( key_manager: Option>, - limiter: Option>, -) -> std::result::Result, String> { - let env = encryption::get_env(None /*base_env*/, key_manager)?; + limiter: Option>, +) -> engine_traits::Result> { + let env = encryption::get_env(None /* base_env */, key_manager)?; file_system::get_env(Some(env), limiter) } diff --git a/components/engine_rocks/src/logger.rs b/components/engine_rocks/src/logger.rs index 9482dd12d25..85f4de713ac 100644 --- a/components/engine_rocks/src/logger.rs +++ b/components/engine_rocks/src/logger.rs @@ -20,10 +20,34 @@ impl Logger for RocksdbLogger { } } +pub struct TabletLogger { + tablet_name: String, +} + +impl TabletLogger 
{ + pub fn new(tablet_name: String) -> Self { + Self { tablet_name } + } +} + +impl Logger for TabletLogger { + fn logv(&self, log_level: InfoLogLevel, log: &str) { + match log_level { + InfoLogLevel::Header => info!(#"rocksdb_log_header", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Debug => debug!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Info => info!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Warn => warn!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Error => error!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Fatal => crit!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + _ => {} + } + } +} + #[derive(Default)] -pub struct RaftDBLogger; +pub struct RaftDbLogger; -impl Logger for RaftDBLogger { +impl Logger for RaftDbLogger { fn logv(&self, log_level: InfoLogLevel, log: &str) { match log_level { InfoLogLevel::Header => info!(#"raftdb_log_header", "{}", log), diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index 0ae93fe34df..8d5bb3d43ef 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -1,14 +1,15 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
use engine_traits::{ - CFNamesExt, DeleteStrategy, ImportExt, IterOptions, Iterable, Iterator, MiscExt, Mutable, - Range, Result, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, ALL_CFS, + CfNamesExt, DeleteStrategy, ImportExt, IterOptions, Iterable, Iterator, MiscExt, Mutable, + Range, Result, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, }; use rocksdb::Range as RocksRange; use tikv_util::{box_try, keybuilder::KeyBuilder}; use crate::{ - engine::RocksEngine, rocks_metrics_defs::*, sst::RocksSstWriterBuilder, util, RocksSstWriter, + engine::RocksEngine, r2e, rocks_metrics::RocksStatisticsReporter, rocks_metrics_defs::*, + sst::RocksSstWriterBuilder, util, RocksSstWriter, }; pub const MAX_DELETE_COUNT_BY_KEY: usize = 2048; @@ -18,8 +19,8 @@ impl RocksEngine { self.as_inner().is_titan() } - // We store all data which would be deleted in memory at first because the data of region will never be larger than - // max-region-size. + // We store all data which would be deleted in memory at first because the data + // of region will never be larger than max-region-size. fn delete_all_in_range_cf_by_ingest( &self, cf: &str, @@ -28,17 +29,6 @@ impl RocksEngine { ) -> Result<()> { let mut ranges = ranges.to_owned(); ranges.sort_by(|a, b| a.start_key.cmp(b.start_key)); - let max_end_key = ranges - .iter() - .fold(ranges[0].end_key, |x, y| std::cmp::max(x, y.end_key)); - let start = KeyBuilder::from_slice(ranges[0].start_key, 0, 0); - let end = KeyBuilder::from_slice(max_end_key, 0, 0); - let mut opts = IterOptions::new(Some(start), Some(end), false); - if self.is_titan() { - // Cause DeleteFilesInRange may expose old blob index keys, setting key only for Titan - // to avoid referring to missing blob files. 
- opts.set_key_only(true); - } let mut writer_wrapper: Option = None; let mut data: Vec> = vec![]; @@ -54,8 +44,18 @@ impl RocksEngine { } last_end_key = Some(r.end_key.to_owned()); - let mut it = self.iterator_cf_opt(cf, opts.clone())?; - let mut it_valid = it.seek(r.start_key.into())?; + let mut opts = IterOptions::new( + Some(KeyBuilder::from_slice(r.start_key, 0, 0)), + Some(KeyBuilder::from_slice(r.end_key, 0, 0)), + false, + ); + if self.is_titan() { + // Cause DeleteFilesInRange may expose old blob index keys, setting key only for + // Titan to avoid referring to missing blob files. + opts.set_key_only(true); + } + let mut it = self.iterator_opt(cf, opts)?; + let mut it_valid = it.seek(r.start_key)?; while it_valid { if it.key() >= r.end_key { break; @@ -102,12 +102,12 @@ impl RocksEngine { let end = KeyBuilder::from_slice(range.end_key, 0, 0); let mut opts = IterOptions::new(Some(start), Some(end), false); if self.is_titan() { - // Cause DeleteFilesInRange may expose old blob index keys, setting key only for Titan - // to avoid referring to missing blob files. + // Cause DeleteFilesInRange may expose old blob index keys, setting key only for + // Titan to avoid referring to missing blob files. opts.set_key_only(true); } - let mut it = self.iterator_cf_opt(cf, opts)?; - let mut it_valid = it.seek(range.start_key.into())?; + let mut it = self.iterator_opt(cf, opts)?; + let mut it_valid = it.seek(range.start_key)?; let mut wb = self.write_batch(); while it_valid { wb.delete_cf(cf, it.key())?; @@ -126,13 +126,24 @@ impl RocksEngine { } impl MiscExt for RocksEngine { - fn flush(&self, sync: bool) -> Result<()> { - Ok(self.as_inner().flush(sync)?) 
+ type StatisticsReporter = RocksStatisticsReporter; + + fn flush_cfs(&self, cfs: &[&str], wait: bool) -> Result<()> { + let mut handles = vec![]; + for cf in cfs { + handles.push(util::get_cf_handle(self.as_inner(), cf)?); + } + if handles.is_empty() { + for cf in self.cf_names() { + handles.push(util::get_cf_handle(self.as_inner(), cf)?); + } + } + self.as_inner().flush_cfs(&handles, wait).map_err(r2e) } - fn flush_cf(&self, cf: &str, sync: bool) -> Result<()> { + fn flush_cf(&self, cf: &str, wait: bool) -> Result<()> { let handle = util::get_cf_handle(self.as_inner(), cf)?; - Ok(self.as_inner().flush_cf(handle, sync)?) + self.as_inner().flush_cf(handle, wait).map_err(r2e) } fn delete_ranges_cf( @@ -147,32 +158,42 @@ impl MiscExt for RocksEngine { match strategy { DeleteStrategy::DeleteFiles => { let handle = util::get_cf_handle(self.as_inner(), cf)?; - for r in ranges { - if r.start_key >= r.end_key { - continue; - } - self.as_inner().delete_files_in_range_cf( - handle, - r.start_key, - r.end_key, - false, - )?; + let rocks_ranges: Vec<_> = ranges + .iter() + .filter_map(|r| { + if r.start_key >= r.end_key { + None + } else { + Some(RocksRange::new(r.start_key, r.end_key)) + } + }) + .collect(); + if rocks_ranges.is_empty() { + return Ok(()); } + self.as_inner() + .delete_files_in_ranges_cf(handle, &rocks_ranges, false) + .map_err(r2e)?; } DeleteStrategy::DeleteBlobs => { let handle = util::get_cf_handle(self.as_inner(), cf)?; if self.is_titan() { - for r in ranges { - if r.start_key >= r.end_key { - continue; - } - self.as_inner().delete_blob_files_in_range_cf( - handle, - r.start_key, - r.end_key, - false, - )?; + let rocks_ranges: Vec<_> = ranges + .iter() + .filter_map(|r| { + if r.start_key >= r.end_key { + None + } else { + Some(RocksRange::new(r.start_key, r.end_key)) + } + }) + .collect(); + if rocks_ranges.is_empty() { + return Ok(()); } + self.as_inner() + .delete_blob_files_in_ranges_cf(handle, &rocks_ranges, false) + .map_err(r2e)?; } } 
DeleteStrategy::DeleteByRange => { @@ -207,56 +228,72 @@ impl MiscExt for RocksEngine { if let Some(n) = util::get_cf_num_files_at_level(self.as_inner(), handle, 0) { let options = self.as_inner().get_options_cf(handle); let slowdown_trigger = options.get_level_zero_slowdown_writes_trigger(); + let compaction_trigger = options.get_level_zero_file_num_compaction_trigger() as u64; // Leave enough buffer to tolerate heavy write workload, // which may flush some memtables in a short time. - if n > u64::from(slowdown_trigger) / 2 { + if n > u64::from(slowdown_trigger) / 2 && n >= compaction_trigger { return Ok(true); } } Ok(false) } + fn get_sst_key_ranges(&self, cf: &str, level: usize) -> Result, Vec)>> { + let handle = util::get_cf_handle(self.as_inner(), cf)?; + let ret = self + .as_inner() + .get_column_family_meta_data(handle) + .get_level(level) + .get_files() + .iter() + .map(|sst_meta| { + ( + sst_meta.get_smallestkey().to_vec(), + sst_meta.get_largestkey().to_vec(), + ) + }) + .collect(); + Ok(ret) + } + fn get_engine_used_size(&self) -> Result { let mut used_size: u64 = 0; - for cf in ALL_CFS { + for cf in self.cf_names() { let handle = util::get_cf_handle(self.as_inner(), cf)?; used_size += util::get_engine_cf_used_size(self.as_inner(), handle); } Ok(used_size) } - fn roughly_cleanup_ranges(&self, ranges: &[(Vec, Vec)]) -> Result<()> { - let db = self.as_inner(); - let mut delete_ranges = Vec::new(); - for &(ref start, ref end) in ranges { - if start == end { - continue; - } - assert!(start < end); - delete_ranges.push(RocksRange::new(start, end)); - } - if delete_ranges.is_empty() { - return Ok(()); - } + fn path(&self) -> &str { + self.as_inner().path() + } - for cf in db.cf_names() { - let handle = util::get_cf_handle(db, cf)?; - db.delete_files_in_ranges_cf(handle, &delete_ranges, /* include_end */ false)?; - } + fn sync_wal(&self) -> Result<()> { + self.as_inner().sync_wal().map_err(r2e) + } + fn pause_background_work(&self) -> Result<()> { + // This 
will make manual compaction return error instead of waiting. In practice + // we might want to identify this case by parsing error message. + self.as_inner().disable_manual_compaction(); + self.as_inner().pause_bg_work(); Ok(()) } - fn path(&self) -> &str { - self.as_inner().path() + fn continue_background_work(&self) -> Result<()> { + self.as_inner().enable_manual_compaction(); + self.as_inner().continue_bg_work(); + Ok(()) } - fn sync_wal(&self) -> Result<()> { - Ok(self.as_inner().sync_wal()?) + fn exists(path: &str) -> bool { + crate::util::db_exist(path) } - fn exists(path: &str) -> bool { - crate::raw_util::db_exist(path) + fn locked(path: &str) -> Result { + let env = rocksdb::Env::default(); + env.is_db_locked(path).map_err(r2e) } fn dump_stats(&self) -> Result { @@ -279,11 +316,6 @@ impl MiscExt for RocksEngine { s.extend_from_slice(v.as_bytes()); } - // more stats if enable_statistics is true. - if let Some(v) = self.as_inner().get_statistics() { - s.extend_from_slice(v.as_bytes()); - } - Ok(box_try!(String::from_utf8(s))) } @@ -309,6 +341,18 @@ impl MiscExt for RocksEngine { .get_property_int_cf(handle, ROCKSDB_TOTAL_SST_FILES_SIZE)) } + fn get_num_keys(&self) -> Result { + let mut total = 0; + for cf in self.cf_names() { + let handle = util::get_cf_handle(self.as_inner(), cf).unwrap(); + total += self + .as_inner() + .get_property_int_cf(handle, ROCKSDB_ESTIMATE_NUM_KEYS) + .unwrap_or_default(); + } + Ok(total) + } + fn get_range_entries_and_versions( &self, cf: &str, @@ -337,24 +381,23 @@ impl MiscExt for RocksEngine { #[cfg(test)] mod tests { - use std::sync::Arc; - use engine_traits::{ - DeleteStrategy, Iterable, Iterator, Mutable, SeekKey, SyncMutable, WriteBatchExt, ALL_CFS, + CompactExt, DeleteStrategy, Iterable, Iterator, Mutable, SyncMutable, WriteBatchExt, + ALL_CFS, }; use tempfile::Builder; use super::*; use crate::{ engine::RocksEngine, - raw::{ColumnFamilyOptions, DBOptions, DB}, - raw_util::{new_engine_opt, CFOptions}, + util::{new_engine, 
new_engine_opt}, + RocksCfOptions, RocksDbOptions, }; fn check_data(db: &RocksEngine, cfs: &[&str], expected: &[(&[u8], &[u8])]) { for cf in cfs { - let mut iter = db.iterator_cf(cf).unwrap(); - iter.seek(SeekKey::Start).unwrap(); + let mut iter = db.iterator(cf).unwrap(); + iter.seek_to_first().unwrap(); for &(k, v) in expected { assert_eq!(k, iter.key()); assert_eq!(v, iter.value()); @@ -364,24 +407,14 @@ mod tests { } } - fn test_delete_all_in_range( - strategy: DeleteStrategy, - origin_keys: &[Vec], - ranges: &[Range<'_>], - ) { + fn test_delete_ranges(strategy: DeleteStrategy, origin_keys: &[Vec], ranges: &[Range<'_>]) { let path = Builder::new() - .prefix("engine_delete_all_in_range") + .prefix("engine_delete_ranges") .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let cfs_opts = ALL_CFS - .iter() - .map(|cf| CFOptions::new(cf, ColumnFamilyOptions::new())) - .collect(); - let db = new_engine_opt(path_str, DBOptions::new(), cfs_opts).unwrap(); - let db = Arc::new(db); - let db = RocksEngine::from_db(db); + let db = new_engine(path_str, ALL_CFS).unwrap(); let mut wb = db.write_batch(); let ts: u8 = 12; @@ -406,15 +439,11 @@ mod tests { wb.write().unwrap(); check_data(&db, ALL_CFS, kvs.as_slice()); - // Delete all in ranges. - db.delete_all_in_range(strategy, ranges).unwrap(); + db.delete_ranges_cfs(strategy, ranges).unwrap(); let mut kvs_left: Vec<_> = kvs; for r in ranges { - kvs_left = kvs_left - .into_iter() - .filter(|k| k.0 < r.start_key || k.0 >= r.end_key) - .collect(); + kvs_left.retain(|k| k.0 < r.start_key || k.0 >= r.end_key); } check_data(&db, ALL_CFS, kvs_left.as_slice()); } @@ -429,25 +458,25 @@ mod tests { b"k4".to_vec(), ]; // Single range. - test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByRange, &data, &[Range::new(b"k1", b"k4")], ); // Two ranges without overlap. 
- test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByRange, &data, &[Range::new(b"k0", b"k1"), Range::new(b"k3", b"k4")], ); // Two ranges with overlap. - test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByRange, &data, &[Range::new(b"k1", b"k3"), Range::new(b"k2", b"k4")], ); // One range contains the other range. - test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByRange, &data, &[Range::new(b"k1", b"k4"), Range::new(b"k2", b"k3")], @@ -464,25 +493,25 @@ mod tests { b"k4".to_vec(), ]; // Single range. - test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByKey, &data, &[Range::new(b"k1", b"k4")], ); // Two ranges without overlap. - test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByKey, &data, &[Range::new(b"k0", b"k1"), Range::new(b"k3", b"k4")], ); // Two ranges with overlap. - test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByKey, &data, &[Range::new(b"k1", b"k3"), Range::new(b"k2", b"k4")], ); // One range contains the other range. 
- test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByKey, &data, &[Range::new(b"k1", b"k4"), Range::new(b"k2", b"k3")], @@ -501,7 +530,7 @@ mod tests { for i in 1000..5000 { data.push(i.to_string().as_bytes().to_vec()); } - test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByWriter { sst_path }, &data, &[ @@ -526,14 +555,12 @@ mod tests { let cfs_opts = ALL_CFS .iter() .map(|cf| { - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = RocksCfOptions::default(); cf_opts.set_level_zero_file_num_compaction_trigger(1); - CFOptions::new(cf, cf_opts) + (*cf, cf_opts) }) .collect(); - let db = new_engine_opt(path_str, DBOptions::new(), cfs_opts).unwrap(); - let db = Arc::new(db); - let db = RocksEngine::from_db(db); + let db = new_engine_opt(path_str, RocksDbOptions::default(), cfs_opts).unwrap(); let keys = vec![b"k1", b"k2", b"k3", b"k4"]; @@ -550,9 +577,9 @@ mod tests { } check_data(&db, ALL_CFS, kvs.as_slice()); - db.delete_all_in_range(DeleteStrategy::DeleteFiles, &[Range::new(b"k2", b"k4")]) + db.delete_ranges_cfs(DeleteStrategy::DeleteFiles, &[Range::new(b"k2", b"k4")]) .unwrap(); - db.delete_all_in_range(DeleteStrategy::DeleteBlobs, &[Range::new(b"k2", b"k4")]) + db.delete_ranges_cfs(DeleteStrategy::DeleteBlobs, &[Range::new(b"k2", b"k4")]) .unwrap(); check_data(&db, ALL_CFS, kvs_left.as_slice()); } @@ -565,10 +592,11 @@ mod tests { .unwrap(); let path_str = path.path().to_str().unwrap(); - let mut opts = DBOptions::new(); + let mut opts = RocksDbOptions::default(); opts.create_if_missing(true); + opts.enable_multi_batch_write(true); - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = RocksCfOptions::default(); // Prefix extractor(trim the timestamp at tail) for write cf. cf_opts .set_prefix_extractor( @@ -579,9 +607,7 @@ mod tests { // Create prefix bloom filter for memtable. 
cf_opts.set_memtable_prefix_bloom_size_ratio(0.1_f64); let cf = "default"; - let db = DB::open_cf(opts, path_str, vec![(cf, cf_opts)]).unwrap(); - let db = Arc::new(db); - let db = RocksEngine::from_db(db); + let db = new_engine_opt(path_str, opts, vec![(cf, cf_opts)]).unwrap(); let mut wb = db.write_batch(); let kvs: Vec<(&[u8], &[u8])> = vec![ (b"kabcdefg1", b"v1"), @@ -598,11 +624,77 @@ mod tests { check_data(&db, &[cf], kvs.as_slice()); // Delete all in ["k2", "k4"). - db.delete_all_in_range( + db.delete_ranges_cfs( DeleteStrategy::DeleteByRange, &[Range::new(b"kabcdefg2", b"kabcdefg4")], ) .unwrap(); check_data(&db, &[cf], kvs_left.as_slice()); } + + #[test] + fn test_get_sst_key_ranges() { + let path = Builder::new() + .prefix("test_get_sst_key_ranges") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + + let mut opts = RocksDbOptions::default(); + opts.create_if_missing(true); + opts.enable_multi_batch_write(true); + + let mut cf_opts = RocksCfOptions::default(); + // Prefix extractor(trim the timestamp at tail) for write cf. + cf_opts + .set_prefix_extractor( + "FixedSuffixSliceTransform", + crate::util::FixedSuffixSliceTransform::new(8), + ) + .unwrap_or_else(|err| panic!("{:?}", err)); + // Create prefix bloom filter for memtable. 
+ cf_opts.set_memtable_prefix_bloom_size_ratio(0.1_f64); + let cf = "default"; + let db = new_engine_opt(path_str, opts, vec![(cf, cf_opts)]).unwrap(); + let mut wb = db.write_batch(); + let kvs: Vec<(&[u8], &[u8])> = vec![ + (b"k1", b"v1"), + (b"k2", b"v2"), + (b"k6", b"v3"), + (b"k7", b"v4"), + ]; + + for &(k, v) in kvs.as_slice() { + wb.put_cf(cf, k, v).unwrap(); + } + wb.write().unwrap(); + + db.flush_cf(cf, true).unwrap(); + let sst_range = db.get_sst_key_ranges(cf, 0).unwrap(); + let expected = vec![(b"k1".to_vec(), b"k7".to_vec())]; + assert_eq!(sst_range, expected); + + let mut wb = db.write_batch(); + let kvs: Vec<(&[u8], &[u8])> = vec![(b"k3", b"v1"), (b"k4", b"v2"), (b"k8", b"v3")]; + + for &(k, v) in kvs.as_slice() { + wb.put_cf(cf, k, v).unwrap(); + } + wb.write().unwrap(); + + db.flush_cf(cf, true).unwrap(); + let sst_range = db.get_sst_key_ranges(cf, 0).unwrap(); + let expected = vec![ + (b"k3".to_vec(), b"k8".to_vec()), + (b"k1".to_vec(), b"k7".to_vec()), + ]; + assert_eq!(sst_range, expected); + + db.compact_range_cf(cf, None, None, false, 1).unwrap(); + let sst_range = db.get_sst_key_ranges(cf, 0).unwrap(); + assert_eq!(sst_range.len(), 0); + let sst_range = db.get_sst_key_ranges(cf, 1).unwrap(); + let expected = vec![(b"k1".to_vec(), b"k8".to_vec())]; + assert_eq!(sst_range, expected); + } } diff --git a/components/engine_rocks/src/options.rs b/components/engine_rocks/src/options.rs index c1610f64224..7579c92ba79 100644 --- a/components/engine_rocks/src/options.rs +++ b/components/engine_rocks/src/options.rs @@ -16,7 +16,7 @@ impl RocksReadOptions { impl From for RocksReadOptions { fn from(opts: engine_traits::ReadOptions) -> Self { let mut r = RawReadOptions::default(); - r.fill_cache(opts.fill_cache()); + r.set_fill_cache(opts.fill_cache()); RocksReadOptions(r) } } @@ -40,6 +40,9 @@ impl From for RocksWriteOptions { let mut r = RawWriteOptions::default(); r.set_sync(opts.sync()); r.set_no_slowdown(opts.no_slowdown()); + 
r.disable_wal(opts.disable_wal()); + // TODO: enable it. + r.set_memtable_insert_hint_per_batch(false); RocksWriteOptions(r) } } @@ -59,16 +62,20 @@ impl From for RocksReadOptions { fn build_read_opts(iter_opts: engine_traits::IterOptions) -> RawReadOptions { let mut opts = RawReadOptions::new(); - opts.fill_cache(iter_opts.fill_cache()); + opts.set_fill_cache(iter_opts.fill_cache()); opts.set_max_skippable_internal_keys(iter_opts.max_skippable_internal_keys()); if iter_opts.key_only() { opts.set_titan_key_only(true); } if iter_opts.total_order_seek_used() { opts.set_total_order_seek(true); + // TODO: enable it. + opts.set_auto_prefix_mode(false); } else if iter_opts.prefix_same_as_start() { opts.set_prefix_same_as_start(true); } + // TODO: enable it. + opts.set_adaptive_readahead(false); if iter_opts.hint_min_ts().is_some() || iter_opts.hint_max_ts().is_some() { opts.set_table_filter(TsFilter::new( diff --git a/components/engine_rocks/src/perf_context.rs b/components/engine_rocks/src/perf_context.rs index 83ff4bca6bd..f8cfdbcc667 100644 --- a/components/engine_rocks/src/perf_context.rs +++ b/components/engine_rocks/src/perf_context.rs @@ -1,13 +1,14 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
use engine_traits::{PerfContext, PerfContextExt, PerfContextKind, PerfLevel}; +use tracker::TrackerToken; use crate::{engine::RocksEngine, perf_context_impl::PerfContextStatistics}; impl PerfContextExt for RocksEngine { type PerfContext = RocksPerfContext; - fn get_perf_context(&self, level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { + fn get_perf_context(level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { RocksPerfContext::new(level, kind) } } @@ -30,7 +31,7 @@ impl PerfContext for RocksPerfContext { self.stats.start() } - fn report_metrics(&mut self) { - self.stats.report() + fn report_metrics(&mut self, trackers: &[TrackerToken]) { + self.stats.report(trackers) } } diff --git a/components/engine_rocks/src/perf_context_impl.rs b/components/engine_rocks/src/perf_context_impl.rs index 617abe506d8..59086127154 100644 --- a/components/engine_rocks/src/perf_context_impl.rs +++ b/components/engine_rocks/src/perf_context_impl.rs @@ -1,39 +1,38 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use std::{fmt::Debug, marker::PhantomData, ops::Sub}; +use std::{fmt::Debug, marker::PhantomData, mem, ops::Sub, time::Duration}; use derive_more::{Add, AddAssign, Sub, SubAssign}; use engine_traits::{PerfContextKind, PerfLevel}; -use kvproto::kvrpcpb::ScanDetailV2; use lazy_static::lazy_static; use slog_derive::KV; +use tikv_util::time::Instant; +use tracker::{Tracker, TrackerToken, GLOBAL_TRACKERS}; use crate::{ - perf_context_metrics::{ - APPLY_PERF_CONTEXT_TIME_HISTOGRAM_STATIC, STORE_PERF_CONTEXT_TIME_HISTOGRAM_STATIC, - }, - raw_util, set_perf_flags, set_perf_level, PerfContext as RawPerfContext, PerfFlag, PerfFlags, + perf_context_metrics::*, set_perf_flags, set_perf_level, util, PerfContext as RawPerfContext, + PerfFlag, PerfFlags, }; macro_rules! 
report_write_perf_context { - ($ctx: expr, $metric: ident) => { + ($ctx:expr, $metric:ident) => { if $ctx.perf_level != PerfLevel::Disable { $ctx.write = WritePerfContext::capture(); - observe_perf_context_type!($ctx, $metric, write_wal_time); - observe_perf_context_type!($ctx, $metric, write_memtable_time); - observe_perf_context_type!($ctx, $metric, db_mutex_lock_nanos); - observe_perf_context_type!($ctx, $metric, pre_and_post_process); - observe_perf_context_type!($ctx, $metric, write_thread_wait); - observe_perf_context_type!($ctx, $metric, write_scheduling_flushes_compactions_time); - observe_perf_context_type!($ctx, $metric, db_condition_wait_nanos); - observe_perf_context_type!($ctx, $metric, write_delay_time); + observe_write_time!($ctx, $metric, write_wal_time); + observe_write_time!($ctx, $metric, write_memtable_time); + observe_write_time!($ctx, $metric, db_mutex_lock_nanos); + observe_write_time!($ctx, $metric, pre_and_post_process); + observe_write_time!($ctx, $metric, write_thread_wait); + observe_write_time!($ctx, $metric, write_scheduling_flushes_compactions_time); + observe_write_time!($ctx, $metric, db_condition_wait_nanos); + observe_write_time!($ctx, $metric, write_delay_time); } }; } -macro_rules! observe_perf_context_type { - ($s:expr, $metric: expr, $v:ident) => { - $metric.$v.observe(($s.write.$v) as f64 / 1e9); +macro_rules! 
observe_write_time { + ($ctx:expr, $metric:expr, $v:ident) => { + $metric.$v.observe(($ctx.write.$v) as f64 / 1e9); }; } @@ -136,12 +135,13 @@ pub struct ReadPerfContext { } impl ReadPerfContext { - pub fn write_scan_detail(&self, detail_v2: &mut ScanDetailV2) { - detail_v2.set_rocksdb_delete_skipped_count(self.internal_delete_skipped_count); - detail_v2.set_rocksdb_key_skipped_count(self.internal_key_skipped_count); - detail_v2.set_rocksdb_block_cache_hit_count(self.block_cache_hit_count); - detail_v2.set_rocksdb_block_read_count(self.block_read_count); - detail_v2.set_rocksdb_block_read_byte(self.block_read_byte); + fn report_to_tracker(&self, tracker: &mut Tracker) { + tracker.metrics.block_cache_hit_count += self.block_cache_hit_count; + tracker.metrics.block_read_byte += self.block_read_byte; + tracker.metrics.block_read_count += self.block_read_count; + tracker.metrics.block_read_nanos += self.block_read_time; + tracker.metrics.deleted_key_skipped_count += self.internal_delete_skipped_count; + tracker.metrics.internal_key_skipped_count += self.internal_key_skipped_count; } } @@ -159,33 +159,40 @@ pub struct WritePerfContext { #[derive(Debug)] pub struct PerfContextStatistics { - pub perf_level: PerfLevel, - pub kind: PerfContextKind, - pub read: ReadPerfContext, - pub write: WritePerfContext, + perf_level: PerfLevel, + kind: PerfContextKind, + read: ReadPerfContext, + write: WritePerfContext, + last_flush_time: Instant, } +const FLUSH_METRICS_INTERVAL: Duration = Duration::from_secs(2); + impl PerfContextStatistics { - /// Create an instance which stores instant statistics values, retrieved at creation. + /// Create an instance which stores instant statistics values, retrieved at + /// creation. 
pub fn new(perf_level: PerfLevel, kind: PerfContextKind) -> Self { PerfContextStatistics { perf_level, kind, read: Default::default(), write: Default::default(), + last_flush_time: Instant::now_coarse(), } } fn apply_perf_settings(&self) { if self.perf_level == PerfLevel::Uninitialized { match self.kind { - PerfContextKind::GenericRead => set_perf_flags(&*DEFAULT_READ_PERF_FLAGS), + PerfContextKind::Storage(_) | PerfContextKind::Coprocessor(_) => { + set_perf_flags(&DEFAULT_READ_PERF_FLAGS) + } PerfContextKind::RaftstoreStore | PerfContextKind::RaftstoreApply => { - set_perf_flags(&*DEFAULT_WRITE_PERF_FLAGS) + set_perf_flags(&DEFAULT_WRITE_PERF_FLAGS) } } } else { - set_perf_level(raw_util::to_raw_perf_level(self.perf_level)); + set_perf_level(util::to_raw_perf_level(self.perf_level)); } } @@ -198,23 +205,197 @@ impl PerfContextStatistics { self.apply_perf_settings(); } - pub fn report(&mut self) { + pub fn report(&mut self, trackers: &[TrackerToken]) { match self.kind { PerfContextKind::RaftstoreApply => { report_write_perf_context!(self, APPLY_PERF_CONTEXT_TIME_HISTOGRAM_STATIC); + for token in trackers { + GLOBAL_TRACKERS.with_tracker(*token, |t| { + t.metrics.apply_mutex_lock_nanos = self.write.db_mutex_lock_nanos; + t.metrics.apply_thread_wait_nanos = self.write.write_thread_wait; + t.metrics.apply_write_wal_nanos = self.write.write_wal_time; + t.metrics.apply_write_memtable_nanos = self.write.write_memtable_time; + }); + } } PerfContextKind::RaftstoreStore => { report_write_perf_context!(self, STORE_PERF_CONTEXT_TIME_HISTOGRAM_STATIC); + for token in trackers { + GLOBAL_TRACKERS.with_tracker(*token, |t| { + t.metrics.store_mutex_lock_nanos = self.write.db_mutex_lock_nanos; + t.metrics.store_thread_wait_nanos = self.write.write_thread_wait; + t.metrics.store_write_wal_nanos = self.write.write_wal_time; + t.metrics.store_write_memtable_nanos = self.write.write_memtable_time; + }); + } } - PerfContextKind::GenericRead => { - // TODO: Currently, metrics about 
reading is reported in other ways. - // It is better to unify how to report the perf metrics. - // - // Here we only record the PerfContext data into the fields. - self.read = ReadPerfContext::capture(); + PerfContextKind::Storage(_) | PerfContextKind::Coprocessor(_) => { + let perf_context = ReadPerfContext::capture(); + for token in trackers { + GLOBAL_TRACKERS.with_tracker(*token, |t| perf_context.report_to_tracker(t)); + } + self.read += perf_context; + self.flush_read_metrics(); } } } + + fn flush_read_metrics(&mut self) { + if self.last_flush_time.saturating_elapsed() < FLUSH_METRICS_INTERVAL { + return; + } + self.last_flush_time = Instant::now_coarse(); + let ctx = mem::take(&mut self.read); + let (v, tag) = match self.kind { + PerfContextKind::Storage(tag) => (&*STORAGE_ROCKSDB_PERF_COUNTER, tag), + PerfContextKind::Coprocessor(tag) => (&*COPR_ROCKSDB_PERF_COUNTER, tag), + _ => unreachable!(), + }; + v.get_metric_with_label_values(&[tag, "user_key_comparison_count"]) + .unwrap() + .inc_by(ctx.user_key_comparison_count); + v.get_metric_with_label_values(&[tag, "block_cache_hit_count"]) + .unwrap() + .inc_by(ctx.block_cache_hit_count); + v.get_metric_with_label_values(&[tag, "block_read_count"]) + .unwrap() + .inc_by(ctx.block_read_count); + v.get_metric_with_label_values(&[tag, "block_read_byte"]) + .unwrap() + .inc_by(ctx.block_read_byte); + v.get_metric_with_label_values(&[tag, "block_read_time"]) + .unwrap() + .inc_by(ctx.block_read_time); + v.get_metric_with_label_values(&[tag, "block_cache_index_hit_count"]) + .unwrap() + .inc_by(ctx.block_cache_index_hit_count); + v.get_metric_with_label_values(&[tag, "index_block_read_count"]) + .unwrap() + .inc_by(ctx.index_block_read_count); + v.get_metric_with_label_values(&[tag, "block_cache_filter_hit_count"]) + .unwrap() + .inc_by(ctx.block_cache_filter_hit_count); + v.get_metric_with_label_values(&[tag, "filter_block_read_count"]) + .unwrap() + .inc_by(ctx.filter_block_read_count); + 
v.get_metric_with_label_values(&[tag, "block_checksum_time"]) + .unwrap() + .inc_by(ctx.block_checksum_time); + v.get_metric_with_label_values(&[tag, "block_decompress_time"]) + .unwrap() + .inc_by(ctx.block_decompress_time); + v.get_metric_with_label_values(&[tag, "get_read_bytes"]) + .unwrap() + .inc_by(ctx.get_read_bytes); + v.get_metric_with_label_values(&[tag, "iter_read_bytes"]) + .unwrap() + .inc_by(ctx.iter_read_bytes); + v.get_metric_with_label_values(&[tag, "internal_key_skipped_count"]) + .unwrap() + .inc_by(ctx.internal_key_skipped_count); + v.get_metric_with_label_values(&[tag, "internal_delete_skipped_count"]) + .unwrap() + .inc_by(ctx.internal_delete_skipped_count); + v.get_metric_with_label_values(&[tag, "internal_recent_skipped_count"]) + .unwrap() + .inc_by(ctx.internal_recent_skipped_count); + v.get_metric_with_label_values(&[tag, "get_snapshot_time"]) + .unwrap() + .inc_by(ctx.get_snapshot_time); + v.get_metric_with_label_values(&[tag, "get_from_memtable_time"]) + .unwrap() + .inc_by(ctx.get_from_memtable_time); + v.get_metric_with_label_values(&[tag, "get_from_memtable_count"]) + .unwrap() + .inc_by(ctx.get_from_memtable_count); + v.get_metric_with_label_values(&[tag, "get_post_process_time"]) + .unwrap() + .inc_by(ctx.get_post_process_time); + v.get_metric_with_label_values(&[tag, "get_from_output_files_time"]) + .unwrap() + .inc_by(ctx.get_from_output_files_time); + v.get_metric_with_label_values(&[tag, "seek_on_memtable_time"]) + .unwrap() + .inc_by(ctx.seek_on_memtable_time); + v.get_metric_with_label_values(&[tag, "seek_on_memtable_count"]) + .unwrap() + .inc_by(ctx.seek_on_memtable_count); + v.get_metric_with_label_values(&[tag, "next_on_memtable_count"]) + .unwrap() + .inc_by(ctx.next_on_memtable_count); + v.get_metric_with_label_values(&[tag, "prev_on_memtable_count"]) + .unwrap() + .inc_by(ctx.prev_on_memtable_count); + v.get_metric_with_label_values(&[tag, "seek_child_seek_time"]) + .unwrap() + .inc_by(ctx.seek_child_seek_time); + 
v.get_metric_with_label_values(&[tag, "seek_child_seek_count"]) + .unwrap() + .inc_by(ctx.seek_child_seek_count); + v.get_metric_with_label_values(&[tag, "seek_min_heap_time"]) + .unwrap() + .inc_by(ctx.seek_min_heap_time); + v.get_metric_with_label_values(&[tag, "seek_max_heap_time"]) + .unwrap() + .inc_by(ctx.seek_max_heap_time); + v.get_metric_with_label_values(&[tag, "seek_internal_seek_time"]) + .unwrap() + .inc_by(ctx.seek_internal_seek_time); + v.get_metric_with_label_values(&[tag, "db_mutex_lock_nanos"]) + .unwrap() + .inc_by(ctx.db_mutex_lock_nanos); + v.get_metric_with_label_values(&[tag, "db_condition_wait_nanos"]) + .unwrap() + .inc_by(ctx.db_condition_wait_nanos); + v.get_metric_with_label_values(&[tag, "read_index_block_nanos"]) + .unwrap() + .inc_by(ctx.read_index_block_nanos); + v.get_metric_with_label_values(&[tag, "read_filter_block_nanos"]) + .unwrap() + .inc_by(ctx.read_filter_block_nanos); + v.get_metric_with_label_values(&[tag, "new_table_block_iter_nanos"]) + .unwrap() + .inc_by(ctx.new_table_block_iter_nanos); + v.get_metric_with_label_values(&[tag, "new_table_iterator_nanos"]) + .unwrap() + .inc_by(ctx.new_table_iterator_nanos); + v.get_metric_with_label_values(&[tag, "block_seek_nanos"]) + .unwrap() + .inc_by(ctx.block_seek_nanos); + v.get_metric_with_label_values(&[tag, "find_table_nanos"]) + .unwrap() + .inc_by(ctx.find_table_nanos); + v.get_metric_with_label_values(&[tag, "bloom_memtable_hit_count"]) + .unwrap() + .inc_by(ctx.bloom_memtable_hit_count); + v.get_metric_with_label_values(&[tag, "bloom_memtable_miss_count"]) + .unwrap() + .inc_by(ctx.bloom_memtable_miss_count); + v.get_metric_with_label_values(&[tag, "bloom_sst_hit_count"]) + .unwrap() + .inc_by(ctx.bloom_sst_hit_count); + v.get_metric_with_label_values(&[tag, "bloom_sst_miss_count"]) + .unwrap() + .inc_by(ctx.bloom_sst_miss_count); + v.get_metric_with_label_values(&[tag, "get_cpu_nanos"]) + .unwrap() + .inc_by(ctx.get_cpu_nanos); + v.get_metric_with_label_values(&[tag, 
"iter_next_cpu_nanos"]) + .unwrap() + .inc_by(ctx.iter_next_cpu_nanos); + v.get_metric_with_label_values(&[tag, "iter_prev_cpu_nanos"]) + .unwrap() + .inc_by(ctx.iter_prev_cpu_nanos); + v.get_metric_with_label_values(&[tag, "iter_seek_cpu_nanos"]) + .unwrap() + .inc_by(ctx.iter_seek_cpu_nanos); + v.get_metric_with_label_values(&[tag, "encrypt_data_nanos"]) + .unwrap() + .inc_by(ctx.encrypt_data_nanos); + v.get_metric_with_label_values(&[tag, "decrypt_data_nanos"]) + .unwrap() + .inc_by(ctx.decrypt_data_nanos); + } } pub trait PerfContextFields: Debug + Clone + Copy + Sub + slog::KV { diff --git a/components/engine_rocks/src/perf_context_metrics.rs b/components/engine_rocks/src/perf_context_metrics.rs index 5d58066500f..d384fc96dc9 100644 --- a/components/engine_rocks/src/perf_context_metrics.rs +++ b/components/engine_rocks/src/perf_context_metrics.rs @@ -26,14 +26,26 @@ lazy_static! { "tikv_raftstore_apply_perf_context_time_duration_secs", "Bucketed histogram of request wait time duration.", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref STORE_PERF_CONTEXT_TIME_HISTOGRAM: HistogramVec = register_histogram_vec!( "tikv_raftstore_store_perf_context_time_duration_secs", "Bucketed histogram of request wait time duration.", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() + ) + .unwrap(); + pub static ref STORAGE_ROCKSDB_PERF_COUNTER: IntCounterVec = register_int_counter_vec!( + "tikv_storage_rocksdb_perf", + "Total number of RocksDB internal operations from PerfContext", + &["req", "metric"] + ) + .unwrap(); + pub static ref COPR_ROCKSDB_PERF_COUNTER: IntCounterVec = register_int_counter_vec!( + "tikv_coprocessor_rocksdb_perf", + "Total number of RocksDB internal operations from PerfContext", + &["req", "metric"] ) .unwrap(); pub static ref APPLY_PERF_CONTEXT_TIME_HISTOGRAM_STATIC: PerfContextTimeDuration = diff --git 
a/components/engine_rocks/src/properties.rs b/components/engine_rocks/src/properties.rs index 47b48d2fc5c..a95a9aecf7b 100644 --- a/components/engine_rocks/src/properties.rs +++ b/components/engine_rocks/src/properties.rs @@ -8,7 +8,8 @@ use std::{ u64, }; -use engine_traits::{MvccProperties, Range}; +use api_version::{ApiV2, KeyMode, KvFormat}; +use engine_traits::{raw_ttl::ttl_current_ts, MvccProperties, Range}; use rocksdb::{ DBEntryType, TablePropertiesCollector, TablePropertiesCollectorFactory, TitanBlobIndex, UserCollectedProperties, @@ -130,12 +131,6 @@ impl<'a> DecodeProperties for UserCollectedPropertiesDecoder<'a> { } } -#[derive(Debug, Clone, PartialEq, Eq, Copy)] -pub enum RangeOffsetKind { - Size, - Keys, -} - #[derive(Debug, Default, Clone, Copy)] pub struct RangeOffsets { pub size: u64, @@ -386,7 +381,8 @@ impl TablePropertiesCollectorFactory for RangeProperti } } -/// Can only be used for write CF. +/// Can be used for write CF in TiDB & TxnKV scenario, or be used for default CF +/// in RawKV scenario. pub struct MvccPropertiesCollector { props: MvccProperties, last_row: Vec, @@ -394,10 +390,12 @@ pub struct MvccPropertiesCollector { row_versions: u64, cur_index_handle: IndexHandle, row_index_handles: IndexHandles, + key_mode: KeyMode, // Use KeyMode::Txn for both TiDB & TxnKV, KeyMode::Raw for RawKV. 
+ current_ts: u64, } impl MvccPropertiesCollector { - fn new() -> MvccPropertiesCollector { + fn new(key_mode: KeyMode) -> MvccPropertiesCollector { MvccPropertiesCollector { props: MvccProperties::new(), last_row: Vec::new(), @@ -405,6 +403,8 @@ impl MvccPropertiesCollector { row_versions: 0, cur_index_handle: IndexHandle::default(), row_index_handles: IndexHandles::new(), + key_mode, + current_ts: ttl_current_ts(), } } } @@ -452,18 +452,34 @@ impl TablePropertiesCollector for MvccPropertiesCollector { self.props.max_row_versions = self.row_versions; } - let write_type = match Write::parse_type(value) { - Ok(v) => v, - Err(_) => { - self.num_errors += 1; - return; + if self.key_mode == KeyMode::Raw { + let decode_raw_value = ApiV2::decode_raw_value(value); + match decode_raw_value { + Ok(raw_value) => { + if raw_value.is_valid(self.current_ts) { + self.props.num_puts += 1; + } else { + self.props.num_deletes += 1; + } + } + Err(_) => { + self.num_errors += 1; + } } - }; + } else { + let write_type = match Write::parse_type(value) { + Ok(v) => v, + Err(_) => { + self.num_errors += 1; + return; + } + }; - match write_type { - WriteType::Put => self.props.num_puts += 1, - WriteType::Delete => self.props.num_deletes += 1, - _ => {} + match write_type { + WriteType::Put => self.props.num_puts += 1, + WriteType::Delete => self.props.num_deletes += 1, + _ => {} + } } // Add new row. @@ -493,13 +509,24 @@ impl TablePropertiesCollector for MvccPropertiesCollector { } } -/// Can only be used for write CF. +/// Can be used for write CF of TiDB/TxnKV, default CF of RawKV. 
#[derive(Default)] pub struct MvccPropertiesCollectorFactory {} impl TablePropertiesCollectorFactory for MvccPropertiesCollectorFactory { fn create_table_properties_collector(&mut self, _: u32) -> MvccPropertiesCollector { - MvccPropertiesCollector::new() + MvccPropertiesCollector::new(KeyMode::Txn) + } +} + +#[derive(Default)] +pub struct RawMvccPropertiesCollectorFactory {} + +impl TablePropertiesCollectorFactory + for RawMvccPropertiesCollectorFactory +{ + fn create_table_properties_collector(&mut self, _: u32) -> MvccPropertiesCollector { + MvccPropertiesCollector::new(KeyMode::Raw) } } @@ -536,9 +563,8 @@ pub fn get_range_entries_and_versions( #[cfg(test)] mod tests { - use std::sync::Arc; - - use engine_traits::{CF_WRITE, LARGE_CFS}; + use api_version::RawValue; + use engine_traits::{MiscExt, SyncMutable, CF_WRITE, LARGE_CFS}; use rand::Rng; use tempfile::Builder; use test::Bencher; @@ -546,9 +572,8 @@ mod tests { use super::*; use crate::{ - compat::Compat, - raw::{ColumnFamilyOptions, DBEntryType, DBOptions, TablePropertiesCollector, Writable}, - raw_util::CFOptions, + raw::{DBEntryType, TablePropertiesCollector}, + RocksCfOptions, RocksDbOptions, }; #[allow(clippy::many_single_char_names)] @@ -566,15 +591,18 @@ mod tests { ("g", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2, 1), ("h", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8, 1), ("i", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 4, 1), - // handle "i": size(size = DISTANCE / 8 * 9 + 4, offset = DISTANCE / 8 * 17 + 9),keys(4,5) + // handle "i": size(size = DISTANCE / 8 * 9 + 4, offset = DISTANCE / 8 * 17 + + // 9),keys(4,5) ("j", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2, 1), ("k", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2, 1), // handle "k": size(size = DISTANCE + 2, offset = DISTANCE / 8 * 25 + 11),keys(2,11) ("l", 0, DEFAULT_PROP_KEYS_INDEX_DISTANCE / 2), ("m", 0, DEFAULT_PROP_KEYS_INDEX_DISTANCE / 2), - //handle "m": keys = DEFAULT_PROP_KEYS_INDEX_DISTANCE,offset = 11+DEFAULT_PROP_KEYS_INDEX_DISTANCE + // handle "m": keys = 
DEFAULT_PROP_KEYS_INDEX_DISTANCE,offset = + // 11+DEFAULT_PROP_KEYS_INDEX_DISTANCE ("n", 1, DEFAULT_PROP_KEYS_INDEX_DISTANCE), - //handle "n": keys = DEFAULT_PROP_KEYS_INDEX_DISTANCE, offset = 11+2*DEFAULT_PROP_KEYS_INDEX_DISTANCE + // handle "n": keys = DEFAULT_PROP_KEYS_INDEX_DISTANCE, offset = + // 11+2*DEFAULT_PROP_KEYS_INDEX_DISTANCE ("o", 1, 1), // handle "o": keys = 1, offset = 12 + 2*DEFAULT_PROP_KEYS_INDEX_DISTANCE ]; @@ -665,7 +693,8 @@ mod tests { ("g", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2), ("h", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8), ("i", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 4), - // handle "i": size(size = DISTANCE / 8 * 9 + 4, offset = DISTANCE / 8 * 17 + 9),keys(4,5) + // handle "i": size(size = DISTANCE / 8 * 9 + 4, offset = DISTANCE / 8 * 17 + + // 9),keys(4,5) ("j", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2), ("k", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2), // handle "k": size(size = DISTANCE + 2, offset = DISTANCE / 8 * 25 + 11),keys(2,11) @@ -714,18 +743,15 @@ mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::new(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = RocksDbOptions::default(); + let mut cf_opts = RocksCfOptions::default(); cf_opts.set_level_zero_file_num_compaction_trigger(10); cf_opts.add_table_properties_collector_factory( "tikv.mvcc-properties-collector", MvccPropertiesCollectorFactory::default(), ); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); - let db = Arc::new(crate::raw_util::new_engine_opt(path_str, db_opts, cfs_opts).unwrap()); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); + let db = crate::util::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); let cases = ["a", "b", "c"]; for &key in &cases { @@ -734,22 +760,21 @@ mod tests { .append_ts(2.into()) .as_encoded(), ); - let write_cf = db.cf_handle(CF_WRITE).unwrap(); - db.put_cf(write_cf, &k1, b"v1").unwrap(); - 
db.delete_cf(write_cf, &k1).unwrap(); + db.put_cf(CF_WRITE, &k1, b"v1").unwrap(); + db.delete_cf(CF_WRITE, &k1).unwrap(); let key = keys::data_key( Key::from_raw(key.as_bytes()) .append_ts(3.into()) .as_encoded(), ); - db.put_cf(write_cf, &key, b"v2").unwrap(); - db.flush_cf(write_cf, true).unwrap(); + db.put_cf(CF_WRITE, &key, b"v2").unwrap(); + db.flush_cf(CF_WRITE, true).unwrap(); } let start_keys = keys::data_key(&[]); let end_keys = keys::data_end_key(&[]); let (entries, versions) = - get_range_entries_and_versions(db.c(), CF_WRITE, &start_keys, &end_keys).unwrap(); + get_range_entries_and_versions(&db, CF_WRITE, &start_keys, &end_keys).unwrap(); assert_eq!(entries, (cases.len() * 2) as u64); assert_eq!(versions, cases.len() as u64); } @@ -767,7 +792,7 @@ mod tests { ("ef", 6, WriteType::Put, DBEntryType::Delete), ("gh", 7, WriteType::Delete, DBEntryType::Put), ]; - let mut collector = MvccPropertiesCollector::new(); + let mut collector = MvccPropertiesCollector::new(KeyMode::Txn); for &(key, ts, write_type, entry_type) in &cases { let ts = ts.into(); let k = Key::from_raw(key.as_bytes()).append_ts(ts); @@ -786,6 +811,42 @@ mod tests { assert_eq!(props.max_row_versions, 3); } + #[test] + fn test_mvcc_properties_rawkv_mode() { + let test_raws = vec![ + (b"r\0a", 1, false, u64::MAX), + (b"r\0a", 5, false, u64::MAX), + (b"r\0a", 7, false, u64::MAX), + (b"r\0b", 1, false, u64::MAX), + (b"r\0b", 1, true, u64::MAX), + (b"r\0c", 1, true, 10), + (b"r\0d", 1, true, 10), + ]; + + let mut collector = MvccPropertiesCollector::new(KeyMode::Raw); + for &(key, ts, is_delete, expire_ts) in &test_raws { + let encode_key = ApiV2::encode_raw_key(key, Some(ts.into())); + let k = keys::data_key(encode_key.as_encoded()); + let v = ApiV2::encode_raw_value(RawValue { + user_value: &[0; 10][..], + expire_ts: Some(expire_ts), + is_delete, + }); + collector.add(&k, &v, DBEntryType::Put, 0, 0); + } + + let result = UserProperties(collector.finish()); + + let props = 
RocksMvccProperties::decode(&result).unwrap(); + assert_eq!(props.min_ts, 1.into()); + assert_eq!(props.max_ts, 7.into()); + assert_eq!(props.num_rows, 4); + assert_eq!(props.num_deletes, 3); + assert_eq!(props.num_puts, 4); + assert_eq!(props.num_versions, 7); + assert_eq!(props.max_row_versions, 3); + } + #[bench] fn bench_mvcc_properties(b: &mut Bencher) { let ts = 1.into(); @@ -799,7 +860,7 @@ mod tests { entries.push((k, w.as_ref().to_bytes())); } - let mut collector = MvccPropertiesCollector::new(); + let mut collector = MvccPropertiesCollector::new(KeyMode::Txn); b.iter(|| { for &(ref k, ref v) in &entries { collector.add(k, v, DBEntryType::Put, 0, 0); diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index e081d057191..a0a5acd5dd8 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -3,15 +3,20 @@ // #[PerformanceCriticalPath] use engine_traits::{ Error, Iterable, KvEngine, MiscExt, Mutable, Peekable, RaftEngine, RaftEngineDebug, - RaftEngineReadOnly, RaftLogBatch, RaftLogGCTask, Result, SyncMutable, WriteBatch, - WriteBatchExt, WriteOptions, CF_DEFAULT, RAFT_LOG_MULTI_GET_CNT, + RaftEngineReadOnly, RaftLogBatch, Result, WriteBatch, WriteBatchExt, WriteOptions, CF_DEFAULT, + RAFT_LOG_MULTI_GET_CNT, +}; +use kvproto::{ + metapb::Region, + raft_serverpb::{ + RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent, StoreRecoverState, + }, }; -use kvproto::raft_serverpb::RaftLocalState; use protobuf::Message; use raft::eraftpb::Entry; use tikv_util::{box_err, box_try}; -use crate::{util, RocksEngine, RocksWriteBatch}; +use crate::{util, RocksEngine, RocksWriteBatchVec}; impl RaftEngineReadOnly for RocksEngine { fn get_raft_state(&self, raft_group_id: u64) -> Result> { @@ -35,7 +40,8 @@ impl RaftEngineReadOnly for RocksEngine { let (max_size, mut total_size, mut count) = (max_size.unwrap_or(usize::MAX), 0, 0); if high - low <= RAFT_LOG_MULTI_GET_CNT { - 
// If election happens in inactive regions, they will just try to fetch one empty log. + // If election happens in inactive regions, they will just try to fetch one + // empty log. for i in low..high { if total_size > 0 && total_size >= max_size { break; @@ -61,6 +67,7 @@ impl RaftEngineReadOnly for RocksEngine { let start_key = keys::raft_log_key(region_id, low); let end_key = keys::raft_log_key(region_id, high); self.scan( + CF_DEFAULT, &start_key, &end_key, true, // fill_cache @@ -105,6 +112,7 @@ impl RaftEngineReadOnly for RocksEngine { let start_key = keys::raft_log_key(region_id, 0); let end_key = keys::raft_log_key(region_id, u64::MAX); self.scan( + CF_DEFAULT, &start_key, &end_key, false, // fill_cache @@ -117,6 +125,54 @@ impl RaftEngineReadOnly for RocksEngine { )?; Ok(()) } + + fn is_empty(&self) -> Result { + let mut is_empty = true; + self.scan(CF_DEFAULT, b"", b"", false, |_, _| { + is_empty = false; + Ok(false) + })?; + + Ok(is_empty) + } + + fn get_store_ident(&self) -> Result> { + self.get_msg_cf(CF_DEFAULT, keys::STORE_IDENT_KEY) + } + + fn get_prepare_bootstrap_region(&self) -> Result> { + self.get_msg_cf(CF_DEFAULT, keys::PREPARE_BOOTSTRAP_KEY) + } + + // Following methods are used by raftstore v2 only, which always use raft log + // engine. 
+ fn get_region_state( + &self, + _raft_group_id: u64, + _apply_index: u64, + ) -> Result> { + panic!() + } + + fn get_apply_state( + &self, + _raft_group_id: u64, + _apply_index: u64, + ) -> Result> { + panic!() + } + + fn get_flushed_index(&self, _raft_group_id: u64, _cf: &str) -> Result> { + panic!() + } + + fn get_dirty_mark(&self, _raft_group_id: u64, _tablet_index: u64) -> Result { + panic!() + } + + fn get_recover_state(&self) -> Result> { + self.get_msg_cf(CF_DEFAULT, keys::RECOVER_STATE_KEY) + } } impl RaftEngineDebug for RocksEngine { @@ -127,6 +183,7 @@ impl RaftEngineDebug for RocksEngine { let start_key = keys::raft_log_key(raft_group_id, 0); let end_key = keys::raft_log_key(raft_group_id, u64::MAX); self.scan( + CF_DEFAULT, &start_key, &end_key, false, // fill_cache @@ -145,12 +202,12 @@ impl RocksEngine { raft_group_id: u64, mut from: u64, to: u64, - raft_wb: &mut RocksWriteBatch, + raft_wb: &mut RocksWriteBatchVec, ) -> Result { if from == 0 { let start_key = keys::raft_log_key(raft_group_id, 0); let prefix = keys::raft_log_prefix(raft_group_id); - match self.seek(&start_key)? { + match self.seek(CF_DEFAULT, &start_key)? { Some((k, _)) if k.starts_with(&prefix) => from = box_try!(keys::raft_log_index(&k)), // No need to gc. _ => return Ok(0), @@ -176,10 +233,10 @@ impl RocksEngine { // for all KvEngines, but is currently implemented separately for // every engine. 
impl RaftEngine for RocksEngine { - type LogBatch = RocksWriteBatch; + type LogBatch = RocksWriteBatchVec; fn log_batch(&self, capacity: usize) -> Self::LogBatch { - RocksWriteBatch::with_capacity(self, capacity) + RocksWriteBatchVec::with_unit_capacity(self, capacity) } fn sync(&self) -> Result<()> { @@ -217,11 +274,13 @@ impl RaftEngine for RocksEngine { batch: &mut Self::LogBatch, ) -> Result<()> { batch.delete(&keys::raft_state_key(raft_group_id))?; + batch.delete(&keys::region_state_key(raft_group_id))?; + batch.delete(&keys::apply_state_key(raft_group_id))?; if first_index == 0 { let seek_key = keys::raft_log_key(raft_group_id, 0); let prefix = keys::raft_log_prefix(raft_group_id); fail::fail_point!("engine_rocks_raft_engine_clean_seek", |_| Ok(())); - if let Some((key, _)) = self.seek(&seek_key)? { + if let Some((key, _)) = self.seek(CF_DEFAULT, &seek_key)? { if !key.starts_with(&prefix) { // No raft logs for the raft group. return Ok(()); @@ -243,56 +302,24 @@ impl RaftEngine for RocksEngine { Ok(()) } - fn append(&self, raft_group_id: u64, entries: Vec) -> Result { - let mut wb = self.write_batch(); - let buf = Vec::with_capacity(1024); - wb.append_impl(raft_group_id, &entries, buf)?; - self.consume(&mut wb, false) - } - - fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { - self.put_msg(&keys::raft_state_key(raft_group_id), state) - } - - fn batch_gc(&self, groups: Vec) -> Result { - let mut total = 0; - let mut raft_wb = self.write_batch_with_cap(4 * 1024); - for task in groups { - total += self.gc_impl(task.raft_group_id, task.from, task.to, &mut raft_wb)?; - } - // TODO: disable WAL here. - if !WriteBatch::is_empty(&raft_wb) { - raft_wb.write()?; - } - Ok(total) - } - - fn gc(&self, raft_group_id: u64, from: u64, to: u64) -> Result { - let mut raft_wb = self.write_batch_with_cap(1024); - let total = self.gc_impl(raft_group_id, from, to, &mut raft_wb)?; - // TODO: disable WAL here. 
- if !WriteBatch::is_empty(&raft_wb) { - raft_wb.write()?; - } - Ok(total) - } - - fn purge_expired_files(&self) -> Result> { - Ok(vec![]) + fn gc(&self, raft_group_id: u64, from: u64, to: u64, batch: &mut Self::LogBatch) -> Result<()> { + self.gc_impl(raft_group_id, from, to, batch)?; + Ok(()) } - fn has_builtin_entry_cache(&self) -> bool { - false + fn delete_all_but_one_states_before( + &self, + _raft_group_id: u64, + _apply_index: u64, + _batch: &mut Self::LogBatch, + ) -> Result<()> { + panic!() } fn flush_metrics(&self, instance: &str) { KvEngine::flush_metrics(self, instance) } - fn reset_statistics(&self) { - KvEngine::reset_statistics(self) - } - fn dump_stats(&self) -> Result { MiscExt::dump_stats(self) } @@ -303,10 +330,54 @@ impl RaftEngine for RocksEngine { Ok(used_size) } + + fn get_engine_path(&self) -> &str { + self.as_inner().path() + } + + fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> + where + F: FnMut(u64) -> std::result::Result<(), E>, + E: From, + { + let start_key = keys::REGION_META_MIN_KEY; + let end_key = keys::REGION_META_MAX_KEY; + let mut err = None; + self.scan(CF_DEFAULT, start_key, end_key, false, |key, _| { + let (region_id, suffix) = box_try!(keys::decode_region_meta_key(key)); + if suffix != keys::REGION_STATE_SUFFIX { + return Ok(true); + } + + match f(region_id) { + Ok(()) => Ok(true), + Err(e) => { + err = Some(e); + Ok(false) + } + } + })?; + match err { + None => Ok(()), + Some(e) => Err(e), + } + } } -impl RaftLogBatch for RocksWriteBatch { - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { +impl RaftLogBatch for RocksWriteBatchVec { + fn append( + &mut self, + raft_group_id: u64, + overwrite_to: Option, + entries: Vec, + ) -> Result<()> { + let overwrite_to = overwrite_to.unwrap_or(0); + if let Some(last) = entries.last() && last.get_index() + 1 < overwrite_to { + for index in last.get_index() + 1..overwrite_to { + let key = keys::raft_log_key(raft_group_id, index); + 
self.delete(&key).unwrap(); + } + } if let Some(max_size) = entries.iter().map(|e| e.compute_size()).max() { let ser_buf = Vec::with_capacity(max_size as usize); return self.append_impl(raft_group_id, &entries, ser_buf); @@ -314,13 +385,6 @@ impl RaftLogBatch for RocksWriteBatch { Ok(()) } - fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64) { - for index in from..to { - let key = keys::raft_log_key(raft_group_id, index); - self.delete(&key).unwrap(); - } - } - fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { self.put_msg(&keys::raft_state_key(raft_group_id), state) } @@ -336,9 +400,64 @@ impl RaftLogBatch for RocksWriteBatch { fn merge(&mut self, src: Self) -> Result<()> { WriteBatch::merge(self, src) } + + fn put_store_ident(&mut self, ident: &StoreIdent) -> Result<()> { + self.put_msg(keys::STORE_IDENT_KEY, ident) + } + + fn put_prepare_bootstrap_region(&mut self, region: &Region) -> Result<()> { + self.put_msg(keys::PREPARE_BOOTSTRAP_KEY, region) + } + + fn remove_prepare_bootstrap_region(&mut self) -> Result<()> { + self.delete(keys::PREPARE_BOOTSTRAP_KEY) + } + + // Following methods are used by raftstore v2 only, which always use raft log + // engine. 
+ fn put_region_state( + &mut self, + _raft_group_id: u64, + _apply_index: u64, + _state: &RegionLocalState, + ) -> Result<()> { + panic!() + } + + fn put_apply_state( + &mut self, + _raft_group_id: u64, + _apply_index: u64, + _state: &RaftApplyState, + ) -> Result<()> { + panic!() + } + + fn put_flushed_index( + &mut self, + _raft_group_id: u64, + _cf: &str, + _tablet_index: u64, + _apply_index: u64, + ) -> Result<()> { + panic!() + } + + fn put_dirty_mark( + &mut self, + _raft_group_id: u64, + _tablet_index: u64, + _dirty: bool, + ) -> Result<()> { + panic!() + } + + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { + self.put_msg(keys::RECOVER_STATE_KEY, state) + } } -impl RocksWriteBatch { +impl RocksWriteBatchVec { fn append_impl( &mut self, raft_group_id: u64, diff --git a/components/engine_rocks/src/range_properties.rs b/components/engine_rocks/src/range_properties.rs index fcd0d2fa863..101a004982a 100644 --- a/components/engine_rocks/src/range_properties.rs +++ b/components/engine_rocks/src/range_properties.rs @@ -58,10 +58,10 @@ impl RangePropertiesExt for RocksEngine { let keys = props.get_approximate_keys_in_range(start_key, end_key); format!( "{}:{}", - Path::new(&*k) + Path::new(k) .file_name() .map(|f| f.to_str().unwrap()) - .unwrap_or(&*k), + .unwrap_or(k), keys ) }) @@ -118,10 +118,10 @@ impl RangePropertiesExt for RocksEngine { let size = props.get_approximate_size_in_range(start_key, end_key); format!( "{}:{}", - Path::new(&*k) + Path::new(k) .file_name() .map(|f| f.to_str().unwrap()) - .unwrap_or(&*k), + .unwrap_or(k), size ) }) @@ -191,8 +191,8 @@ impl RangePropertiesExt for RocksEngine { const SAMPLING_THRESHOLD: usize = 20000; const SAMPLE_RATIO: usize = 1000; - // If there are too many keys, reduce its amount before sorting, or it may take too much - // time to sort the keys. + // If there are too many keys, reduce its amount before sorting, or it may take + // too much time to sort the keys. 
if keys.len() > SAMPLING_THRESHOLD { let len = keys.len(); keys = keys.into_iter().step_by(len / SAMPLE_RATIO).collect(); @@ -204,7 +204,8 @@ impl RangePropertiesExt for RocksEngine { return Ok(keys); } - // Find `key_count` keys which divides the whole range into `parts` parts evenly. + // Find `key_count` keys which divides the whole range into `parts` parts + // evenly. let mut res = Vec::with_capacity(key_count); let section_len = (keys.len() as f64) / ((key_count + 1) as f64); for i in 1..=key_count { diff --git a/components/engine_rocks/src/raw.rs b/components/engine_rocks/src/raw.rs index 145931743dd..474137534f8 100644 --- a/components/engine_rocks/src/raw.rs +++ b/components/engine_rocks/src/raw.rs @@ -7,14 +7,13 @@ //! crate, but only until the engine interface is completely abstracted. pub use rocksdb::{ - new_compaction_filter_raw, run_ldb_tool, run_sst_dump_tool, BlockBasedOptions, CFHandle, Cache, - ColumnFamilyOptions, CompactOptions, CompactionFilter, CompactionFilterContext, - CompactionFilterDecision, CompactionFilterFactory, CompactionFilterValueType, - CompactionJobInfo, CompactionOptions, CompactionPriority, DBBottommostLevelCompaction, - DBCompactionFilter, DBCompactionStyle, DBCompressionType, DBEntryType, DBInfoLogLevel, - DBIterator, DBOptions, DBRateLimiterMode, DBRecoveryMode, DBStatisticsTickerType, - DBTitanDBBlobRunMode, Env, EventListener, IngestExternalFileOptions, LRUCacheOptions, - MemoryAllocator, PerfContext, Range, ReadOptions, SeekKey, SliceTransform, TableFilter, - TablePropertiesCollector, TablePropertiesCollectorFactory, TitanBlobIndex, TitanDBOptions, - Writable, WriteOptions, DB, + run_ldb_tool, run_sst_dump_tool, BlockBasedOptions, Cache, ChecksumType, CompactOptions, + CompactionFilter, CompactionFilterContext, CompactionFilterDecision, CompactionFilterFactory, + CompactionFilterValueType, CompactionJobInfo, CompactionOptions, CompactionPriority, + ConcurrentTaskLimiter, DBBottommostLevelCompaction, DBCompactionFilter, 
DBCompactionStyle, + DBCompressionType, DBEntryType, DBRateLimiterMode, DBRecoveryMode, DBStatisticsTickerType, + DBTableFileCreationReason, DBTitanDBBlobRunMode, Env, EventListener, IngestExternalFileOptions, + LRUCacheOptions, MemoryAllocator, PerfContext, PrepopulateBlockCache, Range, RateLimiter, + SliceTransform, Statistics, TablePropertiesCollector, TablePropertiesCollectorFactory, + WriteBufferManager, }; diff --git a/components/engine_rocks/src/raw_util.rs b/components/engine_rocks/src/raw_util.rs deleted file mode 100644 index a9f1fcda781..00000000000 --- a/components/engine_rocks/src/raw_util.rs +++ /dev/null @@ -1,331 +0,0 @@ -// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. - -//! Functions for constructing the rocksdb crate's `DB` type -//! -//! These are an artifact of refactoring the engine traits and will go away -//! eventually. Prefer to use the versions in the `util` module. - -use std::{fs, path::Path, sync::Arc}; - -use engine_traits::{Result, CF_DEFAULT}; -use rocksdb::{ - load_latest_options, CColumnFamilyDescriptor, ColumnFamilyOptions, DBOptions, Env, DB, -}; -use tikv_util::warn; - -pub struct CFOptions<'a> { - cf: &'a str, - options: ColumnFamilyOptions, -} - -impl<'a> CFOptions<'a> { - pub fn new(cf: &'a str, options: ColumnFamilyOptions) -> CFOptions<'a> { - CFOptions { cf, options } - } -} - -pub fn new_engine( - path: &str, - db_opts: Option, - cfs: &[&str], - opts: Option>>, -) -> Result { - let mut db_opts = match db_opts { - Some(opt) => opt, - None => DBOptions::new(), - }; - db_opts.enable_statistics(true); - let cf_opts = match opts { - Some(opts_vec) => opts_vec, - None => { - let mut default_cfs_opts = Vec::with_capacity(cfs.len()); - for cf in cfs { - default_cfs_opts.push(CFOptions::new(*cf, ColumnFamilyOptions::new())); - } - default_cfs_opts - } - }; - new_engine_opt(path, db_opts, cf_opts) -} - -/// Turns "dynamic level size" off for the existing column family which was off before. 
-/// Column families are small, HashMap isn't necessary. -fn adjust_dynamic_level_bytes( - cf_descs: &[CColumnFamilyDescriptor], - cf_options: &mut CFOptions<'_>, -) { - if let Some(cf_desc) = cf_descs - .iter() - .find(|cf_desc| cf_desc.name() == cf_options.cf) - { - let existed_dynamic_level_bytes = - cf_desc.options().get_level_compaction_dynamic_level_bytes(); - if existed_dynamic_level_bytes - != cf_options - .options - .get_level_compaction_dynamic_level_bytes() - { - warn!( - "change dynamic_level_bytes for existing column family is danger"; - "old_value" => existed_dynamic_level_bytes, - "new_value" => cf_options.options.get_level_compaction_dynamic_level_bytes(), - ); - } - cf_options - .options - .set_level_compaction_dynamic_level_bytes(existed_dynamic_level_bytes); - } -} - -pub fn new_engine_opt( - path: &str, - mut db_opt: DBOptions, - cfs_opts: Vec>, -) -> Result { - // Creates a new db if it doesn't exist. - if !db_exist(path) { - db_opt.create_if_missing(true); - - let mut cfs_v = vec![]; - let mut cf_opts_v = vec![]; - if let Some(x) = cfs_opts.iter().find(|x| x.cf == CF_DEFAULT) { - cfs_v.push(x.cf); - cf_opts_v.push(x.options.clone()); - } - let mut db = DB::open_cf(db_opt, path, cfs_v.into_iter().zip(cf_opts_v).collect())?; - for x in cfs_opts { - if x.cf == CF_DEFAULT { - continue; - } - db.create_cf((x.cf, x.options))?; - } - - return Ok(db); - } - - db_opt.create_if_missing(false); - - // Lists all column families in current db. - let cfs_list = DB::list_column_families(&db_opt, path)?; - let existed: Vec<&str> = cfs_list.iter().map(|v| v.as_str()).collect(); - let needed: Vec<&str> = cfs_opts.iter().map(|x| x.cf).collect(); - - let cf_descs = if !existed.is_empty() { - let env = match db_opt.env() { - Some(env) => env, - None => Arc::new(Env::default()), - }; - // panic if OPTIONS not found for existing instance? 
- let (_, tmp) = load_latest_options(path, &env, true) - .unwrap_or_else(|e| panic!("failed to load_latest_options {:?}", e)) - .unwrap_or_else(|| panic!("couldn't find the OPTIONS file")); - tmp - } else { - vec![] - }; - - // If all column families exist, just open db. - if existed == needed { - let mut cfs_v = vec![]; - let mut cfs_opts_v = vec![]; - for mut x in cfs_opts { - adjust_dynamic_level_bytes(&cf_descs, &mut x); - cfs_v.push(x.cf); - cfs_opts_v.push(x.options); - } - - let db = DB::open_cf(db_opt, path, cfs_v.into_iter().zip(cfs_opts_v).collect())?; - return Ok(db); - } - - // Opens db. - let mut cfs_v: Vec<&str> = Vec::new(); - let mut cfs_opts_v: Vec = Vec::new(); - for cf in &existed { - cfs_v.push(cf); - match cfs_opts.iter().find(|x| x.cf == *cf) { - Some(x) => { - let mut tmp = CFOptions::new(x.cf, x.options.clone()); - adjust_dynamic_level_bytes(&cf_descs, &mut tmp); - cfs_opts_v.push(tmp.options); - } - None => { - cfs_opts_v.push(ColumnFamilyOptions::new()); - } - } - } - let cfds = cfs_v.into_iter().zip(cfs_opts_v).collect(); - let mut db = DB::open_cf(db_opt, path, cfds)?; - - // Drops discarded column families. - // for cf in existed.iter().filter(|x| needed.iter().find(|y| y == x).is_none()) { - for cf in cfs_diff(&existed, &needed) { - // Never drop default column families. - if cf != CF_DEFAULT { - db.drop_cf(cf)?; - } - } - - // Creates needed column families if they don't exist. - for cf in cfs_diff(&needed, &existed) { - db.create_cf(( - cf, - cfs_opts - .iter() - .find(|x| x.cf == cf) - .unwrap() - .options - .clone(), - ))?; - } - Ok(db) -} - -pub fn db_exist(path: &str) -> bool { - let path = Path::new(path); - if !path.exists() || !path.is_dir() { - return false; - } - let current_file_path = path.join("CURRENT"); - if !current_file_path.exists() || !current_file_path.is_file() { - return false; - } - - // If path is not an empty directory, and current file exists, we say db exists. 
If path is not an empty directory - // but db has not been created, `DB::list_column_families` fails and we can clean up - // the directory by this indication. - fs::read_dir(&path).unwrap().next().is_some() -} - -/// Returns a Vec of cf which is in `a' but not in `b'. -fn cfs_diff<'a>(a: &[&'a str], b: &[&str]) -> Vec<&'a str> { - a.iter() - .filter(|x| !b.iter().any(|y| *x == y)) - .cloned() - .collect() -} - -pub fn to_raw_perf_level(level: engine_traits::PerfLevel) -> rocksdb::PerfLevel { - match level { - engine_traits::PerfLevel::Uninitialized => rocksdb::PerfLevel::Uninitialized, - engine_traits::PerfLevel::Disable => rocksdb::PerfLevel::Disable, - engine_traits::PerfLevel::EnableCount => rocksdb::PerfLevel::EnableCount, - engine_traits::PerfLevel::EnableTimeExceptForMutex => { - rocksdb::PerfLevel::EnableTimeExceptForMutex - } - engine_traits::PerfLevel::EnableTimeAndCPUTimeExceptForMutex => { - rocksdb::PerfLevel::EnableTimeAndCPUTimeExceptForMutex - } - engine_traits::PerfLevel::EnableTime => rocksdb::PerfLevel::EnableTime, - engine_traits::PerfLevel::OutOfBounds => rocksdb::PerfLevel::OutOfBounds, - } -} - -pub fn from_raw_perf_level(level: rocksdb::PerfLevel) -> engine_traits::PerfLevel { - match level { - rocksdb::PerfLevel::Uninitialized => engine_traits::PerfLevel::Uninitialized, - rocksdb::PerfLevel::Disable => engine_traits::PerfLevel::Disable, - rocksdb::PerfLevel::EnableCount => engine_traits::PerfLevel::EnableCount, - rocksdb::PerfLevel::EnableTimeExceptForMutex => { - engine_traits::PerfLevel::EnableTimeExceptForMutex - } - rocksdb::PerfLevel::EnableTimeAndCPUTimeExceptForMutex => { - engine_traits::PerfLevel::EnableTimeAndCPUTimeExceptForMutex - } - rocksdb::PerfLevel::EnableTime => engine_traits::PerfLevel::EnableTime, - rocksdb::PerfLevel::OutOfBounds => engine_traits::PerfLevel::OutOfBounds, - } -} - -#[cfg(test)] -mod tests { - use engine_traits::CF_DEFAULT; - use rocksdb::{ColumnFamilyOptions, DBOptions, DB}; - use tempfile::Builder; - - 
use super::*; - - #[test] - fn test_cfs_diff() { - let a = vec!["1", "2", "3"]; - let a_diff_a = cfs_diff(&a, &a); - assert!(a_diff_a.is_empty()); - let b = vec!["4"]; - assert_eq!(a, cfs_diff(&a, &b)); - let c = vec!["4", "5", "3", "6"]; - assert_eq!(vec!["1", "2"], cfs_diff(&a, &c)); - assert_eq!(vec!["4", "5", "6"], cfs_diff(&c, &a)); - let d = vec!["1", "2", "3", "4"]; - let a_diff_d = cfs_diff(&a, &d); - assert!(a_diff_d.is_empty()); - assert_eq!(vec!["4"], cfs_diff(&d, &a)); - } - - #[test] - fn test_new_engine_opt() { - let path = Builder::new() - .prefix("_util_rocksdb_test_check_column_families") - .tempdir() - .unwrap(); - let path_str = path.path().to_str().unwrap(); - - // create db when db not exist - let mut cfs_opts = vec![CFOptions::new(CF_DEFAULT, ColumnFamilyOptions::new())]; - let mut opts = ColumnFamilyOptions::new(); - opts.set_level_compaction_dynamic_level_bytes(true); - cfs_opts.push(CFOptions::new("cf_dynamic_level_bytes", opts.clone())); - { - let mut db = new_engine_opt(path_str, DBOptions::new(), cfs_opts).unwrap(); - column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes"]); - check_dynamic_level_bytes(&mut db); - } - - // add cf1. - let cfs_opts = vec![ - CFOptions::new(CF_DEFAULT, opts.clone()), - CFOptions::new("cf_dynamic_level_bytes", opts.clone()), - CFOptions::new("cf1", opts), - ]; - { - let mut db = new_engine_opt(path_str, DBOptions::new(), cfs_opts).unwrap(); - column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"]); - check_dynamic_level_bytes(&mut db); - } - - // drop cf1. 
- let cfs_opts = vec![ - CFOptions::new(CF_DEFAULT, ColumnFamilyOptions::new()), - CFOptions::new("cf_dynamic_level_bytes", ColumnFamilyOptions::new()), - ]; - { - let mut db = new_engine_opt(path_str, DBOptions::new(), cfs_opts).unwrap(); - column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes"]); - check_dynamic_level_bytes(&mut db); - } - - // never drop default cf - let cfs_opts = vec![]; - new_engine_opt(path_str, DBOptions::new(), cfs_opts).unwrap(); - column_families_must_eq(path_str, vec![CF_DEFAULT]); - } - - fn column_families_must_eq(path: &str, excepted: Vec<&str>) { - let opts = DBOptions::new(); - let cfs_list = DB::list_column_families(&opts, path).unwrap(); - - let mut cfs_existed: Vec<&str> = cfs_list.iter().map(|v| v.as_str()).collect(); - let mut cfs_excepted: Vec<&str> = excepted.clone(); - cfs_existed.sort_unstable(); - cfs_excepted.sort_unstable(); - assert_eq!(cfs_existed, cfs_excepted); - } - - fn check_dynamic_level_bytes(db: &mut DB) { - let cf_default = db.cf_handle(CF_DEFAULT).unwrap(); - let tmp_cf_opts = db.get_options_cf(cf_default); - assert!(!tmp_cf_opts.get_level_compaction_dynamic_level_bytes()); - let cf_test = db.cf_handle("cf_dynamic_level_bytes").unwrap(); - let tmp_cf_opts = db.get_options_cf(cf_test); - assert!(tmp_cf_opts.get_level_compaction_dynamic_level_bytes()); - } -} diff --git a/components/engine_rocks/src/rocks_metrics.rs b/components/engine_rocks/src/rocks_metrics.rs index 1ce4063298e..522696cb150 100644 --- a/components/engine_rocks/src/rocks_metrics.rs +++ b/components/engine_rocks/src/rocks_metrics.rs @@ -1,14 +1,15 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::CF_DEFAULT; +use collections::HashMap; +use engine_traits::{StatisticsReporter, CF_DEFAULT}; use lazy_static::lazy_static; use prometheus::*; use prometheus_static_metric::*; use rocksdb::{ - DBStatisticsHistogramType as HistType, DBStatisticsTickerType as TickerType, HistogramData, DB, + DBStatisticsHistogramType as HistType, DBStatisticsTickerType as TickerType, HistogramData, }; -use crate::rocks_metrics_defs::*; +use crate::{engine::RocksEngine, rocks_metrics_defs::*, RocksStatistics}; make_auto_flush_static_metric! { pub label_enum TickerName { @@ -581,12 +582,6 @@ pub fn flush_engine_ticker_metrics(t: TickerType, value: u64, name: &str) { .discardable .inc_by(value); } - TickerType::TitanGcSample => { - STORE_ENGINE_BLOB_GC_ACTION - .get(name_enum) - .sample - .inc_by(value); - } TickerType::TitanGcSmallFile => { STORE_ENGINE_BLOB_GC_ACTION .get(name_enum) @@ -611,6 +606,7 @@ pub fn flush_engine_ticker_metrics(t: TickerType, value: u64, name: &str) { .trigger_next .inc_by(value); } + // TODO: Some tickers are ignored. 
_ => {} } } @@ -910,214 +906,351 @@ pub fn flush_engine_histogram_metrics(t: HistType, value: HistogramData, name: & } } -pub fn flush_engine_iostall_properties(engine: &DB, name: &str) { - let stall_num = ROCKSDB_IOSTALL_KEY.len(); - let mut counter = vec![0; stall_num]; - for cf in engine.cf_names() { - let handle = crate::util::get_cf_handle(engine, cf).unwrap(); - if let Some(info) = engine.get_map_property_cf(handle, ROCKSDB_CFSTATS) { - for i in 0..stall_num { - let value = info.get_property_int_value(ROCKSDB_IOSTALL_KEY[i]); - counter[i] += value as i64; - } - } else { - return; - } - } - for i in 0..stall_num { - STORE_ENGINE_WRITE_STALL_REASON_GAUGE_VEC - .with_label_values(&[name, ROCKSDB_IOSTALL_TYPE[i]]) - .set(counter[i]); - } +#[derive(Default, Clone)] +struct CfLevelStats { + num_files: Option, + // sum(compression_ratio_i * num_files_i) + weighted_compression_ratio: Option, + num_blob_files: Option, } -pub fn flush_engine_properties(engine: &DB, name: &str, shared_block_cache: bool) { - for cf in engine.cf_names() { - let handle = crate::util::get_cf_handle(engine, cf).unwrap(); - // It is important to monitor each cf's size, especially the "raft" and "lock" column - // families. - let cf_used_size = crate::util::get_engine_cf_used_size(engine, handle); - STORE_ENGINE_SIZE_GAUGE_VEC - .with_label_values(&[name, cf]) - .set(cf_used_size as i64); - - if !shared_block_cache { - let block_cache_usage = engine.get_block_cache_usage_cf(handle); - STORE_ENGINE_BLOCK_CACHE_USAGE_GAUGE_VEC - .with_label_values(&[name, cf]) - .set(block_cache_usage as i64); - } - - let blob_cache_usage = engine.get_blob_cache_usage_cf(handle); - STORE_ENGINE_BLOB_CACHE_USAGE_GAUGE_VEC - .with_label_values(&[name, cf]) - .set(blob_cache_usage as i64); - - // TODO: find a better place to record these metrics. 
- // Refer: https://github.com/facebook/rocksdb/wiki/Memory-usage-in-RocksDB - // For index and filter blocks memory - if let Some(readers_mem) = engine.get_property_int_cf(handle, ROCKSDB_TABLE_READERS_MEM) { - STORE_ENGINE_MEMORY_GAUGE_VEC - .with_label_values(&[name, cf, "readers-mem"]) - .set(readers_mem as i64); - } - - // For memtable - if let Some(mem_table) = engine.get_property_int_cf(handle, ROCKSDB_CUR_SIZE_ALL_MEM_TABLES) - { - STORE_ENGINE_MEMORY_GAUGE_VEC - .with_label_values(&[name, cf, "mem-tables"]) - .set(mem_table as i64); - } +#[derive(Default)] +struct CfStats { + used_size: Option, + blob_cache_size: Option, + readers_mem: Option, + mem_tables: Option, + num_keys: Option, + pending_compaction_bytes: Option, + num_immutable_mem_table: Option, + live_blob_size: Option, + num_live_blob_file: Option, + num_obsolete_blob_file: Option, + live_blob_file_size: Option, + obsolete_blob_file_size: Option, + blob_file_discardable_ratio_le0: Option, + blob_file_discardable_ratio_le20: Option, + blob_file_discardable_ratio_le50: Option, + blob_file_discardable_ratio_le80: Option, + blob_file_discardable_ratio_le100: Option, + levels: Vec, +} - // TODO: add cache usage and pinned usage. 
+#[derive(Default)] +struct DbStats { + num_snapshots: Option, + oldest_snapshot_time: Option, + block_cache_size: Option, + stall_num: Option<[u64; ROCKSDB_IOSTALL_KEY.len()]>, +} - if let Some(num_keys) = engine.get_property_int_cf(handle, ROCKSDB_ESTIMATE_NUM_KEYS) { - STORE_ENGINE_ESTIMATE_NUM_KEYS_VEC - .with_label_values(&[name, cf]) - .set(num_keys as i64); - } +pub struct RocksStatisticsReporter { + name: String, + db_stats: DbStats, + cf_stats: HashMap, +} - // Pending compaction bytes - if let Some(pending_compaction_bytes) = - crate::util::get_cf_pending_compaction_bytes(engine, handle) - { - STORE_ENGINE_PENDING_COMPACTION_BYTES_VEC - .with_label_values(&[name, cf]) - .set(pending_compaction_bytes as i64); +impl StatisticsReporter for RocksStatisticsReporter { + fn new(name: &str) -> Self { + Self { + name: name.to_owned(), + db_stats: DbStats::default(), + cf_stats: HashMap::default(), } + } - let opts = engine.get_options_cf(handle); - for level in 0..opts.get_num_levels() { - // Compression ratio at levels + fn collect(&mut self, engine: &RocksEngine) { + let db = engine.as_inner(); + for cf in db.cf_names() { + let cf_stats = self.cf_stats.entry(cf.to_owned()).or_default(); + let handle = crate::util::get_cf_handle(db, cf).unwrap(); + // It is important to monitor each cf's size, especially the "raft" and "lock" + // column families. + *cf_stats.used_size.get_or_insert_default() += + crate::util::get_engine_cf_used_size(db, handle); + *cf_stats.blob_cache_size.get_or_insert_default() += db.get_blob_cache_usage_cf(handle); + // TODO: find a better place to record these metrics. 
+ // Refer: https://github.com/facebook/rocksdb/wiki/Memory-usage-in-RocksDB + // For index and filter blocks memory + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TABLE_READERS_MEM) { + *cf_stats.readers_mem.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_CUR_SIZE_ALL_MEM_TABLES) { + *cf_stats.mem_tables.get_or_insert_default() += v; + } + // TODO: add cache usage and pinned usage. + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_ESTIMATE_NUM_KEYS) { + *cf_stats.num_keys.get_or_insert_default() += v; + } + if let Some(v) = crate::util::get_cf_pending_compaction_bytes(db, handle) { + *cf_stats.pending_compaction_bytes.get_or_insert_default() += v; + } + if let Some(v) = crate::util::get_cf_num_immutable_mem_table(db, handle) { + *cf_stats.num_immutable_mem_table.get_or_insert_default() += v; + } + // Titan. + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_LIVE_BLOB_SIZE) { + *cf_stats.live_blob_size.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_NUM_LIVE_BLOB_FILE) { + *cf_stats.num_live_blob_file.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_NUM_OBSOLETE_BLOB_FILE) + { + *cf_stats.num_obsolete_blob_file.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_LIVE_BLOB_FILE_SIZE) { + *cf_stats.live_blob_file_size.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_OBSOLETE_BLOB_FILE_SIZE) + { + *cf_stats.obsolete_blob_file_size.get_or_insert_default() += v; + } if let Some(v) = - crate::util::get_engine_compression_ratio_at_level(engine, handle, level) + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE0_FILE) { - STORE_ENGINE_COMPRESSION_RATIO_VEC - .with_label_values(&[name, cf, &level.to_string()]) - .set(v); + *cf_stats + .blob_file_discardable_ratio_le0 + 
.get_or_insert_default() += v; } - - // Num files at levels - if let Some(v) = crate::util::get_cf_num_files_at_level(engine, handle, level) { - STORE_ENGINE_NUM_FILES_AT_LEVEL_VEC - .with_label_values(&[name, cf, &level.to_string()]) - .set(v as i64); + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE20_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le20 + .get_or_insert_default() += v; } - - // Titan Num blob files at levels - if let Some(v) = crate::util::get_cf_num_blob_files_at_level(engine, handle, level) { - STORE_ENGINE_TITANDB_NUM_BLOB_FILES_AT_LEVEL_VEC - .with_label_values(&[name, cf, &level.to_string()]) - .set(v as i64); + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE50_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le50 + .get_or_insert_default() += v; + } + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE80_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le80 + .get_or_insert_default() += v; + } + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE100_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le100 + .get_or_insert_default() += v; + } + // Level stats. 
+ let opts = db.get_options_cf(handle); + if cf_stats.levels.len() < opts.get_num_levels() { + cf_stats + .levels + .resize(opts.get_num_levels(), CfLevelStats::default()); + } + for level in 0..opts.get_num_levels() { + if let Some(num_files) = crate::util::get_cf_num_files_at_level(db, handle, level) { + *cf_stats.levels[level].num_files.get_or_insert_default() += num_files; + if let Some(ratio) = + crate::util::get_engine_compression_ratio_at_level(db, handle, level) + { + *cf_stats.levels[level] + .weighted_compression_ratio + .get_or_insert_default() += num_files as f64 * ratio; + } + } + if let Some(v) = crate::util::get_cf_num_blob_files_at_level(db, handle, level) { + *cf_stats.levels[level] + .num_blob_files + .get_or_insert_default() += v; + } } - } - - // Num immutable mem-table - if let Some(v) = crate::util::get_cf_num_immutable_mem_table(engine, handle) { - STORE_ENGINE_NUM_IMMUTABLE_MEM_TABLE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); - } - // Titan live blob size - if let Some(v) = engine.get_property_int_cf(handle, ROCKSDB_TITANDB_LIVE_BLOB_SIZE) { - STORE_ENGINE_TITANDB_LIVE_BLOB_SIZE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); + if let Some(info) = db.get_map_property_cf(handle, ROCKSDB_CFSTATS) { + let stall_num = self.db_stats.stall_num.get_or_insert_default(); + for (key, val) in ROCKSDB_IOSTALL_KEY.iter().zip(stall_num) { + *val += info.get_property_int_value(key); + } + } } - // Titan num live blob file - if let Some(v) = engine.get_property_int_cf(handle, ROCKSDB_TITANDB_NUM_LIVE_BLOB_FILE) { - STORE_ENGINE_TITANDB_NUM_LIVE_BLOB_FILE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); + // For snapshot + *self.db_stats.num_snapshots.get_or_insert_default() += + db.get_property_int(ROCKSDB_NUM_SNAPSHOTS).unwrap_or(0); + let oldest_snapshot_time = + db.get_property_int(ROCKSDB_OLDEST_SNAPSHOT_TIME) + .map_or(0, |t| { + let now = time::get_time().sec as u64; + // RocksDB returns 0 if no snapshots. 
+ if t > 0 && now > t { now - t } else { 0 } + }); + if oldest_snapshot_time > self.db_stats.oldest_snapshot_time.unwrap_or(0) { + *self.db_stats.oldest_snapshot_time.get_or_insert_default() = oldest_snapshot_time; } - // Titan num obsolete blob file - if let Some(v) = engine.get_property_int_cf(handle, ROCKSDB_TITANDB_NUM_OBSOLETE_BLOB_FILE) - { - STORE_ENGINE_TITANDB_NUM_OBSOLETE_BLOB_FILE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); + // Since block cache is shared, getting cache size from any CF/DB is fine. Here + // we get from default CF. + if self.db_stats.block_cache_size.is_none() { + let handle = crate::util::get_cf_handle(db, CF_DEFAULT).unwrap(); + *self.db_stats.block_cache_size.get_or_insert_default() = + db.get_block_cache_usage_cf(handle); } + } - // Titan live blob file size - if let Some(v) = engine.get_property_int_cf(handle, ROCKSDB_TITANDB_LIVE_BLOB_FILE_SIZE) { - STORE_ENGINE_TITANDB_LIVE_BLOB_FILE_SIZE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); - } + fn flush(&mut self) { + for (cf, cf_stats) in &self.cf_stats { + if let Some(v) = cf_stats.used_size { + STORE_ENGINE_SIZE_GAUGE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_cache_size { + STORE_ENGINE_BLOB_CACHE_USAGE_GAUGE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.readers_mem { + STORE_ENGINE_MEMORY_GAUGE_VEC + .with_label_values(&[&self.name, cf, "readers-mem"]) + .set(v as i64); + } + if let Some(v) = cf_stats.mem_tables { + STORE_ENGINE_MEMORY_GAUGE_VEC + .with_label_values(&[&self.name, cf, "mem-tables"]) + .set(v as i64); + } + if let Some(v) = cf_stats.num_keys { + STORE_ENGINE_ESTIMATE_NUM_KEYS_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.pending_compaction_bytes { + STORE_ENGINE_PENDING_COMPACTION_BYTES_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + for (level, level_stats) in 
cf_stats.levels.iter().enumerate() { + if let Some(num_files) = level_stats.num_files { + STORE_ENGINE_NUM_FILES_AT_LEVEL_VEC + .with_label_values(&[&self.name, cf, &level.to_string()]) + .set(num_files as i64); + if num_files > 0 && let Some(ratio) = level_stats.weighted_compression_ratio { + let normalized_compression_ratio = + ratio / num_files as f64; + STORE_ENGINE_COMPRESSION_RATIO_VEC + .with_label_values(&[&self.name, cf, &level.to_string()]) + .set(normalized_compression_ratio); + } + } + if let Some(v) = level_stats.num_blob_files { + STORE_ENGINE_TITANDB_NUM_BLOB_FILES_AT_LEVEL_VEC + .with_label_values(&[&self.name, cf, &level.to_string()]) + .set(v as i64); + } + } - // Titan obsolete blob file size - if let Some(v) = engine.get_property_int_cf(handle, ROCKSDB_TITANDB_OBSOLETE_BLOB_FILE_SIZE) - { - STORE_ENGINE_TITANDB_OBSOLETE_BLOB_FILE_SIZE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); + if let Some(v) = cf_stats.num_immutable_mem_table { + STORE_ENGINE_NUM_IMMUTABLE_MEM_TABLE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.live_blob_size { + STORE_ENGINE_TITANDB_LIVE_BLOB_SIZE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.num_live_blob_file { + STORE_ENGINE_TITANDB_NUM_LIVE_BLOB_FILE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.num_obsolete_blob_file { + STORE_ENGINE_TITANDB_NUM_OBSOLETE_BLOB_FILE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.live_blob_file_size { + STORE_ENGINE_TITANDB_LIVE_BLOB_FILE_SIZE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.obsolete_blob_file_size { + STORE_ENGINE_TITANDB_OBSOLETE_BLOB_FILE_SIZE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le0 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + 
.with_label_values(&[&self.name, cf, "le0"]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le20 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le20"]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le50 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le50"]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le80 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le80"]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le100 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le100"]) + .set(v as i64); + } } - // Titan blob file discardable ratio - if let Some(v) = - engine.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE0_FILE) - { - STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC - .with_label_values(&[name, cf, "le0"]) + if let Some(v) = self.db_stats.num_snapshots { + STORE_ENGINE_NUM_SNAPSHOTS_GAUGE_VEC + .with_label_values(&[&self.name]) .set(v as i64); } - if let Some(v) = - engine.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE20_FILE) - { - STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC - .with_label_values(&[name, cf, "le20"]) + if let Some(v) = self.db_stats.oldest_snapshot_time { + STORE_ENGINE_OLDEST_SNAPSHOT_DURATION_GAUGE_VEC + .with_label_values(&[&self.name]) .set(v as i64); } - if let Some(v) = - engine.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE50_FILE) - { - STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC - .with_label_values(&[name, cf, "le50"]) - .set(v as i64); - } - if let Some(v) = - engine.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE80_FILE) - { - STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC - .with_label_values(&[name, cf, "le80"]) + if 
let Some(v) = self.db_stats.block_cache_size { + STORE_ENGINE_BLOCK_CACHE_USAGE_GAUGE_VEC + .with_label_values(&[&self.name, "all"]) .set(v as i64); } - if let Some(v) = - engine.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE100_FILE) - { - STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC - .with_label_values(&[name, cf, "le100"]) - .set(v as i64); + if let Some(stall_num) = &self.db_stats.stall_num { + for (ty, val) in ROCKSDB_IOSTALL_TYPE.iter().zip(stall_num) { + STORE_ENGINE_WRITE_STALL_REASON_GAUGE_VEC + .with_label_values(&[&self.name, ty]) + .set(*val as i64); + } } } +} - // For snapshot - if let Some(n) = engine.get_property_int(ROCKSDB_NUM_SNAPSHOTS) { - STORE_ENGINE_NUM_SNAPSHOTS_GAUGE_VEC - .with_label_values(&[name]) - .set(n as i64); +pub fn flush_engine_statistics(statistics: &RocksStatistics, name: &str, is_titan: bool) { + for t in ENGINE_TICKER_TYPES { + let v = statistics.get_and_reset_ticker_count(*t); + flush_engine_ticker_metrics(*t, v, name); } - if let Some(t) = engine.get_property_int(ROCKSDB_OLDEST_SNAPSHOT_TIME) { - // RocksDB returns 0 if no snapshots. - let now = time::get_time().sec as u64; - let d = if t > 0 && now > t { now - t } else { 0 }; - STORE_ENGINE_OLDEST_SNAPSHOT_DURATION_GAUGE_VEC - .with_label_values(&[name]) - .set(d as i64); + for t in ENGINE_HIST_TYPES { + if let Some(v) = statistics.get_histogram(*t) { + flush_engine_histogram_metrics(*t, v, name); + } } - - if shared_block_cache { - // Since block cache is shared, getting cache size from any CF is fine. Here we get from - // default CF. 
- let handle = crate::util::get_cf_handle(engine, CF_DEFAULT).unwrap(); - let block_cache_usage = engine.get_block_cache_usage_cf(handle); - STORE_ENGINE_BLOCK_CACHE_USAGE_GAUGE_VEC - .with_label_values(&[name, "all"]) - .set(block_cache_usage as i64); + if is_titan { + for t in TITAN_ENGINE_TICKER_TYPES { + let v = statistics.get_and_reset_ticker_count(*t); + flush_engine_ticker_metrics(*t, v, name); + } + for t in TITAN_ENGINE_HIST_TYPES { + if let Some(v) = statistics.get_histogram(*t) { + flush_engine_histogram_metrics(*t, v, name); + } + } } } @@ -1618,8 +1751,7 @@ mod tests { #[test] fn test_flush() { let dir = Builder::new().prefix("test-flush").tempdir().unwrap(); - let engine = - crate::util::new_engine(dir.path().to_str().unwrap(), None, ALL_CFS, None).unwrap(); + let engine = crate::util::new_engine(dir.path().to_str().unwrap(), ALL_CFS).unwrap(); for tp in ENGINE_TICKER_TYPES { flush_engine_ticker_metrics(*tp, 2, "kv"); } @@ -1628,12 +1760,8 @@ mod tests { flush_engine_histogram_metrics(*tp, HistogramData::default(), "kv"); } - let shared_block_cache = false; - flush_engine_properties(engine.as_inner(), "kv", shared_block_cache); - let handle = engine.as_inner().cf_handle("default").unwrap(); - let info = engine - .as_inner() - .get_map_property_cf(handle, ROCKSDB_CFSTATS); - assert!(info.is_some()); + let mut reporter = RocksStatisticsReporter::new("kv"); + reporter.collect(&engine); + reporter.flush(); } } diff --git a/components/engine_rocks/src/rocks_metrics_defs.rs b/components/engine_rocks/src/rocks_metrics_defs.rs index fc23871b90f..042949f1c09 100644 --- a/components/engine_rocks/src/rocks_metrics_defs.rs +++ b/components/engine_rocks/src/rocks_metrics_defs.rs @@ -138,8 +138,11 @@ pub const TITAN_ENGINE_TICKER_TYPES: &[TickerType] = &[ TickerType::TitanGcNoNeed, TickerType::TitanGcRemain, TickerType::TitanGcDiscardable, - TickerType::TitanGcSample, TickerType::TitanGcSmallFile, + TickerType::TitanGcLevelMergeMark, + 
TickerType::TitanGcLevelMergeDelete, + TickerType::TitanGcNoNeed, + TickerType::TitanGcRemain, TickerType::TitanGcFailure, TickerType::TitanGcSuccess, TickerType::TitanGcTriggerNext, diff --git a/components/engine_rocks/src/snapshot.rs b/components/engine_rocks/src/snapshot.rs index e1a0f635286..60a12c4ac6d 100644 --- a/components/engine_rocks/src/snapshot.rs +++ b/components/engine_rocks/src/snapshot.rs @@ -5,11 +5,14 @@ use std::{ sync::Arc, }; -use engine_traits::{self, IterOptions, Iterable, Peekable, ReadOptions, Result, Snapshot}; +use engine_traits::{ + self, CfNamesExt, IterOptions, Iterable, Peekable, ReadOptions, Result, Snapshot, +}; use rocksdb::{rocksdb_options::UnsafeSnap, DBIterator, DB}; use crate::{ - db_vector::RocksDBVector, options::RocksReadOptions, util::get_cf_handle, RocksEngineIterator, + db_vector::RocksDbVector, options::RocksReadOptions, r2e, util::get_cf_handle, + RocksEngineIterator, }; pub struct RocksSnapshot { @@ -31,11 +34,7 @@ impl RocksSnapshot { } } -impl Snapshot for RocksSnapshot { - fn cf_names(&self) -> Vec<&str> { - self.db.cf_names() - } -} +impl Snapshot for RocksSnapshot {} impl Debug for RocksSnapshot { fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result { @@ -54,19 +53,7 @@ impl Drop for RocksSnapshot { impl Iterable for RocksSnapshot { type Iterator = RocksEngineIterator; - fn iterator_opt(&self, opts: IterOptions) -> Result { - let opt: RocksReadOptions = opts.into(); - let mut opt = opt.into_raw(); - unsafe { - opt.set_snapshot(&self.snap); - } - Ok(RocksEngineIterator::from_raw(DBIterator::new( - self.db.clone(), - opt, - ))) - } - - fn iterator_cf_opt(&self, cf: &str, opts: IterOptions) -> Result { + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { let opt: RocksReadOptions = opts.into(); let mut opt = opt.into_raw(); unsafe { @@ -82,16 +69,16 @@ impl Iterable for RocksSnapshot { } impl Peekable for RocksSnapshot { - type DBVector = RocksDBVector; + type DbVector = RocksDbVector; - fn 
get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { + fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { let opt: RocksReadOptions = opts.into(); let mut opt = opt.into_raw(); unsafe { opt.set_snapshot(&self.snap); } - let v = self.db.get_opt(key, &opt)?; - Ok(v.map(RocksDBVector::from_raw)) + let v = self.db.get_opt(key, &opt).map_err(r2e)?; + Ok(v.map(RocksDbVector::from_raw)) } fn get_value_cf_opt( @@ -99,14 +86,20 @@ impl Peekable for RocksSnapshot { opts: &ReadOptions, cf: &str, key: &[u8], - ) -> Result> { + ) -> Result> { let opt: RocksReadOptions = opts.into(); let mut opt = opt.into_raw(); unsafe { opt.set_snapshot(&self.snap); } let handle = get_cf_handle(self.db.as_ref(), cf)?; - let v = self.db.get_cf_opt(handle, key, &opt)?; - Ok(v.map(RocksDBVector::from_raw)) + let v = self.db.get_cf_opt(handle, key, &opt).map_err(r2e)?; + Ok(v.map(RocksDbVector::from_raw)) + } +} + +impl CfNamesExt for RocksSnapshot { + fn cf_names(&self) -> Vec<&str> { + self.db.cf_names() } } diff --git a/components/engine_rocks/src/sst.rs b/components/engine_rocks/src/sst.rs index 58f300a8ec2..85c30d74a87 100644 --- a/components/engine_rocks/src/sst.rs +++ b/components/engine_rocks/src/sst.rs @@ -1,10 +1,10 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{path::PathBuf, rc::Rc, sync::Arc}; +use std::{path::PathBuf, sync::Arc}; use engine_traits::{ - Error, ExternalSstFileInfo, IterOptions, Iterable, Iterator, Result, SeekKey, - SstCompressionType, SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, + Error, ExternalSstFileInfo, IterOptions, Iterator, RefIterable, Result, SstCompressionType, + SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, }; use fail::fail_point; use kvproto::import_sstpb::SstMeta; @@ -14,9 +14,7 @@ use rocksdb::{ SstFileWriter, DB, }; -// FIXME: Move RocksSeekKey into a common module since -// it's shared between multiple iterators -use crate::{engine::RocksEngine, engine_iterator::RocksSeekKey, options::RocksReadOptions}; +use crate::{engine::RocksEngine, options::RocksReadOptions, r2e}; impl SstExt for RocksEngine { type SstReader = RocksSstReader; @@ -24,11 +22,8 @@ impl SstExt for RocksEngine { type SstWriterBuilder = RocksSstWriterBuilder; } -// FIXME: like in RocksEngineIterator and elsewhere, here we are using -// Rc to avoid putting references in an associated type, which -// requires generic associated types. 
pub struct RocksSstReader { - inner: Rc, + inner: SstFileReader, } impl RocksSstReader { @@ -51,9 +46,8 @@ impl RocksSstReader { cf_options.set_env(env); } let mut reader = SstFileReader::new(cf_options); - reader.open(path)?; - let inner = Rc::new(reader); - Ok(RocksSstReader { inner }) + reader.open(path).map_err(r2e)?; + Ok(RocksSstReader { inner: reader }) } pub fn compression_name(&self) -> String { @@ -70,63 +64,63 @@ impl SstReader for RocksSstReader { Self::open_with_env(path, None) } fn verify_checksum(&self) -> Result<()> { - self.inner.verify_checksum()?; + self.inner.verify_checksum().map_err(r2e)?; Ok(()) } - fn iter(&self) -> Self::Iterator { - RocksSstIterator(SstFileReader::iter_rc(self.inner.clone())) - } } -impl Iterable for RocksSstReader { - type Iterator = RocksSstIterator; +impl RefIterable for RocksSstReader { + type Iterator<'a> = RocksSstIterator<'a>; - fn iterator_opt(&self, opts: IterOptions) -> Result { + #[inline] + fn iter(&self, opts: IterOptions) -> Result> { let opt: RocksReadOptions = opts.into(); let opt = opt.into_raw(); - Ok(RocksSstIterator(SstFileReader::iter_opt_rc( - self.inner.clone(), - opt, - ))) - } - - fn iterator_cf_opt(&self, _cf: &str, _opts: IterOptions) -> Result { - unimplemented!() // FIXME: What should happen here? + Ok(RocksSstIterator(SstFileReader::iter_opt(&self.inner, opt))) } } -// FIXME: See comment on RocksSstReader for why this contains Rc -pub struct RocksSstIterator(DBIterator>); +pub struct RocksSstIterator<'a>(DBIterator<&'a SstFileReader>); + +// It's OK to send the iterator around. +// TODO: remove this when using tirocks. +unsafe impl Send for RocksSstIterator<'_> {} -// TODO(5kbpers): Temporarily force to add `Send` here, add a method for creating -// DBIterator> in rust-rocksdb later. 
-unsafe impl Send for RocksSstIterator {} +impl Iterator for RocksSstIterator<'_> { + fn seek(&mut self, key: &[u8]) -> Result { + self.0.seek(rocksdb::SeekKey::Key(key)).map_err(r2e) + } + + fn seek_for_prev(&mut self, key: &[u8]) -> Result { + self.0 + .seek_for_prev(rocksdb::SeekKey::Key(key)) + .map_err(r2e) + } -impl Iterator for RocksSstIterator { - fn seek(&mut self, key: SeekKey<'_>) -> Result { - let k: RocksSeekKey<'_> = key.into(); - self.0.seek(k.into_raw()).map_err(Error::Engine) + /// Seek to the first key in the database. + fn seek_to_first(&mut self) -> Result { + self.0.seek(rocksdb::SeekKey::Start).map_err(r2e) } - fn seek_for_prev(&mut self, key: SeekKey<'_>) -> Result { - let k: RocksSeekKey<'_> = key.into(); - self.0.seek_for_prev(k.into_raw()).map_err(Error::Engine) + /// Seek to the last key in the database. + fn seek_to_last(&mut self) -> Result { + self.0.seek(rocksdb::SeekKey::End).map_err(r2e) } fn prev(&mut self) -> Result { #[cfg(not(feature = "nortcheck"))] if !self.valid()? { - return Err(Error::Engine("Iterator invalid".to_string())); + return Err(r2e("Iterator invalid")); } - self.0.prev().map_err(Error::Engine) + self.0.prev().map_err(r2e) } fn next(&mut self) -> Result { #[cfg(not(feature = "nortcheck"))] if !self.valid()? 
{ - return Err(Error::Engine("Iterator invalid".to_string())); + return Err(r2e("Iterator invalid")); } - self.0.next().map_err(Error::Engine) + self.0.next().map_err(r2e) } fn key(&self) -> &[u8] { @@ -138,7 +132,7 @@ impl Iterator for RocksSstIterator { } fn valid(&self) -> Result { - self.0.valid().map_err(Error::Engine) + self.0.valid().map_err(r2e) } } @@ -192,7 +186,7 @@ impl SstWriterBuilder for RocksSstWriterBuilder { env = db.env(); let handle = db .cf_handle(self.cf.as_deref().unwrap_or(CF_DEFAULT)) - .ok_or_else(|| format!("CF {:?} is not found", self.cf))?; + .ok_or_else(|| r2e(format!("CF {:?} is not found", self.cf)))?; db.get_options_cf(handle) } else { ColumnFamilyOptions::new() @@ -222,9 +216,15 @@ impl SstWriterBuilder for RocksSstWriterBuilder { }; // TODO: 0 is a valid value for compression_level if self.compression_level != 0 { - // other three fields are default value. - // see: https://github.com/facebook/rocksdb/blob/8cb278d11a43773a3ac22e523f4d183b06d37d88/include/rocksdb/advanced_options.h#L146-L153 - io_options.set_compression_options(-14, self.compression_level, 0, 0, 0); + // other 4 fields are default value. + io_options.set_compression_options( + -14, + self.compression_level, + 0, // strategy + 0, // max_dict_bytes + 0, // zstd_max_train_bytes + 1, // parallel_threads + ); } io_options.compression(compress_type); // in rocksdb 5.5.1, SstFileWriter will try to use bottommost_compression and @@ -234,7 +234,7 @@ impl SstWriterBuilder for RocksSstWriterBuilder { io_options.bottommost_compression(DBCompressionType::Disable); let mut writer = SstFileWriter::new(EnvOptions::new(), io_options); fail_point!("on_open_sst_writer"); - writer.open(path)?; + writer.open(path).map_err(r2e)?; Ok(RocksSstWriter { writer, env }) } } @@ -249,11 +249,11 @@ impl SstWriter for RocksSstWriter { type ExternalSstFileReader = SequentialFile; fn put(&mut self, key: &[u8], val: &[u8]) -> Result<()> { - Ok(self.writer.put(key, val)?) 
+ self.writer.put(key, val).map_err(r2e) } fn delete(&mut self, key: &[u8]) -> Result<()> { - Ok(self.writer.delete(key)?) + self.writer.delete(key).map_err(r2e) } fn file_size(&mut self) -> u64 { @@ -261,22 +261,25 @@ impl SstWriter for RocksSstWriter { } fn finish(mut self) -> Result { - Ok(RocksExternalSstFileInfo(self.writer.finish()?)) + Ok(RocksExternalSstFileInfo(self.writer.finish().map_err(r2e)?)) } fn finish_read(mut self) -> Result<(Self::ExternalSstFileInfo, Self::ExternalSstFileReader)> { - let env = self.env.take().ok_or_else(|| { - Error::Engine("failed to read sequential file no env provided".to_owned()) - })?; - let sst_info = self.writer.finish()?; + let env = self + .env + .take() + .ok_or_else(|| r2e("failed to read sequential file no env provided"))?; + let sst_info = self.writer.finish().map_err(r2e)?; let p = sst_info.file_path(); let path = p.as_os_str().to_str().ok_or_else(|| { - Error::Engine(format!( + r2e(format!( "failed to sequential file bad path {}", p.display() )) })?; - let seq_file = env.new_sequential_file(path, EnvOptions::new())?; + let seq_file = env + .new_sequential_file(path, EnvOptions::new()) + .map_err(r2e)?; Ok((RocksExternalSstFileInfo(sst_info), seq_file)) } } @@ -373,7 +376,7 @@ mod tests { let mut writer = RocksSstWriterBuilder::new() .set_cf(CF_DEFAULT) .set_db(&engine) - .build(p.as_os_str().to_str().unwrap()) + .build(p.to_str().unwrap()) .unwrap(); writer.put(k, v).unwrap(); let sst_file = writer.finish().unwrap(); @@ -388,7 +391,7 @@ mod tests { .set_in_memory(true) .set_cf(CF_DEFAULT) .set_db(&engine) - .build(p.as_os_str().to_str().unwrap()) + .build(p.to_str().unwrap()) .unwrap(); writer.put(k, v).unwrap(); let mut buf = vec![]; diff --git a/components/engine_rocks/src/status.rs b/components/engine_rocks/src/status.rs new file mode 100644 index 00000000000..1565e013834 --- /dev/null +++ b/components/engine_rocks/src/status.rs @@ -0,0 +1,19 @@ +// Copyright 2022 TiKV Project Authors. 
Licensed under Apache-2.0. + +/// A function that will transform a rocksdb error to an engine trait error. +/// +/// r stands for rocksdb, e stands for engine_trait. +pub fn r2e(msg: impl Into) -> engine_traits::Error { + // TODO: use correct code. + engine_traits::Error::Engine(engine_traits::Status::with_error( + engine_traits::Code::IoError, + msg, + )) +} + +/// A function that will transform an engine trait error to a rocksdb error. +/// +/// r stands for rocksdb, e stands for engine_trait. +pub fn e2r(s: engine_traits::Error) -> String { + format!("{:?}", s) +} diff --git a/components/engine_rocks/src/table_properties.rs b/components/engine_rocks/src/table_properties.rs index 3a3bbad6a04..19b2141483d 100644 --- a/components/engine_rocks/src/table_properties.rs +++ b/components/engine_rocks/src/table_properties.rs @@ -1,8 +1,8 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::{Error, Range, Result}; +use engine_traits::{Range, Result}; -use crate::{util, RangeProperties, RocksEngine}; +use crate::{r2e, util, RangeProperties, RocksEngine}; #[repr(transparent)] pub struct UserCollectedProperties(rocksdb::UserCollectedProperties); @@ -57,11 +57,9 @@ impl RocksEngine { let cf = util::get_cf_handle(self.as_inner(), cf)?; // FIXME: extra allocation let ranges: Vec<_> = ranges.iter().map(util::range_to_rocks_range).collect(); - let raw = self - .as_inner() - .get_properties_of_tables_in_range(cf, &ranges); - let raw = raw.map_err(Error::Engine)?; - Ok(raw) + self.as_inner() + .get_properties_of_tables_in_range(cf, &ranges) + .map_err(r2e) } pub fn get_range_properties_cf( diff --git a/components/engine_rocks/src/ttl_properties.rs b/components/engine_rocks/src/ttl_properties.rs index 5dd51d8cd97..eb4641cc102 100644 --- a/components/engine_rocks/src/ttl_properties.rs +++ b/components/engine_rocks/src/ttl_properties.rs @@ -182,10 +182,10 @@ mod tests { } let case2 = [("zr\0a", 0)]; - assert!(get_properties(&case2).is_err()); +
get_properties(&case2).unwrap_err(); let case3 = []; - assert!(get_properties(&case3).is_err()); + get_properties(&case3).unwrap_err(); let case4 = [("zr\0a", 1)]; let props = get_properties(&case4).unwrap(); diff --git a/components/engine_rocks/src/util.rs b/components/engine_rocks/src/util.rs index 47e4016ebc6..52b1364c3ce 100644 --- a/components/engine_rocks/src/util.rs +++ b/components/engine_rocks/src/util.rs @@ -1,95 +1,172 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::{str::FromStr, sync::Arc}; - -use engine_traits::{Engines, Error, Range, Result, CF_DEFAULT}; -use rocksdb::{CFHandle, Range as RocksRange, SliceTransform, DB}; -use tikv_util::box_err; +use std::{ffi::CString, fs, path::Path, str::FromStr, sync::Arc}; + +use engine_traits::{Engines, Range, Result, CF_DEFAULT}; +use rocksdb::{ + load_latest_options, CColumnFamilyDescriptor, CFHandle, ColumnFamilyOptions, CompactionFilter, + CompactionFilterContext, CompactionFilterDecision, CompactionFilterFactory, + CompactionFilterValueType, DBTableFileCreationReason, Env, Range as RocksRange, SliceTransform, + DB, +}; +use slog_global::warn; use crate::{ - cf_options::RocksColumnFamilyOptions, - db_options::RocksDBOptions, - engine::RocksEngine, - raw_util::{new_engine as new_engine_raw, new_engine_opt as new_engine_opt_raw, CFOptions}, - rocks_metrics_defs::*, + cf_options::RocksCfOptions, db_options::RocksDbOptions, engine::RocksEngine, r2e, + rocks_metrics_defs::*, RocksStatistics, }; pub fn new_temp_engine(path: &tempfile::TempDir) -> Engines { let raft_path = path.path().join(std::path::Path::new("raft")); Engines::new( - new_engine( - path.path().to_str().unwrap(), - None, - engine_traits::ALL_CFS, - None, - ) - .unwrap(), - new_engine( - raft_path.to_str().unwrap(), - None, - &[engine_traits::CF_DEFAULT], - None, - ) - .unwrap(), + new_engine(path.path().to_str().unwrap(), engine_traits::ALL_CFS).unwrap(), + new_engine(raft_path.to_str().unwrap(), 
&[engine_traits::CF_DEFAULT]).unwrap(), ) } pub fn new_default_engine(path: &str) -> Result { - let engine = - new_engine_raw(path, None, &[CF_DEFAULT], None).map_err(|e| Error::Other(box_err!(e)))?; - let engine = Arc::new(engine); - let engine = RocksEngine::from_db(engine); - Ok(engine) + new_engine(path, &[CF_DEFAULT]) } -pub struct RocksCFOptions<'a> { - cf: &'a str, - options: RocksColumnFamilyOptions, +pub fn new_engine(path: &str, cfs: &[&str]) -> Result { + let mut db_opts = RocksDbOptions::default(); + db_opts.set_statistics(&RocksStatistics::new_titan()); + let cf_opts = cfs.iter().map(|name| (*name, Default::default())).collect(); + new_engine_opt(path, db_opts, cf_opts) } -impl<'a> RocksCFOptions<'a> { - pub fn new(cf: &'a str, options: RocksColumnFamilyOptions) -> RocksCFOptions<'a> { - RocksCFOptions { cf, options } +pub fn new_engine_opt( + path: &str, + db_opt: RocksDbOptions, + cf_opts: Vec<(&str, RocksCfOptions)>, +) -> Result { + let mut db_opt = db_opt.into_raw(); + if cf_opts.iter().all(|(name, _)| *name != CF_DEFAULT) { + return Err(engine_traits::Error::Engine( + engine_traits::Status::with_error( + engine_traits::Code::InvalidArgument, + "default cf must be specified", + ), + )); + } + let mut cf_opts: Vec<_> = cf_opts + .into_iter() + .map(|(name, opt)| (name, opt.into_raw())) + .collect(); + + // Creates a new db if it doesn't exist. + if !db_exist(path) { + db_opt.create_if_missing(true); + db_opt.create_missing_column_families(true); + + let db = DB::open_cf(db_opt, path, cf_opts.into_iter().collect()).map_err(r2e)?; + + return Ok(RocksEngine::new(db)); + } + + db_opt.create_if_missing(false); + + // Lists all column families in current db. 
+ let cfs_list = DB::list_column_families(&db_opt, path).map_err(r2e)?; + let existed: Vec<&str> = cfs_list.iter().map(|v| v.as_str()).collect(); + let needed: Vec<&str> = cf_opts.iter().map(|(name, _)| *name).collect(); + + let cf_descs = if !existed.is_empty() { + let env = match db_opt.env() { + Some(env) => env, + None => Arc::new(Env::default()), + }; + // panic if OPTIONS not found for existing instance? + let (_, tmp) = load_latest_options(path, &env, true) + .unwrap_or_else(|e| panic!("failed to load_latest_options {:?}", e)) + .unwrap_or_else(|| panic!("couldn't find the OPTIONS file")); + tmp + } else { + vec![] + }; + + for cf in &existed { + if cf_opts.iter().all(|(name, _)| name != cf) { + cf_opts.push((cf, ColumnFamilyOptions::default())); + } + } + for (name, opt) in &mut cf_opts { + adjust_dynamic_level_bytes(&cf_descs, name, opt); + } + + let cfds: Vec<_> = cf_opts.into_iter().collect(); + // We have added all missing options by iterating `existed`. If the two vecs still + // have the same length, then they must have the same column families despite their + // order. So just open db. + if needed.len() == existed.len() && needed.len() == cfds.len() { + let db = DB::open_cf(db_opt, path, cfds).map_err(r2e)?; + return Ok(RocksEngine::new(db)); } - pub fn into_raw(self) -> CFOptions<'a> { - CFOptions::new(self.cf, self.options.into_raw()) + // Opens db. + db_opt.create_missing_column_families(true); + let mut db = DB::open_cf(db_opt, path, cfds).map_err(r2e)?; + + // Drops discarded column families. + for cf in cfs_diff(&existed, &needed) { + // We have checked it at the very beginning, so it must be needed.
+ assert_ne!(cf, CF_DEFAULT); + db.drop_cf(cf).map_err(r2e)?; } + + Ok(RocksEngine::new(db)) } -pub fn new_engine( - path: &str, - db_opts: Option, - cfs: &[&str], - opts: Option>>, -) -> Result { - let db_opts = db_opts.map(RocksDBOptions::into_raw); - let opts = opts.map(|o| o.into_iter().map(RocksCFOptions::into_raw).collect()); - let engine = new_engine_raw(path, db_opts, cfs, opts).map_err(|e| Error::Other(box_err!(e)))?; - let engine = Arc::new(engine); - let engine = RocksEngine::from_db(engine); - Ok(engine) +/// Turns "dynamic level size" off for the existing column family which was off +/// before. Column families are small, HashMap isn't necessary. +fn adjust_dynamic_level_bytes( + cf_descs: &[CColumnFamilyDescriptor], + name: &str, + opt: &mut ColumnFamilyOptions, +) { + if let Some(cf_desc) = cf_descs.iter().find(|cf_desc| cf_desc.name() == name) { + let existed_dynamic_level_bytes = + cf_desc.options().get_level_compaction_dynamic_level_bytes(); + if existed_dynamic_level_bytes != opt.get_level_compaction_dynamic_level_bytes() { + warn!( + "change dynamic_level_bytes for existing column family is danger"; + "old_value" => existed_dynamic_level_bytes, + "new_value" => opt.get_level_compaction_dynamic_level_bytes(), + ); + } + opt.set_level_compaction_dynamic_level_bytes(existed_dynamic_level_bytes); + } } -pub fn new_engine_opt( - path: &str, - db_opt: RocksDBOptions, - cfs_opts: Vec>, -) -> Result { - let db_opt = db_opt.into_raw(); - let cfs_opts = cfs_opts.into_iter().map(RocksCFOptions::into_raw).collect(); - let engine = - new_engine_opt_raw(path, db_opt, cfs_opts).map_err(|e| Error::Other(box_err!(e)))?; - let engine = Arc::new(engine); - let engine = RocksEngine::from_db(engine); - Ok(engine) +pub fn db_exist(path: &str) -> bool { + let path = Path::new(path); + if !path.exists() || !path.is_dir() { + return false; + } + let current_file_path = path.join("CURRENT"); + if !current_file_path.exists() || !current_file_path.is_file() { + return 
false; + } + + // If path is not an empty directory, and current file exists, we say db exists. + // If path is not an empty directory but db has not been created, + // `DB::list_column_families` fails and we can clean up the directory by + // this indication. + fs::read_dir(path).unwrap().next().is_some() +} + +/// Returns a Vec of cf which is in `a' but not in `b'. +fn cfs_diff<'a>(a: &[&'a str], b: &[&str]) -> Vec<&'a str> { + a.iter() + .filter(|x| !b.iter().any(|y| *x == y)) + .cloned() + .collect() } pub fn get_cf_handle<'a>(db: &'a DB, cf: &str) -> Result<&'a CFHandle> { - let handle = db - .cf_handle(cf) - .ok_or_else(|| Error::Engine(format!("cf {} not found", cf)))?; - Ok(handle) + db.cf_handle(cf) + .ok_or_else(|| format!("cf {} not found", cf)) + .map_err(r2e) } pub fn range_to_rocks_range<'a>(range: &Range<'a>) -> RocksRange<'a> { @@ -223,3 +300,312 @@ impl SliceTransform for NoopSliceTransform { true } } + +pub fn to_raw_perf_level(level: engine_traits::PerfLevel) -> rocksdb::PerfLevel { + match level { + engine_traits::PerfLevel::Uninitialized => rocksdb::PerfLevel::Uninitialized, + engine_traits::PerfLevel::Disable => rocksdb::PerfLevel::Disable, + engine_traits::PerfLevel::EnableCount => rocksdb::PerfLevel::EnableCount, + engine_traits::PerfLevel::EnableTimeExceptForMutex => { + rocksdb::PerfLevel::EnableTimeExceptForMutex + } + engine_traits::PerfLevel::EnableTimeAndCpuTimeExceptForMutex => { + rocksdb::PerfLevel::EnableTimeAndCPUTimeExceptForMutex + } + engine_traits::PerfLevel::EnableTime => rocksdb::PerfLevel::EnableTime, + engine_traits::PerfLevel::OutOfBounds => rocksdb::PerfLevel::OutOfBounds, + } +} + +pub fn from_raw_perf_level(level: rocksdb::PerfLevel) -> engine_traits::PerfLevel { + match level { + rocksdb::PerfLevel::Uninitialized => engine_traits::PerfLevel::Uninitialized, + rocksdb::PerfLevel::Disable => engine_traits::PerfLevel::Disable, + rocksdb::PerfLevel::EnableCount => engine_traits::PerfLevel::EnableCount, + 
rocksdb::PerfLevel::EnableTimeExceptForMutex => { + engine_traits::PerfLevel::EnableTimeExceptForMutex + } + rocksdb::PerfLevel::EnableTimeAndCPUTimeExceptForMutex => { + engine_traits::PerfLevel::EnableTimeAndCpuTimeExceptForMutex + } + rocksdb::PerfLevel::EnableTime => engine_traits::PerfLevel::EnableTime, + rocksdb::PerfLevel::OutOfBounds => engine_traits::PerfLevel::OutOfBounds, + } +} + +struct OwnedRange { + start_key: Box<[u8]>, + end_key: Box<[u8]>, +} + +type FilterByReason = [bool; 4]; + +fn reason_to_index(reason: DBTableFileCreationReason) -> usize { + match reason { + DBTableFileCreationReason::Flush => 0, + DBTableFileCreationReason::Compaction => 1, + DBTableFileCreationReason::Recovery => 2, + DBTableFileCreationReason::Misc => 3, + } +} + +fn filter_by_reason(factory: &impl CompactionFilterFactory) -> FilterByReason { + let mut r = FilterByReason::default(); + r[reason_to_index(DBTableFileCreationReason::Flush)] = + factory.should_filter_table_file_creation(DBTableFileCreationReason::Flush); + r[reason_to_index(DBTableFileCreationReason::Compaction)] = + factory.should_filter_table_file_creation(DBTableFileCreationReason::Compaction); + r[reason_to_index(DBTableFileCreationReason::Recovery)] = + factory.should_filter_table_file_creation(DBTableFileCreationReason::Recovery); + r[reason_to_index(DBTableFileCreationReason::Misc)] = + factory.should_filter_table_file_creation(DBTableFileCreationReason::Misc); + r +} + +pub struct StackingCompactionFilterFactory { + outer_should_filter: FilterByReason, + outer: A, + inner_should_filter: FilterByReason, + inner: B, +} + +impl StackingCompactionFilterFactory { + /// Creates a factory of stacked filter with `outer` on top of `inner`. + /// Table keys will be filtered through `outer` first before reaching + /// `inner`. 
+ pub fn new(outer: A, inner: B) -> Self { + let outer_should_filter = filter_by_reason(&outer); + let inner_should_filter = filter_by_reason(&inner); + Self { + outer_should_filter, + outer, + inner_should_filter, + inner, + } + } +} + +impl CompactionFilterFactory + for StackingCompactionFilterFactory +{ + type Filter = StackingCompactionFilter; + + fn create_compaction_filter( + &self, + context: &CompactionFilterContext, + ) -> Option<(CString, Self::Filter)> { + let i = reason_to_index(context.reason()); + let mut outer_filter = None; + let mut inner_filter = None; + let mut full_name = String::new(); + if self.outer_should_filter[i] + && let Some((name, filter)) = self.outer.create_compaction_filter(context) + { + outer_filter = Some(filter); + full_name = name.into_string().unwrap(); + } + if self.inner_should_filter[i] + && let Some((name, filter)) = self.inner.create_compaction_filter(context) + { + inner_filter = Some(filter); + if !full_name.is_empty() { + full_name += "."; + } + full_name += name.to_str().unwrap(); + } + if outer_filter.is_none() && inner_filter.is_none() { + None + } else { + let filter = StackingCompactionFilter { + outer: outer_filter, + inner: inner_filter, + }; + Some((CString::new(full_name).unwrap(), filter)) + } + } + + fn should_filter_table_file_creation(&self, reason: DBTableFileCreationReason) -> bool { + let i = reason_to_index(reason); + self.outer_should_filter[i] || self.inner_should_filter[i] + } +} + +pub struct StackingCompactionFilter { + outer: Option, + inner: Option, +} + +impl CompactionFilter for StackingCompactionFilter { + fn featured_filter( + &mut self, + level: usize, + key: &[u8], + seqno: u64, + value: &[u8], + value_type: CompactionFilterValueType, + ) -> CompactionFilterDecision { + if let Some(outer) = self.outer.as_mut() + && let r = outer.featured_filter(level, key, seqno, value, value_type) + && !matches!(r, CompactionFilterDecision::Keep) + { + r + } else if let Some(inner) = self.inner.as_mut() { 
+ inner.featured_filter(level, key, seqno, value, value_type) + } else { + CompactionFilterDecision::Keep + } + } +} + +#[derive(Clone)] +pub struct RangeCompactionFilterFactory(Arc); + +impl RangeCompactionFilterFactory { + pub fn new(start_key: Box<[u8]>, end_key: Box<[u8]>) -> Self { + let range = OwnedRange { start_key, end_key }; + Self(Arc::new(range)) + } +} + +impl CompactionFilterFactory for RangeCompactionFilterFactory { + type Filter = RangeCompactionFilter; + + fn create_compaction_filter( + &self, + _context: &CompactionFilterContext, + ) -> Option<(CString, Self::Filter)> { + Some(( + CString::new("range_filter").unwrap(), + RangeCompactionFilter(self.0.clone()), + )) + } + + fn should_filter_table_file_creation(&self, _reason: DBTableFileCreationReason) -> bool { + true + } +} + +/// Filters out all keys outside the key range. +pub struct RangeCompactionFilter(Arc); + +impl CompactionFilter for RangeCompactionFilter { + fn featured_filter( + &mut self, + _level: usize, + key: &[u8], + _seqno: u64, + _value: &[u8], + _value_type: CompactionFilterValueType, + ) -> CompactionFilterDecision { + if key < self.0.start_key.as_ref() || key >= self.0.end_key.as_ref() { + CompactionFilterDecision::Remove + } else { + CompactionFilterDecision::Keep + } + } +} + +#[cfg(test)] +mod tests { + use engine_traits::{CfOptionsExt, Peekable, SyncMutable, CF_DEFAULT}; + use rocksdb::DB; + use tempfile::Builder; + + use super::*; + + #[test] + fn test_cfs_diff() { + let a = vec!["1", "2", "3"]; + let a_diff_a = cfs_diff(&a, &a); + assert!(a_diff_a.is_empty()); + let b = vec!["4"]; + assert_eq!(a, cfs_diff(&a, &b)); + let c = vec!["4", "5", "3", "6"]; + assert_eq!(vec!["1", "2"], cfs_diff(&a, &c)); + assert_eq!(vec!["4", "5", "6"], cfs_diff(&c, &a)); + let d = vec!["1", "2", "3", "4"]; + let a_diff_d = cfs_diff(&a, &d); + assert!(a_diff_d.is_empty()); + assert_eq!(vec!["4"], cfs_diff(&d, &a)); + } + + #[test] + fn test_new_engine_opt() { + let path = Builder::new() + 
.prefix("_util_rocksdb_test_check_column_families") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + + // create db when db not exist + let mut cfs_opts = vec![(CF_DEFAULT, RocksCfOptions::default())]; + let mut opts = RocksCfOptions::default(); + opts.set_level_compaction_dynamic_level_bytes(true); + cfs_opts.push(("cf_dynamic_level_bytes", opts.clone())); + let db = new_engine_opt(path_str, RocksDbOptions::default(), cfs_opts).unwrap(); + column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes"]); + check_dynamic_level_bytes(&db); + drop(db); + + // add cf1. + let cfs_opts = vec![ + (CF_DEFAULT, opts.clone()), + ("cf_dynamic_level_bytes", opts.clone()), + ("cf1", opts.clone()), + ]; + let db = new_engine_opt(path_str, RocksDbOptions::default(), cfs_opts).unwrap(); + column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"]); + check_dynamic_level_bytes(&db); + for cf in &[CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"] { + db.put_cf(cf, b"k", b"v").unwrap(); + } + drop(db); + + // change order should not cause data corruption. + let cfs_opts = vec![ + ("cf_dynamic_level_bytes", opts.clone()), + ("cf1", opts.clone()), + (CF_DEFAULT, opts), + ]; + let db = new_engine_opt(path_str, RocksDbOptions::default(), cfs_opts).unwrap(); + column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"]); + check_dynamic_level_bytes(&db); + for cf in &[CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"] { + assert_eq!(db.get_value_cf(cf, b"k").unwrap().unwrap(), b"v"); + } + drop(db); + + // drop cf1. + let cfs = vec![CF_DEFAULT, "cf_dynamic_level_bytes"]; + let db = new_engine(path_str, &cfs).unwrap(); + column_families_must_eq(path_str, cfs); + check_dynamic_level_bytes(&db); + drop(db); + + // drop all cfs. + new_engine(path_str, &[CF_DEFAULT]).unwrap(); + column_families_must_eq(path_str, vec![CF_DEFAULT]); + + // not specifying default cf should error. 
+ new_engine(path_str, &[]).unwrap_err(); + column_families_must_eq(path_str, vec![CF_DEFAULT]); + } + + fn column_families_must_eq(path: &str, excepted: Vec<&str>) { + let opts = RocksDbOptions::default(); + let cfs_list = DB::list_column_families(&opts, path).unwrap(); + + let mut cfs_existed: Vec<&str> = cfs_list.iter().map(|v| v.as_str()).collect(); + let mut cfs_excepted: Vec<&str> = excepted.clone(); + cfs_existed.sort_unstable(); + cfs_excepted.sort_unstable(); + assert_eq!(cfs_existed, cfs_excepted); + } + + fn check_dynamic_level_bytes(db: &RocksEngine) { + let tmp_cf_opts = db.get_options_cf(CF_DEFAULT).unwrap(); + assert!(!tmp_cf_opts.get_level_compaction_dynamic_level_bytes()); + let tmp_cf_opts = db.get_options_cf("cf_dynamic_level_bytes").unwrap(); + assert!(tmp_cf_opts.get_level_compaction_dynamic_level_bytes()); + } +} diff --git a/components/engine_rocks/src/write_batch.rs b/components/engine_rocks/src/write_batch.rs index 824882cc1e9..3659a7628d6 100644 --- a/components/engine_rocks/src/write_batch.rs +++ b/components/engine_rocks/src/write_batch.rs @@ -2,163 +2,276 @@ use std::sync::Arc; -use engine_traits::{self, Error, Mutable, Result, WriteBatchExt, WriteOptions}; +use engine_traits::{self, Mutable, Result, WriteBatchExt, WriteOptions}; use rocksdb::{Writable, WriteBatch as RawWriteBatch, DB}; -use crate::{engine::RocksEngine, options::RocksWriteOptions, util::get_cf_handle}; +use crate::{engine::RocksEngine, options::RocksWriteOptions, r2e, util::get_cf_handle}; + +const WRITE_BATCH_MAX_BATCH_NUM: usize = 16; +const WRITE_BATCH_MAX_KEY_NUM: usize = 16; impl WriteBatchExt for RocksEngine { - type WriteBatch = RocksWriteBatch; + type WriteBatch = RocksWriteBatchVec; const WRITE_BATCH_MAX_KEYS: usize = 256; - fn write_batch(&self) -> RocksWriteBatch { - RocksWriteBatch::new(self.as_inner().clone()) + fn write_batch(&self) -> RocksWriteBatchVec { + RocksWriteBatchVec::new( + Arc::clone(self.as_inner()), + WRITE_BATCH_MAX_KEY_NUM, + 1, + 
self.support_multi_batch_write(), + ) } - fn write_batch_with_cap(&self, cap: usize) -> RocksWriteBatch { - RocksWriteBatch::with_capacity(self, cap) + fn write_batch_with_cap(&self, cap: usize) -> RocksWriteBatchVec { + RocksWriteBatchVec::with_unit_capacity(self, cap) } } -pub struct RocksWriteBatch { +/// `RocksWriteBatchVec` is for method `MultiBatchWrite` of RocksDB, which +/// splits a large WriteBatch into many smaller ones and then any thread could +/// help to deal with these small WriteBatch when it is calling +/// `MultiBatchCommit` and wait the front writer to finish writing. +/// `MultiBatchWrite` will perform much better than traditional +/// `pipelined_write` when TiKV writes very large data into RocksDB. +/// We will remove this feature when `unordered_write` of RocksDB becomes more +/// stable and becomes compatible with Titan. +pub struct RocksWriteBatchVec { db: Arc, - wb: RawWriteBatch, + wbs: Vec, + save_points: Vec, + index: usize, + batch_size_limit: usize, + support_write_batch_vec: bool, } -impl RocksWriteBatch { - pub fn new(db: Arc) -> RocksWriteBatch { - let wb = RawWriteBatch::new(); - RocksWriteBatch { db, wb } - } - - pub fn with_capacity(engine: &RocksEngine, cap: usize) -> RocksWriteBatch { +impl RocksWriteBatchVec { + pub fn new( + db: Arc, + batch_size_limit: usize, + cap: usize, + support_write_batch_vec: bool, + ) -> RocksWriteBatchVec { let wb = RawWriteBatch::with_capacity(cap); - RocksWriteBatch { - db: engine.as_inner().clone(), - wb, + RocksWriteBatchVec { + db, + wbs: vec![wb], + save_points: vec![], + index: 0, + batch_size_limit, + support_write_batch_vec, } } - pub fn as_inner(&self) -> &RawWriteBatch { - &self.wb + pub fn with_unit_capacity(engine: &RocksEngine, cap: usize) -> RocksWriteBatchVec { + Self::new( + engine.as_inner().clone(), + WRITE_BATCH_MAX_KEY_NUM, + cap, + engine.support_multi_batch_write(), + ) } - pub fn as_raw(&self) -> &RawWriteBatch { - &self.wb + pub fn as_inner(&self) -> &[RawWriteBatch] { + 
&self.wbs[0..=self.index] } pub fn get_db(&self) -> &DB { self.db.as_ref() } -} -impl engine_traits::WriteBatch for RocksWriteBatch { - fn write_opt(&self, opts: &WriteOptions) -> Result<()> { + /// `check_switch_batch` will split a large WriteBatch into many smaller + /// ones. This is to avoid a large WriteBatch blocking write_thread too + /// long. + #[inline(always)] + fn check_switch_batch(&mut self) { + if self.support_write_batch_vec + && self.batch_size_limit > 0 + && self.wbs[self.index].count() >= self.batch_size_limit + { + self.index += 1; + if self.index >= self.wbs.len() { + self.wbs.push(RawWriteBatch::default()); + } + } + } + + #[inline] + fn write_impl(&mut self, opts: &WriteOptions, mut cb: impl FnMut()) -> Result { let opt: RocksWriteOptions = opts.into(); - self.get_db() - .write_opt(&self.wb, &opt.into_raw()) - .map_err(Error::Engine) + let mut seq = 0; + if self.support_write_batch_vec { + // FIXME(tabokie): Callback for empty write batch won't be called. + self.get_db() + .multi_batch_write_callback(self.as_inner(), &opt.into_raw(), |s| { + seq = s; + cb(); + }) + .map_err(r2e)?; + } else { + self.get_db() + .write_callback(&self.wbs[0], &opt.into_raw(), |s| { + seq = s; + cb(); + }) + .map_err(r2e)?; + } + Ok(seq) + } +} + +impl engine_traits::WriteBatch for RocksWriteBatchVec { + fn write_opt(&mut self, opts: &WriteOptions) -> Result { + self.write_impl(opts, || {}) + } + + fn write_callback_opt(&mut self, opts: &WriteOptions, cb: impl FnMut()) -> Result { + self.write_impl(opts, cb) } fn data_size(&self) -> usize { - self.wb.data_size() + let mut size: usize = 0; + for i in 0..=self.index { + size += self.wbs[i].data_size(); + } + size } fn count(&self) -> usize { - self.wb.count() + self.wbs[self.index].count() + self.index * self.batch_size_limit } fn is_empty(&self) -> bool { - self.wb.is_empty() + self.wbs[0].is_empty() } fn should_write_to_engine(&self) -> bool { - self.count() > RocksEngine::WRITE_BATCH_MAX_KEYS + if 
self.support_write_batch_vec { + self.index >= WRITE_BATCH_MAX_BATCH_NUM + } else { + self.wbs[0].count() > RocksEngine::WRITE_BATCH_MAX_KEYS + } } fn clear(&mut self) { - self.wb.clear(); + for i in 0..=self.index { + self.wbs[i].clear(); + } + self.save_points.clear(); + // Avoid making the wbs too big at one time, then the memory will be kept + // after reusing + if self.index > WRITE_BATCH_MAX_BATCH_NUM + 1 { + self.wbs.shrink_to(WRITE_BATCH_MAX_BATCH_NUM + 1); + } + self.index = 0; } fn set_save_point(&mut self) { - self.wb.set_save_point(); + self.wbs[self.index].set_save_point(); + self.save_points.push(self.index); } fn pop_save_point(&mut self) -> Result<()> { - self.wb.pop_save_point().map_err(Error::Engine) + if let Some(x) = self.save_points.pop() { + return self.wbs[x].pop_save_point().map_err(r2e); + } + Err(r2e("no save point")) } fn rollback_to_save_point(&mut self) -> Result<()> { - self.wb.rollback_to_save_point().map_err(Error::Engine) + if let Some(x) = self.save_points.pop() { + for i in x + 1..=self.index { + self.wbs[i].clear(); + } + self.index = x; + return self.wbs[x].rollback_to_save_point().map_err(r2e); + } + Err(r2e("no save point")) } fn merge(&mut self, other: Self) -> Result<()> { - self.wb.append(other.wb.data()); + for wb in other.as_inner() { + self.check_switch_batch(); + self.wbs[self.index].append(wb.data()); + } Ok(()) } } -impl Mutable for RocksWriteBatch { +impl Mutable for RocksWriteBatchVec { fn put(&mut self, key: &[u8], value: &[u8]) -> Result<()> { - self.wb.put(key, value).map_err(Error::Engine) + self.check_switch_batch(); + self.wbs[self.index].put(key, value).map_err(r2e) } fn put_cf(&mut self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { + self.check_switch_batch(); let handle = get_cf_handle(self.db.as_ref(), cf)?; - self.wb.put_cf(handle, key, value).map_err(Error::Engine) + self.wbs[self.index].put_cf(handle, key, value).map_err(r2e) } fn delete(&mut self, key: &[u8]) -> Result<()> { - 
self.wb.delete(key).map_err(Error::Engine) + self.check_switch_batch(); + self.wbs[self.index].delete(key).map_err(r2e) } fn delete_cf(&mut self, cf: &str, key: &[u8]) -> Result<()> { + self.check_switch_batch(); let handle = get_cf_handle(self.db.as_ref(), cf)?; - self.wb.delete_cf(handle, key).map_err(Error::Engine) + self.wbs[self.index].delete_cf(handle, key).map_err(r2e) } fn delete_range(&mut self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { - self.wb + self.check_switch_batch(); + self.wbs[self.index] .delete_range(begin_key, end_key) - .map_err(Error::Engine) + .map_err(r2e) } fn delete_range_cf(&mut self, cf: &str, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + self.check_switch_batch(); let handle = get_cf_handle(self.db.as_ref(), cf)?; - self.wb + self.wbs[self.index] .delete_range_cf(handle, begin_key, end_key) - .map_err(Error::Engine) + .map_err(r2e) } } #[cfg(test)] mod tests { - use engine_traits::{Peekable, WriteBatch}; + use engine_traits::{Peekable, WriteBatch, CF_DEFAULT}; use rocksdb::DBOptions as RawDBOptions; use tempfile::Builder; use super::{ - super::{util::new_engine_opt, RocksDBOptions}, + super::{util::new_engine_opt, RocksDbOptions}, *, }; + use crate::RocksCfOptions; #[test] - fn test_should_write_to_engine() { + fn test_should_write_to_engine_with_pipeline_write_mode() { let path = Builder::new() .prefix("test-should-write-to-engine") .tempdir() .unwrap(); let opt = RawDBOptions::default(); opt.enable_unordered_write(false); - opt.enable_pipelined_write(false); - opt.enable_pipelined_commit(true); + opt.enable_pipelined_write(true); + opt.enable_multi_batch_write(false); let engine = new_engine_opt( path.path().join("db").to_str().unwrap(), - RocksDBOptions::from_raw(opt), - vec![], + RocksDbOptions::from_raw(opt), + vec![(CF_DEFAULT, RocksCfOptions::default())], ) .unwrap(); + assert!( + !engine + .as_inner() + .get_db_options() + .is_enable_multi_batch_write() + ); let mut wb = engine.write_batch(); for _i in 
0..RocksEngine::WRITE_BATCH_MAX_KEYS { wb.put(b"aaa", b"bbb").unwrap(); @@ -167,10 +280,12 @@ mod tests { wb.put(b"aaa", b"bbb").unwrap(); assert!(wb.should_write_to_engine()); wb.write().unwrap(); + let v = engine.get_value(b"aaa").unwrap(); + assert!(v.is_some()); assert_eq!(v.unwrap(), b"bbb"); - let mut wb = RocksWriteBatch::with_capacity(&engine, 1024); + let mut wb = RocksWriteBatchVec::with_unit_capacity(&engine, 1024); for _i in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { wb.put(b"aaa", b"bbb").unwrap(); } @@ -180,4 +295,44 @@ mod tests { wb.clear(); assert!(!wb.should_write_to_engine()); } + + #[test] + fn test_should_write_to_engine_with_multi_batch_write_mode() { + let path = Builder::new() + .prefix("test-should-write-to-engine") + .tempdir() + .unwrap(); + let opt = RawDBOptions::default(); + opt.enable_unordered_write(false); + opt.enable_pipelined_write(false); + opt.enable_multi_batch_write(true); + let engine = new_engine_opt( + path.path().join("db").to_str().unwrap(), + RocksDbOptions::from_raw(opt), + vec![(CF_DEFAULT, RocksCfOptions::default())], + ) + .unwrap(); + assert!( + engine + .as_inner() + .get_db_options() + .is_enable_multi_batch_write() + ); + let mut wb = engine.write_batch(); + for _i in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + let mut wb = RocksWriteBatchVec::with_unit_capacity(&engine, 1024); + for _i in 0..WRITE_BATCH_MAX_BATCH_NUM * WRITE_BATCH_MAX_KEY_NUM { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + wb.clear(); + assert!(!wb.should_write_to_engine()); + } } diff --git a/components/engine_rocks_helper/Cargo.toml b/components/engine_rocks_helper/Cargo.toml index 74a0e8de47c..b8847fa6ba8 100644 --- a/components/engine_rocks_helper/Cargo.toml +++ 
b/components/engine_rocks_helper/Cargo.toml @@ -8,20 +8,21 @@ publish = false failpoints = ["fail/failpoints"] [dependencies] -engine_rocks = { path = "../engine_rocks", default-features = false } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } fail = "0.5" futures = "0.3" -keys = { path = "../keys", default-features = false } +keys = { workspace = true } lazy_static = "1.4.0" -pd_client = { path = "../pd_client", default-features = false } +pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } protobuf = "2.8" -raftstore = { path = "../raftstore", default-features = false } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../tikv_util", default-features = false } +raftstore = { workspace = true } +slog = { workspace = true } +slog-global = { workspace = true } +tikv_util = { workspace = true } [dev-dependencies] -engine_test = { path = "../engine_test" } -kvproto = { git = "https://github.com/pingcap/kvproto.git", default-features = false } +engine_test = { workspace = true } +kvproto = { workspace = true } tempfile = "3.0" diff --git a/components/engine_rocks_helper/src/sst_recovery.rs b/components/engine_rocks_helper/src/sst_recovery.rs index e7c1bae3a1c..85fb8d74bee 100644 --- a/components/engine_rocks_helper/src/sst_recovery.rs +++ b/components/engine_rocks_helper/src/sst_recovery.rs @@ -6,7 +6,7 @@ use std::{ time::{Duration, Instant}, }; -use engine_rocks::raw::*; +use engine_rocks::RocksEngine; use fail::fail_point; use raftstore::store::fsm::StoreMeta; use tikv_util::{self, set_panic_mark, warn, worker::*}; @@ -17,7 +17,7 @@ pub const DEFAULT_CHECK_INTERVAL: Duration = Duration::from_secs(10); const MAX_DAMAGED_FILES_NUM: usize = 2; pub struct RecoveryRunner { - db: Arc, + db: RocksEngine, 
store_meta: Arc>, // Considering that files will not be too much, it is enough to use `Vec`. damaged_files: Vec, @@ -68,7 +68,7 @@ impl RunnableWithTimer for RecoveryRunner { impl RecoveryRunner { pub fn new( - db: Arc, + db: RocksEngine, store_meta: Arc>, max_hang_duration: Duration, check_duration: Duration, @@ -87,7 +87,7 @@ impl RecoveryRunner { return; } - let live_files = self.db.get_live_files(); + let live_files = self.db.as_inner().get_live_files(); for i in 0..live_files.get_files_count() { if path == live_files.get_name(i as i32) { let f = FileInfo { @@ -132,7 +132,8 @@ impl RecoveryRunner { self.damaged_files.iter().any(|f| f.name == sst_path) } - // Cleans up obsolete damaged files and panics if some files are not handled in time. + // Cleans up obsolete damaged files and panics if some files are not handled in + // time. fn check_damaged_files(&mut self) { if self.damaged_files.is_empty() { return; @@ -153,7 +154,8 @@ impl RecoveryRunner { } // Check whether the StoreMeta contains the region range, if it contains, - // recorded fault region ids to report to PD and add file info into `damaged_files`. + // recorded fault region ids to report to PD and add file info into + // `damaged_files`. // // Acquire meta lock. fn check_overlap_damaged_regions(&self, file: &FileInfo) -> bool { @@ -163,10 +165,11 @@ impl RecoveryRunner { meta.update_overlap_damaged_ranges(&file.name, &file.smallest_key, &file.largest_key); if !overlap { fail_point!("sst_recovery_before_delete_files"); - // The sst file can be deleted safely and set `include_end` to `true` otherwise the - // file with the same largest key will be skipped. + // The sst file can be deleted safely and set `include_end` to `true` otherwise + // the file with the same largest key will be skipped. // Here store meta lock should be held to prevent peers from being added back. 
self.db + .as_inner() .delete_files_in_range(&file.smallest_key, &file.largest_key, true) .unwrap(); self.must_file_not_exist(&file.name); @@ -192,7 +195,7 @@ impl RecoveryRunner { } fn must_file_not_exist(&self, fname: &str) { - let live_files = self.db.get_live_files(); + let live_files = self.db.as_inner().get_live_files(); for i in 0..live_files.get_files_count() { if live_files.get_name(i as i32) == fname { // `delete_files_in_range` can't delete L0 files. @@ -206,7 +209,8 @@ impl RecoveryRunner { mod tests { use std::{collections::BTreeMap, sync::Arc}; - use engine_rocks::raw_util; + use engine_rocks::util; + use engine_traits::{CompactExt, SyncMutable, CF_DEFAULT}; use kvproto::metapb::{Peer, Region}; use tempfile::Builder; @@ -218,16 +222,15 @@ mod tests { .prefix("test_sst_recovery_runner") .tempdir() .unwrap(); - let db = Arc::new( - raw_util::new_engine(path.path().to_str().unwrap(), None, &["cf"], None).unwrap(), - ); + let db = util::new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT, "cf"]).unwrap(); db.put(b"z2", b"val").unwrap(); db.put(b"z7", b"val").unwrap(); // generate SST file. 
- db.compact_range(None, None); + db.compact_range_cf(CF_DEFAULT, None, None, false, 1) + .unwrap(); - let files = db.get_live_files(); + let files = db.as_inner().get_live_files(); assert_eq!(files.get_smallestkey(0), b"z2"); assert_eq!(files.get_largestkey(0), b"z7"); diff --git a/components/engine_test/Cargo.toml b/components/engine_test/Cargo.toml index 61061957563..16e538acc51 100644 --- a/components/engine_test/Cargo.toml +++ b/components/engine_test/Cargo.toml @@ -24,13 +24,14 @@ test-engines-panic = [ ] [dependencies] -encryption = { path = "../encryption", default-features = false } -engine_panic = { path = "../engine_panic", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -file_system = { path = "../file_system", default-features = false } -raft_log_engine = { path = "../raft_log_engine", default-features = false } +collections = { workspace = true } +encryption = { workspace = true } +engine_panic = { workspace = true } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } +file_system = { workspace = true } +raft_log_engine = { workspace = true } tempfile = "3.0" -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } # FIXME: Remove this dep from the engine_traits interface -tikv_util = { path = "../tikv_util", default-features = false } +tikv_util = { workspace = true } diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index 4d804a17a9f..932a1bcb51a 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -55,6 +55,8 @@ //! storage engines, and that it be extracted into its own crate for use in //! TiKV, once the full requirements are better understood. 
+#![feature(let_chains)] + /// Types and constructors for the "raft" engine pub mod raft { #[cfg(feature = "test-engine-raft-panic")] @@ -65,15 +67,17 @@ pub mod raft { #[cfg(feature = "test-engine-raft-raft-engine")] pub use raft_log_engine::RaftLogEngine as RaftTestEngine; - use crate::ctor::{RaftDBOptions, RaftEngineConstructorExt}; + use crate::ctor::{RaftDbOptions, RaftEngineConstructorExt}; - pub fn new_engine(path: &str, db_opt: Option) -> Result { + pub fn new_engine(path: &str, db_opt: Option) -> Result { RaftTestEngine::new_raft_engine(path, db_opt) } } /// Types and constructors for the "kv" engine pub mod kv { + use std::path::Path; + #[cfg(feature = "test-engine-kv-panic")] pub use engine_panic::{ PanicEngine as KvTestEngine, PanicEngineIterator as KvTestEngineIterator, @@ -82,28 +86,63 @@ pub mod kv { #[cfg(feature = "test-engine-kv-rocksdb")] pub use engine_rocks::{ RocksEngine as KvTestEngine, RocksEngineIterator as KvTestEngineIterator, - RocksSnapshot as KvTestSnapshot, RocksWriteBatch as KvTestWriteBatch, + RocksSnapshot as KvTestSnapshot, RocksWriteBatchVec as KvTestWriteBatch, }; - use engine_traits::Result; + use engine_traits::{MiscExt, Result, TabletContext, TabletFactory}; - use crate::ctor::{CFOptions, DBOptions, KvEngineConstructorExt}; + use crate::ctor::{CfOptions as KvTestCfOptions, DbOptions, KvEngineConstructorExt}; - pub fn new_engine( - path: &str, - db_opt: Option, - cfs: &[&str], - opts: Option>>, - ) -> Result { - KvTestEngine::new_kv_engine(path, db_opt, cfs, opts) + pub fn new_engine(path: &str, cfs: &[&str]) -> Result { + KvTestEngine::new_kv_engine(path, cfs) } pub fn new_engine_opt( path: &str, - db_opt: DBOptions, - cfs_opts: Vec>, + db_opt: DbOptions, + cfs_opts: Vec<(&str, KvTestCfOptions)>, ) -> Result { KvTestEngine::new_kv_engine_opt(path, db_opt, cfs_opts) } + + const TOMBSTONE_SUFFIX: &str = ".tombstone"; + + #[derive(Clone)] + pub struct TestTabletFactory { + db_opt: DbOptions, + cf_opts: Vec<(&'static str, 
KvTestCfOptions)>, + } + + impl TestTabletFactory { + pub fn new(db_opt: DbOptions, cf_opts: Vec<(&'static str, KvTestCfOptions)>) -> Self { + Self { db_opt, cf_opts } + } + } + + impl TabletFactory for TestTabletFactory { + fn open_tablet(&self, ctx: TabletContext, path: &Path) -> Result { + KvTestEngine::new_tablet( + path.to_str().unwrap(), + ctx, + self.db_opt.clone(), + self.cf_opts.clone(), + ) + } + + fn destroy_tablet(&self, _ctx: TabletContext, path: &Path) -> Result<()> { + let tombstone_path = path.with_extension(TOMBSTONE_SUFFIX); + let _ = std::fs::remove_dir_all(&tombstone_path); + std::fs::rename(path, &tombstone_path)?; + if let Some(m) = &self.db_opt.key_manager { + m.remove_dir(path, Some(&tombstone_path))?; + } + std::fs::remove_dir_all(tombstone_path)?; + Ok(()) + } + + fn exists(&self, path: &Path) -> bool { + KvTestEngine::exists(path.to_str().unwrap_or_default()) + } + } } /// Create a storage engine with a concrete type. This should ultimately be the @@ -120,8 +159,8 @@ pub mod ctor { use std::sync::Arc; use encryption::DataKeyManager; - use engine_traits::Result; - use file_system::IORateLimiter; + use engine_traits::{Result, StateStorage, TabletContext}; + use file_system::IoRateLimiter; /// Kv engine construction /// @@ -137,16 +176,12 @@ pub mod ctor { /// - The column families specified as `cfs`, with default options, or /// - The column families specified as `opts`, with options. /// - /// Note that if `opts` is not `None` then the `cfs` argument is completely ignored. + /// Note that if `opts` is not `None` then the `cfs` argument is + /// completely ignored. /// /// The engine stores its data in the `path` directory. /// If that directory does not exist, then it is created. 
- fn new_kv_engine( - path: &str, - db_opt: Option, - cfs: &[&str], - opts: Option>>, - ) -> Result; + fn new_kv_engine(path: &str, cfs: &[&str]) -> Result; /// Create a new engine with specified column families and options /// @@ -154,46 +189,53 @@ pub mod ctor { /// If that directory does not exist, then it is created. fn new_kv_engine_opt( path: &str, - db_opt: DBOptions, - cfs_opts: Vec>, + db_opt: DbOptions, + cf_opts: Vec<(&str, CfOptions)>, + ) -> Result; + + /// Create a new engine specific for multi rocks. + fn new_tablet( + path: &str, + ctx: TabletContext, + db_opt: DbOptions, + cf_opts: Vec<(&str, CfOptions)>, ) -> Result; } /// Raft engine construction pub trait RaftEngineConstructorExt: Sized { /// Create a new raft engine. - fn new_raft_engine(path: &str, db_opt: Option) -> Result; + fn new_raft_engine(path: &str, db_opt: Option) -> Result; } #[derive(Clone, Default)] - pub struct DBOptions { - key_manager: Option>, - rate_limiter: Option>, + pub struct DbOptions { + pub(crate) key_manager: Option>, + rate_limiter: Option>, + state_storage: Option>, + enable_multi_batch_write: bool, } - impl DBOptions { + impl DbOptions { pub fn set_key_manager(&mut self, key_manager: Option>) { self.key_manager = key_manager; } - pub fn set_rate_limiter(&mut self, rate_limiter: Option>) { + pub fn set_rate_limiter(&mut self, rate_limiter: Option>) { self.rate_limiter = rate_limiter; } - } - pub type RaftDBOptions = DBOptions; - - pub struct CFOptions<'a> { - pub cf: &'a str, - pub options: ColumnFamilyOptions, - } + pub fn set_state_storage(&mut self, state_storage: Arc) { + self.state_storage = Some(state_storage); + } - impl<'a> CFOptions<'a> { - pub fn new(cf: &'a str, options: ColumnFamilyOptions) -> CFOptions<'a> { - CFOptions { cf, options } + pub fn set_enable_multi_batch_write(&mut self, enable: bool) { + self.enable_multi_batch_write = enable; } } + pub type RaftDbOptions = DbOptions; + /// Properties for a single column family /// /// All engines must 
emulate column families, but at present it is not clear @@ -216,7 +258,7 @@ pub mod ctor { /// In the future TiKV will probably have engine-specific configuration /// options. #[derive(Clone)] - pub struct ColumnFamilyOptions { + pub struct CfOptions { disable_auto_compactions: bool, level_zero_file_num_compaction_trigger: Option, level_zero_slowdown_writes_trigger: Option, @@ -228,9 +270,9 @@ pub mod ctor { no_table_properties: bool, } - impl ColumnFamilyOptions { - pub fn new() -> ColumnFamilyOptions { - ColumnFamilyOptions { + impl CfOptions { + pub fn new() -> CfOptions { + CfOptions { disable_auto_compactions: false, level_zero_file_num_compaction_trigger: None, level_zero_slowdown_writes_trigger: None, @@ -280,7 +322,7 @@ pub mod ctor { } } - impl Default for ColumnFamilyOptions { + impl Default for CfOptions { fn default() -> Self { Self::new() } @@ -290,29 +332,33 @@ pub mod ctor { use engine_panic::PanicEngine; use engine_traits::Result; - use super::{CFOptions, DBOptions, KvEngineConstructorExt, RaftEngineConstructorExt}; + use super::{CfOptions, DbOptions, KvEngineConstructorExt, RaftEngineConstructorExt}; impl KvEngineConstructorExt for engine_panic::PanicEngine { - fn new_kv_engine( + fn new_kv_engine(_path: &str, _cfs: &[&str]) -> Result { + Ok(PanicEngine) + } + + fn new_kv_engine_opt( _path: &str, - _db_opt: Option, - _cfs: &[&str], - _opts: Option>>, + _db_opt: DbOptions, + _cfs_opts: Vec<(&str, CfOptions)>, ) -> Result { Ok(PanicEngine) } - fn new_kv_engine_opt( + fn new_tablet( _path: &str, - _db_opt: DBOptions, - _cfs_opts: Vec>, + _ctx: engine_traits::TabletContext, + _db_opt: DbOptions, + _cf_opts: Vec<(&str, CfOptions)>, ) -> Result { Ok(PanicEngine) } } impl RaftEngineConstructorExt for engine_panic::PanicEngine { - fn new_raft_engine(_path: &str, _db_opt: Option) -> Result { + fn new_raft_engine(_path: &str, _db_opt: Option) -> Result { Ok(PanicEngine) } } @@ -322,70 +368,76 @@ pub mod ctor { use engine_rocks::{ get_env, 
properties::{MvccPropertiesCollectorFactory, RangePropertiesCollectorFactory}, - raw::{ - ColumnFamilyOptions as RawRocksColumnFamilyOptions, DBOptions as RawRocksDBOptions, - }, - util::{ - new_engine as rocks_new_engine, new_engine_opt as rocks_new_engine_opt, - RocksCFOptions, - }, - RocksColumnFamilyOptions, RocksDBOptions, + util::{new_engine_opt as rocks_new_engine_opt, RangeCompactionFilterFactory}, + RocksCfOptions, RocksDbOptions, RocksPersistenceListener, + }; + use engine_traits::{ + CfOptions as _, PersistenceListener, Result, TabletContext, CF_DEFAULT, }; - use engine_traits::{ColumnFamilyOptions as ColumnFamilyOptionsTrait, Result}; use super::{ - CFOptions, ColumnFamilyOptions, DBOptions, KvEngineConstructorExt, RaftDBOptions, - RaftEngineConstructorExt, + CfOptions, DbOptions, KvEngineConstructorExt, RaftDbOptions, RaftEngineConstructorExt, }; impl KvEngineConstructorExt for engine_rocks::RocksEngine { - // FIXME this is duplicating behavior from engine_rocks::raw_util in order to + // FIXME this is duplicating behavior from engine_rocks::util in order to // call set_standard_cf_opts. 
- fn new_kv_engine( - path: &str, - db_opt: Option, - cfs: &[&str], - opts: Option>>, - ) -> Result { - let rocks_db_opts = match db_opt { - Some(db_opt) => Some(get_rocks_db_opts(db_opt)?), - None => None, - }; - let cfs_opts = match opts { - Some(opts) => opts, - None => { - let mut default_cfs_opts = Vec::with_capacity(cfs.len()); - for cf in cfs { - default_cfs_opts.push(CFOptions::new(*cf, ColumnFamilyOptions::new())); - } - default_cfs_opts - } - }; - let rocks_cfs_opts = cfs_opts + fn new_kv_engine(path: &str, cfs: &[&str]) -> Result { + let rocks_db_opt = RocksDbOptions::default(); + let default_cf_opt = CfOptions::new(); + let rocks_cfs_opts = cfs .iter() - .map(|cf_opts| { - let mut rocks_cf_opts = RocksColumnFamilyOptions::new(); - set_standard_cf_opts(rocks_cf_opts.as_raw_mut(), &cf_opts.options); - set_cf_opts(&mut rocks_cf_opts, &cf_opts.options); - RocksCFOptions::new(cf_opts.cf, rocks_cf_opts) - }) + .map(|cf_name| (*cf_name, get_rocks_cf_opts(&default_cf_opt))) .collect(); - rocks_new_engine(path, rocks_db_opts, &[], Some(rocks_cfs_opts)) + rocks_new_engine_opt(path, rocks_db_opt, rocks_cfs_opts) } fn new_kv_engine_opt( path: &str, - db_opt: DBOptions, - cfs_opts: Vec>, + db_opt: DbOptions, + cfs_opts: Vec<(&str, CfOptions)>, ) -> Result { let rocks_db_opts = get_rocks_db_opts(db_opt)?; let rocks_cfs_opts = cfs_opts .iter() - .map(|cf_opts| { - let mut rocks_cf_opts = RocksColumnFamilyOptions::new(); - set_standard_cf_opts(rocks_cf_opts.as_raw_mut(), &cf_opts.options); - set_cf_opts(&mut rocks_cf_opts, &cf_opts.options); - RocksCFOptions::new(cf_opts.cf, rocks_cf_opts) + .map(|(name, opt)| (*name, get_rocks_cf_opts(opt))) + .collect(); + rocks_new_engine_opt(path, rocks_db_opts, rocks_cfs_opts) + } + + fn new_tablet( + path: &str, + ctx: TabletContext, + db_opt: DbOptions, + cf_opts: Vec<(&str, CfOptions)>, + ) -> Result { + let mut rocks_db_opts = RocksDbOptions::default(); + let env = get_env(db_opt.key_manager.clone(), db_opt.rate_limiter)?; + 
rocks_db_opts.set_env(env); + rocks_db_opts.enable_unordered_write(false); + rocks_db_opts.enable_pipelined_write(false); + rocks_db_opts.enable_multi_batch_write(false); + rocks_db_opts.allow_concurrent_memtable_write(false); + if let Some(storage) = db_opt.state_storage + && let Some(flush_state) = ctx.flush_state { + let listener = PersistenceListener::new( + ctx.id, + ctx.suffix.unwrap(), + flush_state, + storage, + ); + rocks_db_opts.add_event_listener(RocksPersistenceListener::new(listener)); + } + let factory = + RangeCompactionFilterFactory::new(ctx.start_key.clone(), ctx.end_key.clone()); + let rocks_cfs_opts = cf_opts + .iter() + .map(|(name, opt)| { + let mut opt = get_rocks_cf_opts(opt); + // We assume `get_rocks_cf_opts` didn't set a factory already. + opt.set_compaction_filter_factory("range_filter_factory", factory.clone()) + .unwrap(); + (*name, opt) }) .collect(); rocks_new_engine_opt(path, rocks_db_opts, rocks_cfs_opts) @@ -393,24 +445,19 @@ pub mod ctor { } impl RaftEngineConstructorExt for engine_rocks::RocksEngine { - fn new_raft_engine(path: &str, db_opt: Option) -> Result { + fn new_raft_engine(path: &str, db_opt: Option) -> Result { let rocks_db_opts = match db_opt { - Some(db_opt) => Some(get_rocks_db_opts(db_opt)?), - None => None, + Some(db_opt) => get_rocks_db_opts(db_opt)?, + None => RocksDbOptions::default(), }; - let cf_opts = CFOptions::new(engine_traits::CF_DEFAULT, ColumnFamilyOptions::new()); - let mut rocks_cf_opts = RocksColumnFamilyOptions::new(); - set_standard_cf_opts(rocks_cf_opts.as_raw_mut(), &cf_opts.options); - set_cf_opts(&mut rocks_cf_opts, &cf_opts.options); - let default_cfs_opts = vec![RocksCFOptions::new(cf_opts.cf, rocks_cf_opts)]; - rocks_new_engine(path, rocks_db_opts, &[], Some(default_cfs_opts)) + let rocks_cf_opts = get_rocks_cf_opts(&CfOptions::new()); + let default_cfs_opts = vec![(CF_DEFAULT, rocks_cf_opts)]; + rocks_new_engine_opt(path, rocks_db_opts, default_cfs_opts) } } - fn set_standard_cf_opts( - 
rocks_cf_opts: &mut RawRocksColumnFamilyOptions, - cf_opts: &ColumnFamilyOptions, - ) { + fn get_rocks_cf_opts(cf_opts: &CfOptions) -> RocksCfOptions { + let mut rocks_cf_opts = RocksCfOptions::new(); if !cf_opts.get_no_range_properties() { rocks_cf_opts.add_table_properties_collector_factory( "tikv.range-properties-collector", @@ -423,30 +470,28 @@ pub mod ctor { MvccPropertiesCollectorFactory::default(), ); } - } - fn set_cf_opts( - rocks_cf_opts: &mut RocksColumnFamilyOptions, - cf_opts: &ColumnFamilyOptions, - ) { if let Some(trigger) = cf_opts.get_level_zero_file_num_compaction_trigger() { rocks_cf_opts.set_level_zero_file_num_compaction_trigger(trigger); } if let Some(trigger) = cf_opts.get_level_zero_slowdown_writes_trigger() { - rocks_cf_opts - .as_raw_mut() - .set_level_zero_slowdown_writes_trigger(trigger); + rocks_cf_opts.set_level_zero_slowdown_writes_trigger(trigger); } if cf_opts.get_disable_auto_compactions() { rocks_cf_opts.set_disable_auto_compactions(true); } + rocks_cf_opts } - fn get_rocks_db_opts(db_opts: DBOptions) -> Result { - let mut rocks_db_opts = RawRocksDBOptions::new(); + fn get_rocks_db_opts(db_opts: DbOptions) -> Result { + let mut rocks_db_opts = RocksDbOptions::default(); let env = get_env(db_opts.key_manager.clone(), db_opts.rate_limiter)?; rocks_db_opts.set_env(env); - let rocks_db_opts = RocksDBOptions::from_raw(rocks_db_opts); + if db_opts.enable_multi_batch_write { + rocks_db_opts.enable_unordered_write(false); + rocks_db_opts.enable_pipelined_write(false); + rocks_db_opts.enable_multi_batch_write(true); + } Ok(rocks_db_opts) } } @@ -455,10 +500,10 @@ pub mod ctor { use engine_traits::Result; use raft_log_engine::{RaftEngineConfig, RaftLogEngine}; - use super::{RaftDBOptions, RaftEngineConstructorExt}; + use super::{RaftDbOptions, RaftEngineConstructorExt}; impl RaftEngineConstructorExt for raft_log_engine::RaftLogEngine { - fn new_raft_engine(path: &str, db_opts: Option) -> Result { + fn new_raft_engine(path: &str, db_opts: 
Option) -> Result { let mut config = RaftEngineConfig::default(); config.dir = path.to_owned(); RaftLogEngine::new( @@ -479,13 +524,7 @@ pub fn new_temp_engine( ) -> engine_traits::Engines { let raft_path = path.path().join(std::path::Path::new("raft")); engine_traits::Engines::new( - crate::kv::new_engine( - path.path().to_str().unwrap(), - None, - engine_traits::ALL_CFS, - None, - ) - .unwrap(), + crate::kv::new_engine(path.path().to_str().unwrap(), engine_traits::ALL_CFS).unwrap(), crate::raft::new_engine(raft_path.to_str().unwrap(), None).unwrap(), ) } diff --git a/components/engine_tirocks/Cargo.toml b/components/engine_tirocks/Cargo.toml new file mode 100644 index 00000000000..b3cac78b502 --- /dev/null +++ b/components/engine_tirocks/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "engine_tirocks" +version = "0.1.0" +edition = "2021" + +[dependencies] +api_version = { workspace = true } +codec = { workspace = true } +collections = { workspace = true } +derive_more = "0.99.3" +engine_traits = { workspace = true } +keys = { workspace = true } +lazy_static = "1.4.0" +log_wrappers = { workspace = true } +prometheus = { version = "0.13", features = ["nightly"] } +prometheus-static-metric = "0.5" +slog = { workspace = true } +slog-global = { workspace = true } +slog_derive = "0.2" +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } +tirocks = { git = "https://github.com/busyjay/tirocks.git", branch = "dev" } +tracker = { workspace = true } +txn_types = { workspace = true } + +[dev-dependencies] +kvproto = { workspace = true } +rand = "0.8" +tempfile = "3.0" diff --git a/components/engine_tirocks/src/cf_options.rs b/components/engine_tirocks/src/cf_options.rs new file mode 100644 index 00000000000..fe26a6b1056 --- /dev/null +++ b/components/engine_tirocks/src/cf_options.rs @@ -0,0 +1,170 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + mem, + ops::{Deref, DerefMut}, +}; + +use tirocks::{ + option::{RawCfOptions, TitanCfOptions}, + CfOptions, +}; + +enum Options { + Rocks(CfOptions), + Titan(TitanCfOptions), + // Only used for replace. + None, +} + +pub struct RocksCfOptions(Options); + +impl RocksCfOptions { + #[inline] + pub fn is_titan(&self) -> bool { + matches!(self.0, Options::Titan(_)) + } + + #[inline] + pub fn default_titan() -> Self { + RocksCfOptions(Options::Titan(Default::default())) + } + + #[inline] + pub(crate) fn into_rocks(self) -> CfOptions { + match self.0 { + Options::Rocks(opt) => opt, + _ => panic!("it's a titan cf option"), + } + } + + #[inline] + pub(crate) fn into_titan(self) -> TitanCfOptions { + match self.0 { + Options::Titan(opt) => opt, + _ => panic!("it's not a titan cf option"), + } + } +} + +impl Default for RocksCfOptions { + #[inline] + fn default() -> Self { + RocksCfOptions(Options::Rocks(Default::default())) + } +} + +impl Deref for RocksCfOptions { + type Target = RawCfOptions; + + #[inline] + fn deref(&self) -> &Self::Target { + match &self.0 { + Options::Rocks(opt) => opt, + Options::Titan(opt) => opt, + Options::None => unreachable!(), + } + } +} + +impl DerefMut for RocksCfOptions { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + match &mut self.0 { + Options::Rocks(opt) => opt, + Options::Titan(opt) => opt, + Options::None => unreachable!(), + } + } +} + +impl engine_traits::TitanCfOptions for RocksCfOptions { + fn new() -> Self { + // TODO: should use accessor of CfOptions instead. 
+ panic!() + } + + fn set_min_blob_size(&mut self, size: u64) { + if let Options::Titan(opt) = &mut self.0 { + opt.set_min_blob_size(size); + return; + } + if let Options::Rocks(r) = mem::replace(&mut self.0, Options::None) { + let mut opt: TitanCfOptions = r.into(); + opt.set_min_blob_size(size); + self.0 = Options::Titan(opt); + return; + } + unreachable!() + } +} + +impl engine_traits::CfOptions for RocksCfOptions { + type TitanCfOptions = Self; + + #[inline] + fn new() -> Self { + Self::default() + } + + #[inline] + fn get_max_write_buffer_number(&self) -> u32 { + self.max_write_buffer_number() as u32 + } + + fn get_level_zero_slowdown_writes_trigger(&self) -> i32 { + self.level0_slowdown_writes_trigger() + } + + fn get_level_zero_stop_writes_trigger(&self) -> i32 { + self.level0_stop_writes_trigger() + } + + fn set_level_zero_file_num_compaction_trigger(&mut self, v: i32) { + self.set_level0_file_num_compaction_trigger(v); + } + + fn get_soft_pending_compaction_bytes_limit(&self) -> u64 { + self.soft_pending_compaction_bytes_limit() + } + + fn get_hard_pending_compaction_bytes_limit(&self) -> u64 { + self.hard_pending_compaction_bytes_limit() + } + + fn get_block_cache_capacity(&self) -> u64 { + // TODO: block cache should be managed by global shared resource. + panic!() + } + + fn set_block_cache_capacity(&self, _: u64) -> engine_traits::Result<()> { + // TODO: block cache should be managed by global shared resource. + panic!() + } + + fn set_titan_cf_options(&mut self, _: &Self::TitanCfOptions) { + // TODO: change to use mut accessor instead of setter. 
+ panic!() + } + + fn get_target_file_size_base(&self) -> u64 { + self.target_file_size_base() + } + + fn set_disable_auto_compactions(&mut self, v: bool) { + (**self).set_disable_auto_compactions(v); + } + + fn get_disable_auto_compactions(&self) -> bool { + self.disable_auto_compactions() + } + + fn get_disable_write_stall(&self) -> bool { + self.disable_write_stall() + } + + fn set_sst_partitioner_factory(&mut self, _: F) { + // TODO: It should be shared. + panic!() + } +} diff --git a/components/engine_tirocks/src/db_options.rs b/components/engine_tirocks/src/db_options.rs new file mode 100644 index 00000000000..e44d1fb6269 --- /dev/null +++ b/components/engine_tirocks/src/db_options.rs @@ -0,0 +1,79 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + ops::{Deref, DerefMut}, + sync::Arc, +}; + +use tirocks::{ + env::Env, + option::{RawDbOptions, TitanDbOptions}, + DbOptions, +}; + +enum Options { + Rocks(DbOptions), + Titan(TitanDbOptions), +} + +pub struct RocksDbOptions(Options); + +impl RocksDbOptions { + #[inline] + pub fn env(&self) -> Option<&Arc> { + match &self.0 { + Options::Rocks(opt) => opt.env(), + Options::Titan(opt) => opt.env(), + } + } + + #[inline] + pub fn is_titan(&self) -> bool { + matches!(self.0, Options::Titan(_)) + } + + #[inline] + pub(crate) fn into_rocks(self) -> DbOptions { + match self.0 { + Options::Rocks(opt) => opt, + _ => panic!("it's a titan option"), + } + } + + #[inline] + pub(crate) fn into_titan(self) -> TitanDbOptions { + match self.0 { + Options::Titan(opt) => opt, + _ => panic!("it's not a titan option"), + } + } +} + +impl Default for RocksDbOptions { + #[inline] + fn default() -> Self { + RocksDbOptions(Options::Rocks(Default::default())) + } +} + +impl Deref for RocksDbOptions { + type Target = RawDbOptions; + + #[inline] + fn deref(&self) -> &Self::Target { + match &self.0 { + Options::Rocks(opt) => opt, + Options::Titan(opt) => opt, + } + } +} + +impl DerefMut for RocksDbOptions { 
+ #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + match &mut self.0 { + Options::Rocks(opt) => opt, + Options::Titan(opt) => opt, + } + } +} diff --git a/components/engine_tirocks/src/db_vector.rs b/components/engine_tirocks/src/db_vector.rs new file mode 100644 index 00000000000..67a7609ac15 --- /dev/null +++ b/components/engine_tirocks/src/db_vector.rs @@ -0,0 +1,35 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + fmt::{self, Debug, Formatter}, + ops::Deref, +}; + +use tirocks::PinSlice; + +#[derive(Default)] +pub struct RocksPinSlice(pub(crate) PinSlice); + +impl engine_traits::DbVector for RocksPinSlice {} + +impl Deref for RocksPinSlice { + type Target = [u8]; + + #[inline] + fn deref(&self) -> &[u8] { + &self.0 + } +} + +impl Debug for RocksPinSlice { + fn fmt(&self, formatter: &mut Formatter<'_>) -> fmt::Result { + write!(formatter, "{:?}", &**self) + } +} + +impl<'a> PartialEq<&'a [u8]> for RocksPinSlice { + #[inline] + fn eq(&self, rhs: &&[u8]) -> bool { + **rhs == **self + } +} diff --git a/components/engine_tirocks/src/engine.rs b/components/engine_tirocks/src/engine.rs new file mode 100644 index 00000000000..c3f99cafcc6 --- /dev/null +++ b/components/engine_tirocks/src/engine.rs @@ -0,0 +1,334 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{fs, path::Path, sync::Arc}; + +use engine_traits::{Code, Error, Result, Status}; +use tirocks::{ + db::RawCfHandle, + option::{ReadOptions, WriteOptions}, + Db, Iterator, +}; + +use crate::{ + db_vector::RocksPinSlice, engine_iterator, r2e, util, RocksEngineIterator, RocksSnapshot, +}; + +#[derive(Clone, Debug)] +pub struct RocksEngine { + db: Arc, + // TODO: always enable and remove following flag + multi_batch_write: bool, +} + +impl RocksEngine { + #[inline] + pub(crate) fn new(db: Arc) -> Self { + RocksEngine { + multi_batch_write: db.db_options().multi_batch_write(), + db, + } + } + + #[inline] + pub fn exists(path: impl AsRef) -> Result { + let path = path.as_ref(); + if !path.exists() || !path.is_dir() { + return Ok(false); + } + let current_file_path = path.join("CURRENT"); + if current_file_path.exists() && current_file_path.is_file() { + return Ok(true); + } + + // If path is not an empty directory, we say db exists. If path is not an empty + // directory but db has not been created, `DB::list_column_families` fails and + // we can clean up the directory by this indication. + if fs::read_dir(&path).unwrap().next().is_some() { + Err(Error::Engine(Status::with_code(Code::Corruption))) + } else { + Ok(false) + } + } + + #[inline] + pub(crate) fn as_inner(&self) -> &Arc { + &self.db + } + + #[inline] + pub fn cf(&self, name: &str) -> Result<&RawCfHandle> { + util::cf_handle(&self.db, name) + } + + #[inline] + fn get( + &self, + opts: &engine_traits::ReadOptions, + handle: &RawCfHandle, + key: &[u8], + ) -> Result> { + let mut opt = ReadOptions::default(); + opt.set_fill_cache(opts.fill_cache()); + // TODO: reuse slice. 
+ let mut slice = RocksPinSlice::default(); + match self.db.get_pinned(&opt, handle, key, &mut slice.0) { + Ok(true) => Ok(Some(slice)), + Ok(false) => Ok(None), + Err(s) => Err(r2e(s)), + } + } + + #[inline] + fn snapshot(&self) -> RocksSnapshot { + RocksSnapshot::new(self.db.clone()) + } + + #[inline] + pub(crate) fn multi_batch_write(&self) -> bool { + self.multi_batch_write + } + + #[inline] + pub(crate) fn approximate_memtable_stats( + &self, + cf: &str, + start: &[u8], + end: &[u8], + ) -> Result<(u64, u64)> { + let handle = self.cf(cf)?; + Ok(self + .as_inner() + .approximate_mem_table_stats(handle, start, end)) + } + + // TODO: move this function when MiscExt is implemented. + #[cfg(test)] + pub(crate) fn flush(&self, cf: &str, wait: bool) -> Result<()> { + use tirocks::option::FlushOptions; + + let write_handle = self.cf(cf)?; + self.as_inner() + .flush(FlushOptions::default().set_wait(wait), write_handle) + .map_err(r2e) + } +} + +impl engine_traits::Iterable for RocksEngine { + type Iterator = RocksEngineIterator; + + fn iterator_opt(&self, cf: &str, opts: engine_traits::IterOptions) -> Result { + let opt = engine_iterator::to_tirocks_opt(opts); + let handle = self.cf(cf)?; + Ok(RocksEngineIterator::from_raw(Iterator::new( + self.db.clone(), + opt, + handle, + ))) + } +} + +impl engine_traits::Peekable for RocksEngine { + type DbVector = RocksPinSlice; + + #[inline] + fn get_value_opt( + &self, + opts: &engine_traits::ReadOptions, + key: &[u8], + ) -> Result> { + self.get(opts, self.db.default_cf(), key) + } + + #[inline] + fn get_value_cf_opt( + &self, + opts: &engine_traits::ReadOptions, + cf: &str, + key: &[u8], + ) -> Result> { + let handle = self.cf(cf)?; + self.get(opts, handle, key) + } +} + +impl engine_traits::SyncMutable for RocksEngine { + fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { + let handle = self.db.default_cf(); + self.db + .put(&WriteOptions::default(), handle, key, value) + .map_err(r2e) + } + + fn put_cf(&self, cf: &str, 
key: &[u8], value: &[u8]) -> Result<()> { + let handle = self.cf(cf)?; + self.db + .put(&WriteOptions::default(), handle, key, value) + .map_err(r2e) + } + + fn delete(&self, key: &[u8]) -> Result<()> { + let handle = self.db.default_cf(); + self.db + .delete(&WriteOptions::default(), handle, key) + .map_err(r2e) + } + + fn delete_cf(&self, cf: &str, key: &[u8]) -> Result<()> { + let handle = self.cf(cf)?; + self.db + .delete(&WriteOptions::default(), handle, key) + .map_err(r2e) + } + + fn delete_range(&self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + let handle = self.db.default_cf(); + self.db + .delete_range(&WriteOptions::default(), handle, begin_key, end_key) + .map_err(r2e) + } + + fn delete_range_cf(&self, cf: &str, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + let handle = self.cf(cf)?; + self.db + .delete_range(&WriteOptions::default(), handle, begin_key, end_key) + .map_err(r2e) + } +} + +#[cfg(test)] +mod tests { + use engine_traits::{Iterable, Peekable, SyncMutable, CF_DEFAULT}; + use kvproto::metapb::Region; + use tempfile::Builder; + + use crate::util; + + #[test] + fn test_base() { + let path = Builder::new().prefix("var").tempdir().unwrap(); + let cf = "cf"; + let engine = util::new_engine(path.path(), &[CF_DEFAULT, cf]).unwrap(); + + let mut r = Region::default(); + r.set_id(10); + + let key = b"key"; + engine.put_msg(key, &r).unwrap(); + engine.put_msg_cf(cf, key, &r).unwrap(); + + let snap = engine.snapshot(); + + let mut r1: Region = engine.get_msg(key).unwrap().unwrap(); + assert_eq!(r, r1); + let r1_cf: Region = engine.get_msg_cf(cf, key).unwrap().unwrap(); + assert_eq!(r, r1_cf); + + let mut r2: Region = snap.get_msg(key).unwrap().unwrap(); + assert_eq!(r, r2); + let r2_cf: Region = snap.get_msg_cf(cf, key).unwrap().unwrap(); + assert_eq!(r, r2_cf); + + r.set_id(11); + engine.put_msg(key, &r).unwrap(); + r1 = engine.get_msg(key).unwrap().unwrap(); + r2 = snap.get_msg(key).unwrap().unwrap(); + assert_ne!(r1, r2); + + let b: 
Option = engine.get_msg(b"missing_key").unwrap(); + assert!(b.is_none()); + } + + #[test] + fn test_peekable() { + let path = Builder::new().prefix("var").tempdir().unwrap(); + let cf = "cf"; + let engine = util::new_engine(path.path(), &[CF_DEFAULT, cf]).unwrap(); + + engine.put(b"k1", b"v1").unwrap(); + engine.put_cf(cf, b"k1", b"v2").unwrap(); + + assert_eq!(&*engine.get_value(b"k1").unwrap().unwrap(), b"v1"); + engine.get_value_cf("foo", b"k1").unwrap_err(); + assert_eq!(&*engine.get_value_cf(cf, b"k1").unwrap().unwrap(), b"v2"); + } + + #[test] + fn test_scan() { + let path = Builder::new().prefix("var").tempdir().unwrap(); + let cf = "cf"; + let engine = util::new_engine(path.path(), &[CF_DEFAULT, cf]).unwrap(); + + engine.put(b"a1", b"v1").unwrap(); + engine.put(b"a2", b"v2").unwrap(); + engine.put_cf(cf, b"a1", b"v1").unwrap(); + engine.put_cf(cf, b"a2", b"v22").unwrap(); + + let mut data = vec![]; + engine + .scan(CF_DEFAULT, b"", &[0xFF, 0xFF], false, |key, value| { + data.push((key.to_vec(), value.to_vec())); + Ok(true) + }) + .unwrap(); + assert_eq!( + data, + vec![ + (b"a1".to_vec(), b"v1".to_vec()), + (b"a2".to_vec(), b"v2".to_vec()), + ] + ); + data.clear(); + + engine + .scan(cf, b"", &[0xFF, 0xFF], false, |key, value| { + data.push((key.to_vec(), value.to_vec())); + Ok(true) + }) + .unwrap(); + assert_eq!( + data, + vec![ + (b"a1".to_vec(), b"v1".to_vec()), + (b"a2".to_vec(), b"v22".to_vec()), + ] + ); + data.clear(); + + let pair = engine.seek(CF_DEFAULT, b"a1").unwrap().unwrap(); + assert_eq!(pair, (b"a1".to_vec(), b"v1".to_vec())); + assert!(engine.seek(CF_DEFAULT, b"a3").unwrap().is_none()); + let pair_cf = engine.seek(cf, b"a1").unwrap().unwrap(); + assert_eq!(pair_cf, (b"a1".to_vec(), b"v1".to_vec())); + assert!(engine.seek(cf, b"a3").unwrap().is_none()); + + let mut index = 0; + engine + .scan(CF_DEFAULT, b"", &[0xFF, 0xFF], false, |key, value| { + data.push((key.to_vec(), value.to_vec())); + index += 1; + Ok(index != 1) + }) + .unwrap(); + 
+ assert_eq!(data.len(), 1); + + let snap = engine.snapshot(); + + engine.put(b"a3", b"v3").unwrap(); + assert!(engine.seek(CF_DEFAULT, b"a3").unwrap().is_some()); + + let pair = snap.seek(CF_DEFAULT, b"a1").unwrap().unwrap(); + assert_eq!(pair, (b"a1".to_vec(), b"v1".to_vec())); + assert!(snap.seek(CF_DEFAULT, b"a3").unwrap().is_none()); + + data.clear(); + + snap.scan(CF_DEFAULT, b"", &[0xFF, 0xFF], false, |key, value| { + data.push((key.to_vec(), value.to_vec())); + Ok(true) + }) + .unwrap(); + + assert_eq!(data.len(), 2); + } +} diff --git a/components/engine_tirocks/src/engine_iterator.rs b/components/engine_tirocks/src/engine_iterator.rs new file mode 100644 index 00000000000..37ce3bb8046 --- /dev/null +++ b/components/engine_tirocks/src/engine_iterator.rs @@ -0,0 +1,188 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use engine_traits::Result; +use tikv_util::codec::number; +use tirocks::{ + option::ReadOptions, properties::table::builtin::TableProperties, table_filter::TableFilter, + Db, Iterator, Snapshot, +}; + +use crate::r2e; + +pub struct RocksIterator<'a, D>(Iterator<'a, D>); + +impl<'a, D> RocksIterator<'a, D> { + pub fn from_raw(iter: Iterator<'a, D>) -> Self { + RocksIterator(iter) + } + + pub fn sequence(&self) -> Option { + self.0.sequence_number() + } +} + +impl<'a, D: Send> engine_traits::Iterator for RocksIterator<'a, D> { + #[inline] + fn seek(&mut self, key: &[u8]) -> Result { + self.0.seek(key); + self.valid() + } + + #[inline] + fn seek_for_prev(&mut self, key: &[u8]) -> Result { + self.0.seek_for_prev(key); + self.valid() + } + + #[inline] + fn seek_to_first(&mut self) -> Result { + self.0.seek_to_first(); + self.valid() + } + + #[inline] + fn seek_to_last(&mut self) -> Result { + self.0.seek_to_last(); + self.valid() + } + + #[inline] + fn prev(&mut self) -> Result { + #[cfg(not(feature = "nortcheck"))] + if !self.valid()? 
{ + return Err(r2e(tirocks::Status::with_code( + tirocks::Code::kInvalidArgument, + ))); + } + self.0.prev(); + self.valid() + } + + #[inline] + fn next(&mut self) -> Result { + #[cfg(not(feature = "nortcheck"))] + if !self.valid()? { + return Err(r2e(tirocks::Status::with_code( + tirocks::Code::kInvalidArgument, + ))); + } + self.0.next(); + self.valid() + } + + #[inline] + fn key(&self) -> &[u8] { + #[cfg(not(feature = "nortcheck"))] + assert!(self.valid().unwrap()); + self.0.key() + } + + #[inline] + fn value(&self) -> &[u8] { + #[cfg(not(feature = "nortcheck"))] + assert!(self.valid().unwrap()); + self.0.value() + } + + #[inline] + fn valid(&self) -> Result { + if self.0.valid() { + Ok(true) + } else { + self.0.check().map_err(r2e)?; + Ok(false) + } + } +} + +/// A filter that will only read blocks which have versions overlapping with +/// [`hint_min_ts, `hint_max_ts`]. +struct TsFilter { + hint_min_ts: Option, + hint_max_ts: Option, +} + +impl TsFilter { + fn new(hint_min_ts: Option, hint_max_ts: Option) -> TsFilter { + TsFilter { + hint_min_ts, + hint_max_ts, + } + } +} + +impl TableFilter for TsFilter { + fn filter(&self, props: &TableProperties) -> bool { + if self.hint_max_ts.is_none() && self.hint_min_ts.is_none() { + return true; + } + + let user_props = props.user_collected_properties(); + + if let Some(hint_min_ts) = self.hint_min_ts { + // TODO avoid hard code after refactor MvccProperties from + // tikv/src/raftstore/coprocessor/ into some component about engine. + if let Some(mut p) = user_props.get("tikv.max_ts") { + if let Ok(get_max) = number::decode_u64(&mut p) { + if get_max < hint_min_ts { + return false; + } + } + } + } + + if let Some(hint_max_ts) = self.hint_max_ts { + // TODO avoid hard code after refactor MvccProperties from + // tikv/src/raftstore/coprocessor/ into some component about engine. 
+ if let Some(mut p) = user_props.get("tikv.min_ts") { + if let Ok(get_min) = number::decode_u64(&mut p) { + if get_min > hint_max_ts { + return false; + } + } + } + } + + true + } +} + +/// Convert an `IterOptions` to rocksdb `ReadOptions`. +pub fn to_tirocks_opt(iter_opt: engine_traits::IterOptions) -> ReadOptions { + let mut opt = ReadOptions::default(); + opt.set_fill_cache(iter_opt.fill_cache()) + .set_max_skippable_internal_keys(iter_opt.max_skippable_internal_keys()); + if iter_opt.key_only() { + opt.set_key_only(true); + } + if iter_opt.total_order_seek_used() { + opt.set_total_order_seek(true); + // TODO: enable it. + opt.set_auto_prefix_mode(false); + } else if iter_opt.prefix_same_as_start() { + opt.set_prefix_same_as_start(true); + } + // TODO: enable it. + opt.set_adaptive_readahead(false); + + if iter_opt.hint_min_ts().is_some() || iter_opt.hint_max_ts().is_some() { + opt.set_table_filter(TsFilter::new( + iter_opt.hint_min_ts(), + iter_opt.hint_max_ts(), + )); + } + + let (lower, upper) = iter_opt.build_bounds(); + if let Some(lower) = lower { + opt.set_iterate_lower_bound(lower); + } + if let Some(upper) = upper { + opt.set_iterate_upper_bound(upper); + } + opt +} + +pub type RocksEngineIterator = RocksIterator<'static, Arc>; +pub type RocksSnapIterator = RocksIterator<'static, Arc>>>; diff --git a/components/engine_tirocks/src/lib.rs b/components/engine_tirocks/src/lib.rs new file mode 100644 index 00000000000..ecf7035b8c4 --- /dev/null +++ b/components/engine_tirocks/src/lib.rs @@ -0,0 +1,35 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! A new implementation of engine_traits using tirocks. +//! +//! When all features of engine_rocks are implemented in this module, +//! engine_rocks will be removed and TiKV will switch to tirocks. 
+ +#![cfg_attr(test, feature(test))] + +extern crate tikv_alloc as _; + +#[cfg(test)] +extern crate test; + +mod cf_options; +mod db_options; +mod db_vector; +mod engine; +mod engine_iterator; +mod logger; +mod perf_context; +mod properties; +mod snapshot; +mod status; +mod util; +mod write_batch; + +pub use engine::*; +pub use engine_iterator::*; +pub use logger::*; +pub use perf_context::*; +pub use properties::*; +pub use snapshot::RocksSnapshot; +pub use status::*; +pub use util::*; diff --git a/components/engine_tirocks/src/logger.rs b/components/engine_tirocks/src/logger.rs new file mode 100644 index 00000000000..2144577ddbf --- /dev/null +++ b/components/engine_tirocks/src/logger.rs @@ -0,0 +1,62 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use tikv_util::{crit, debug, error, info, warn}; +use tirocks::env::logger::{LogLevel, Logger}; + +pub struct RocksDbLogger; + +impl Logger for RocksDbLogger { + #[inline] + fn logv(&self, log_level: LogLevel, data: &[u8]) { + match log_level { + LogLevel::HEADER_LEVEL => { + info!(#"rocksdb_log_header", "{}", String::from_utf8_lossy(data)); + } + LogLevel::DEBUG_LEVEL => { + debug!(#"rocksdb_log", "{}", String::from_utf8_lossy(data)); + } + LogLevel::INFO_LEVEL => { + info!(#"rocksdb_log", "{}", String::from_utf8_lossy(data)); + } + LogLevel::WARN_LEVEL => { + warn!(#"rocksdb_log", "{}", String::from_utf8_lossy(data)); + } + LogLevel::ERROR_LEVEL => { + error!(#"rocksdb_log", "{}", String::from_utf8_lossy(data)); + } + LogLevel::FATAL_LEVEL => { + crit!(#"rocksdb_log", "{}", String::from_utf8_lossy(data)); + } + LogLevel::NUM_INFO_LOG_LEVELS => (), + } + } +} + +pub struct RaftDbLogger; + +impl Logger for RaftDbLogger { + #[inline] + fn logv(&self, log_level: LogLevel, data: &[u8]) { + match log_level { + LogLevel::HEADER_LEVEL => { + info!(#"raftdb_log_header", "{}", String::from_utf8_lossy(data)); + } + LogLevel::DEBUG_LEVEL => { + debug!(#"raftdb_log", "{}", String::from_utf8_lossy(data)); 
+ } + LogLevel::INFO_LEVEL => { + info!(#"raftdb_log", "{}", String::from_utf8_lossy(data)); + } + LogLevel::WARN_LEVEL => { + warn!(#"raftdb_log", "{}", String::from_utf8_lossy(data)); + } + LogLevel::ERROR_LEVEL => { + error!(#"raftdb_log", "{}", String::from_utf8_lossy(data)); + } + LogLevel::FATAL_LEVEL => { + crit!(#"raftdb_log", "{}", String::from_utf8_lossy(data)); + } + LogLevel::NUM_INFO_LOG_LEVELS => (), + } + } +} diff --git a/components/engine_tirocks/src/perf_context.rs b/components/engine_tirocks/src/perf_context.rs new file mode 100644 index 00000000000..643967230df --- /dev/null +++ b/components/engine_tirocks/src/perf_context.rs @@ -0,0 +1,669 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{fmt::Debug, marker::PhantomData, mem, ops::Sub, time::Duration}; + +use derive_more::{Add, AddAssign, Sub, SubAssign}; +use lazy_static::lazy_static; +use prometheus::*; +use prometheus_static_metric::*; +use slog_derive::KV; +use tikv_util::time::Instant; +use tirocks::perf_context::{set_perf_flags, set_perf_level, PerfContext, PerfFlag, PerfFlags}; +use tracker::{Tracker, TrackerToken, GLOBAL_TRACKERS}; + +use crate::{util, RocksEngine}; + +macro_rules! report_write_perf_context { + ($ctx:expr, $metric:ident) => { + if $ctx.perf_level != engine_traits::PerfLevel::Disable { + $ctx.write = WritePerfContext::capture(); + observe_write_time!($ctx, $metric, write_wal_time); + observe_write_time!($ctx, $metric, write_memtable_time); + observe_write_time!($ctx, $metric, db_mutex_lock_nanos); + observe_write_time!($ctx, $metric, pre_and_post_process); + observe_write_time!($ctx, $metric, write_thread_wait); + observe_write_time!($ctx, $metric, write_scheduling_flushes_compactions_time); + observe_write_time!($ctx, $metric, db_condition_wait_nanos); + observe_write_time!($ctx, $metric, write_delay_time); + } + }; +} + +macro_rules! 
observe_write_time { + ($ctx:expr, $metric:expr, $v:ident) => { + $metric.$v.observe(($ctx.write.$v) as f64 / 1e9); + }; +} + +make_auto_flush_static_metric! { + pub label_enum PerfContextType { + write_wal_time, + write_delay_time, + write_scheduling_flushes_compactions_time, + db_condition_wait_nanos, + write_memtable_time, + pre_and_post_process, + write_thread_wait, + db_mutex_lock_nanos, + } + + pub struct PerfContextTimeDuration : LocalHistogram { + "type" => PerfContextType + } +} + +lazy_static! { + pub static ref APPLY_PERF_CONTEXT_TIME_HISTOGRAM: HistogramVec = register_histogram_vec!( + "tikv_raftstore_apply_perf_context_time_duration_secs", + "Bucketed histogram of request wait time duration.", + &["type"], + exponential_buckets(0.00001, 2.0, 26).unwrap() + ) + .unwrap(); + pub static ref STORE_PERF_CONTEXT_TIME_HISTOGRAM: HistogramVec = register_histogram_vec!( + "tikv_raftstore_store_perf_context_time_duration_secs", + "Bucketed histogram of request wait time duration.", + &["type"], + exponential_buckets(0.00001, 2.0, 26).unwrap() + ) + .unwrap(); + pub static ref STORAGE_ROCKSDB_PERF_COUNTER: IntCounterVec = register_int_counter_vec!( + "tikv_storage_rocksdb_perf", + "Total number of RocksDB internal operations from PerfContext", + &["req", "metric"] + ) + .unwrap(); + pub static ref COPR_ROCKSDB_PERF_COUNTER: IntCounterVec = register_int_counter_vec!( + "tikv_coprocessor_rocksdb_perf", + "Total number of RocksDB internal operations from PerfContext", + &["req", "metric"] + ) + .unwrap(); + pub static ref APPLY_PERF_CONTEXT_TIME_HISTOGRAM_STATIC: PerfContextTimeDuration = + auto_flush_from!(APPLY_PERF_CONTEXT_TIME_HISTOGRAM, PerfContextTimeDuration); + pub static ref STORE_PERF_CONTEXT_TIME_HISTOGRAM_STATIC: PerfContextTimeDuration = + auto_flush_from!(STORE_PERF_CONTEXT_TIME_HISTOGRAM, PerfContextTimeDuration); + + + /// Default perf flags for a write operation. 
+ static ref DEFAULT_WRITE_PERF_FLAGS: PerfFlags = PerfFlags::default() + | PerfFlag::write_wal_time + | PerfFlag::write_pre_and_post_process_time + | PerfFlag::write_memtable_time + | PerfFlag::write_thread_wait_nanos + | PerfFlag::db_mutex_lock_nanos + | PerfFlag::write_scheduling_flushes_compactions_time + | PerfFlag::db_condition_wait_nanos + | PerfFlag::write_delay_time; + + /// Default perf flags for read operations. + static ref DEFAULT_READ_PERF_FLAGS: PerfFlags = PerfFlags::default() + | PerfFlag::user_key_comparison_count + | PerfFlag::block_cache_hit_count + | PerfFlag::block_read_count + | PerfFlag::block_read_byte + | PerfFlag::block_read_time + | PerfFlag::block_cache_index_hit_count + | PerfFlag::index_block_read_count + | PerfFlag::block_cache_filter_hit_count + | PerfFlag::filter_block_read_count + | PerfFlag::compression_dict_block_read_count + | PerfFlag::get_read_bytes + | PerfFlag::internal_key_skipped_count + | PerfFlag::internal_delete_skipped_count + | PerfFlag::internal_recent_skipped_count + | PerfFlag::get_snapshot_time + | PerfFlag::get_from_memtable_count + | PerfFlag::seek_on_memtable_count + | PerfFlag::next_on_memtable_count + | PerfFlag::prev_on_memtable_count + | PerfFlag::seek_child_seek_count + | PerfFlag::db_mutex_lock_nanos + | PerfFlag::db_condition_wait_nanos + | PerfFlag::bloom_memtable_hit_count + | PerfFlag::bloom_memtable_miss_count + | PerfFlag::bloom_sst_hit_count + | PerfFlag::bloom_sst_miss_count + | PerfFlag::user_key_return_count + | PerfFlag::block_cache_miss_count + | PerfFlag::bloom_filter_full_positive + | PerfFlag::bloom_filter_useful + | PerfFlag::bloom_filter_full_true_positive + | PerfFlag::bytes_read; +} + +impl engine_traits::PerfContextExt for RocksEngine { + type PerfContext = RocksPerfContext; + + fn get_perf_context( + level: engine_traits::PerfLevel, + kind: engine_traits::PerfContextKind, + ) -> Self::PerfContext { + RocksPerfContext::new(level, kind) + } +} + +#[derive(Debug)] +pub struct 
RocksPerfContext { + pub stats: PerfContextStatistics, +} + +impl RocksPerfContext { + pub fn new(level: engine_traits::PerfLevel, kind: engine_traits::PerfContextKind) -> Self { + RocksPerfContext { + stats: PerfContextStatistics::new(level, kind), + } + } +} + +impl engine_traits::PerfContext for RocksPerfContext { + fn start_observe(&mut self) { + self.stats.start() + } + + fn report_metrics(&mut self, trackers: &[TrackerToken]) { + self.stats.report(trackers) + } +} + +#[derive(Default, Debug, Clone, Copy, Add, AddAssign, Sub, SubAssign, KV)] +pub struct ReadPerfContext { + pub user_key_comparison_count: u64, + pub block_cache_hit_count: u64, + pub block_read_count: u64, + pub block_read_byte: u64, + pub block_read_time: u64, + pub block_cache_index_hit_count: u64, + pub index_block_read_count: u64, + pub block_cache_filter_hit_count: u64, + pub filter_block_read_count: u64, + pub block_checksum_time: u64, + pub block_decompress_time: u64, + pub get_read_bytes: u64, + pub iter_read_bytes: u64, + pub internal_key_skipped_count: u64, + pub internal_delete_skipped_count: u64, + pub internal_recent_skipped_count: u64, + pub get_snapshot_time: u64, + pub get_from_memtable_time: u64, + pub get_from_memtable_count: u64, + pub get_post_process_time: u64, + pub get_from_output_files_time: u64, + pub seek_on_memtable_time: u64, + pub seek_on_memtable_count: u64, + pub next_on_memtable_count: u64, + pub prev_on_memtable_count: u64, + pub seek_child_seek_time: u64, + pub seek_child_seek_count: u64, + pub seek_min_heap_time: u64, + pub seek_max_heap_time: u64, + pub seek_internal_seek_time: u64, + pub db_mutex_lock_nanos: u64, + pub db_condition_wait_nanos: u64, + pub read_index_block_nanos: u64, + pub read_filter_block_nanos: u64, + pub new_table_block_iter_nanos: u64, + pub new_table_iterator_nanos: u64, + pub block_seek_nanos: u64, + pub find_table_nanos: u64, + pub bloom_memtable_hit_count: u64, + pub bloom_memtable_miss_count: u64, + pub bloom_sst_hit_count: u64, + pub 
bloom_sst_miss_count: u64, + pub get_cpu_nanos: u64, + pub iter_next_cpu_nanos: u64, + pub iter_prev_cpu_nanos: u64, + pub iter_seek_cpu_nanos: u64, + pub encrypt_data_nanos: u64, + pub decrypt_data_nanos: u64, +} + +impl ReadPerfContext { + fn report_to_tracker(&self, tracker: &mut Tracker) { + tracker.metrics.block_cache_hit_count += self.block_cache_hit_count; + tracker.metrics.block_read_byte += self.block_read_byte; + tracker.metrics.block_read_count += self.block_read_count; + tracker.metrics.block_read_nanos += self.block_read_time; + tracker.metrics.deleted_key_skipped_count += self.internal_delete_skipped_count; + tracker.metrics.internal_key_skipped_count += self.internal_key_skipped_count; + } +} + +#[derive(Default, Debug, Clone, Copy, Add, AddAssign, Sub, SubAssign, KV)] +pub struct WritePerfContext { + pub write_wal_time: u64, + pub pre_and_post_process: u64, + pub write_memtable_time: u64, + pub write_thread_wait: u64, + pub db_mutex_lock_nanos: u64, + pub write_scheduling_flushes_compactions_time: u64, + pub db_condition_wait_nanos: u64, + pub write_delay_time: u64, +} + +#[derive(Debug)] +pub struct PerfContextStatistics { + perf_level: engine_traits::PerfLevel, + kind: engine_traits::PerfContextKind, + read: ReadPerfContext, + write: WritePerfContext, + last_flush_time: Instant, +} + +const FLUSH_METRICS_INTERVAL: Duration = Duration::from_secs(2); + +impl PerfContextStatistics { + /// Create an instance which stores instant statistics values, retrieved at + /// creation. 
+ pub fn new(perf_level: engine_traits::PerfLevel, kind: engine_traits::PerfContextKind) -> Self { + PerfContextStatistics { + perf_level, + kind, + read: Default::default(), + write: Default::default(), + last_flush_time: Instant::now_coarse(), + } + } + + fn apply_perf_settings(&self) { + if self.perf_level == engine_traits::PerfLevel::Uninitialized { + match self.kind { + engine_traits::PerfContextKind::Storage(_) + | engine_traits::PerfContextKind::Coprocessor(_) => { + set_perf_flags(&DEFAULT_READ_PERF_FLAGS) + } + engine_traits::PerfContextKind::RaftstoreStore + | engine_traits::PerfContextKind::RaftstoreApply => { + set_perf_flags(&DEFAULT_WRITE_PERF_FLAGS) + } + } + } else { + set_perf_level(util::to_rocks_perf_level(self.perf_level)); + } + } + + pub fn start(&mut self) { + if self.perf_level == engine_traits::PerfLevel::Disable { + return; + } + let mut ctx = PerfContext::get(); + ctx.reset(); + self.apply_perf_settings(); + } + + pub fn report(&mut self, trackers: &[TrackerToken]) { + match self.kind { + engine_traits::PerfContextKind::RaftstoreApply => { + report_write_perf_context!(self, APPLY_PERF_CONTEXT_TIME_HISTOGRAM_STATIC); + for token in trackers { + GLOBAL_TRACKERS.with_tracker(*token, |t| { + t.metrics.apply_mutex_lock_nanos = self.write.db_mutex_lock_nanos; + t.metrics.apply_thread_wait_nanos = self.write.write_thread_wait; + t.metrics.apply_write_wal_nanos = self.write.write_wal_time; + t.metrics.apply_write_memtable_nanos = self.write.write_memtable_time; + }); + } + } + engine_traits::PerfContextKind::RaftstoreStore => { + report_write_perf_context!(self, STORE_PERF_CONTEXT_TIME_HISTOGRAM_STATIC); + for token in trackers { + GLOBAL_TRACKERS.with_tracker(*token, |t| { + t.metrics.store_mutex_lock_nanos = self.write.db_mutex_lock_nanos; + t.metrics.store_thread_wait_nanos = self.write.write_thread_wait; + t.metrics.store_write_wal_nanos = self.write.write_wal_time; + t.metrics.store_write_memtable_nanos = self.write.write_memtable_time; + 
}); + } + } + engine_traits::PerfContextKind::Storage(_) + | engine_traits::PerfContextKind::Coprocessor(_) => { + let perf_context = ReadPerfContext::capture(); + for token in trackers { + GLOBAL_TRACKERS.with_tracker(*token, |t| perf_context.report_to_tracker(t)); + } + self.read += perf_context; + self.maybe_flush_read_metrics(); + } + } + } + + fn maybe_flush_read_metrics(&mut self) { + if self.last_flush_time.saturating_elapsed() < FLUSH_METRICS_INTERVAL { + return; + } + self.last_flush_time = Instant::now_coarse(); + let ctx = mem::take(&mut self.read); + let (v, tag) = match self.kind { + engine_traits::PerfContextKind::Storage(tag) => (&*STORAGE_ROCKSDB_PERF_COUNTER, tag), + engine_traits::PerfContextKind::Coprocessor(tag) => (&*COPR_ROCKSDB_PERF_COUNTER, tag), + _ => unreachable!(), + }; + v.get_metric_with_label_values(&[tag, "user_key_comparison_count"]) + .unwrap() + .inc_by(ctx.user_key_comparison_count); + v.get_metric_with_label_values(&[tag, "block_cache_hit_count"]) + .unwrap() + .inc_by(ctx.block_cache_hit_count); + v.get_metric_with_label_values(&[tag, "block_read_count"]) + .unwrap() + .inc_by(ctx.block_read_count); + v.get_metric_with_label_values(&[tag, "block_read_byte"]) + .unwrap() + .inc_by(ctx.block_read_byte); + v.get_metric_with_label_values(&[tag, "block_read_time"]) + .unwrap() + .inc_by(ctx.block_read_time); + v.get_metric_with_label_values(&[tag, "block_cache_index_hit_count"]) + .unwrap() + .inc_by(ctx.block_cache_index_hit_count); + v.get_metric_with_label_values(&[tag, "index_block_read_count"]) + .unwrap() + .inc_by(ctx.index_block_read_count); + v.get_metric_with_label_values(&[tag, "block_cache_filter_hit_count"]) + .unwrap() + .inc_by(ctx.block_cache_filter_hit_count); + v.get_metric_with_label_values(&[tag, "filter_block_read_count"]) + .unwrap() + .inc_by(ctx.filter_block_read_count); + v.get_metric_with_label_values(&[tag, "block_checksum_time"]) + .unwrap() + .inc_by(ctx.block_checksum_time); + 
v.get_metric_with_label_values(&[tag, "block_decompress_time"]) + .unwrap() + .inc_by(ctx.block_decompress_time); + v.get_metric_with_label_values(&[tag, "get_read_bytes"]) + .unwrap() + .inc_by(ctx.get_read_bytes); + v.get_metric_with_label_values(&[tag, "iter_read_bytes"]) + .unwrap() + .inc_by(ctx.iter_read_bytes); + v.get_metric_with_label_values(&[tag, "internal_key_skipped_count"]) + .unwrap() + .inc_by(ctx.internal_key_skipped_count); + v.get_metric_with_label_values(&[tag, "internal_delete_skipped_count"]) + .unwrap() + .inc_by(ctx.internal_delete_skipped_count); + v.get_metric_with_label_values(&[tag, "internal_recent_skipped_count"]) + .unwrap() + .inc_by(ctx.internal_recent_skipped_count); + v.get_metric_with_label_values(&[tag, "get_snapshot_time"]) + .unwrap() + .inc_by(ctx.get_snapshot_time); + v.get_metric_with_label_values(&[tag, "get_from_memtable_time"]) + .unwrap() + .inc_by(ctx.get_from_memtable_time); + v.get_metric_with_label_values(&[tag, "get_from_memtable_count"]) + .unwrap() + .inc_by(ctx.get_from_memtable_count); + v.get_metric_with_label_values(&[tag, "get_post_process_time"]) + .unwrap() + .inc_by(ctx.get_post_process_time); + v.get_metric_with_label_values(&[tag, "get_from_output_files_time"]) + .unwrap() + .inc_by(ctx.get_from_output_files_time); + v.get_metric_with_label_values(&[tag, "seek_on_memtable_time"]) + .unwrap() + .inc_by(ctx.seek_on_memtable_time); + v.get_metric_with_label_values(&[tag, "seek_on_memtable_count"]) + .unwrap() + .inc_by(ctx.seek_on_memtable_count); + v.get_metric_with_label_values(&[tag, "next_on_memtable_count"]) + .unwrap() + .inc_by(ctx.next_on_memtable_count); + v.get_metric_with_label_values(&[tag, "prev_on_memtable_count"]) + .unwrap() + .inc_by(ctx.prev_on_memtable_count); + v.get_metric_with_label_values(&[tag, "seek_child_seek_time"]) + .unwrap() + .inc_by(ctx.seek_child_seek_time); + v.get_metric_with_label_values(&[tag, "seek_child_seek_count"]) + .unwrap() + .inc_by(ctx.seek_child_seek_count); + 
v.get_metric_with_label_values(&[tag, "seek_min_heap_time"]) + .unwrap() + .inc_by(ctx.seek_min_heap_time); + v.get_metric_with_label_values(&[tag, "seek_max_heap_time"]) + .unwrap() + .inc_by(ctx.seek_max_heap_time); + v.get_metric_with_label_values(&[tag, "seek_internal_seek_time"]) + .unwrap() + .inc_by(ctx.seek_internal_seek_time); + v.get_metric_with_label_values(&[tag, "db_mutex_lock_nanos"]) + .unwrap() + .inc_by(ctx.db_mutex_lock_nanos); + v.get_metric_with_label_values(&[tag, "db_condition_wait_nanos"]) + .unwrap() + .inc_by(ctx.db_condition_wait_nanos); + v.get_metric_with_label_values(&[tag, "read_index_block_nanos"]) + .unwrap() + .inc_by(ctx.read_index_block_nanos); + v.get_metric_with_label_values(&[tag, "read_filter_block_nanos"]) + .unwrap() + .inc_by(ctx.read_filter_block_nanos); + v.get_metric_with_label_values(&[tag, "new_table_block_iter_nanos"]) + .unwrap() + .inc_by(ctx.new_table_block_iter_nanos); + v.get_metric_with_label_values(&[tag, "new_table_iterator_nanos"]) + .unwrap() + .inc_by(ctx.new_table_iterator_nanos); + v.get_metric_with_label_values(&[tag, "block_seek_nanos"]) + .unwrap() + .inc_by(ctx.block_seek_nanos); + v.get_metric_with_label_values(&[tag, "find_table_nanos"]) + .unwrap() + .inc_by(ctx.find_table_nanos); + v.get_metric_with_label_values(&[tag, "bloom_memtable_hit_count"]) + .unwrap() + .inc_by(ctx.bloom_memtable_hit_count); + v.get_metric_with_label_values(&[tag, "bloom_memtable_miss_count"]) + .unwrap() + .inc_by(ctx.bloom_memtable_miss_count); + v.get_metric_with_label_values(&[tag, "bloom_sst_hit_count"]) + .unwrap() + .inc_by(ctx.bloom_sst_hit_count); + v.get_metric_with_label_values(&[tag, "bloom_sst_miss_count"]) + .unwrap() + .inc_by(ctx.bloom_sst_miss_count); + v.get_metric_with_label_values(&[tag, "get_cpu_nanos"]) + .unwrap() + .inc_by(ctx.get_cpu_nanos); + v.get_metric_with_label_values(&[tag, "iter_next_cpu_nanos"]) + .unwrap() + .inc_by(ctx.iter_next_cpu_nanos); + v.get_metric_with_label_values(&[tag, 
"iter_prev_cpu_nanos"]) + .unwrap() + .inc_by(ctx.iter_prev_cpu_nanos); + v.get_metric_with_label_values(&[tag, "iter_seek_cpu_nanos"]) + .unwrap() + .inc_by(ctx.iter_seek_cpu_nanos); + v.get_metric_with_label_values(&[tag, "encrypt_data_nanos"]) + .unwrap() + .inc_by(ctx.encrypt_data_nanos); + v.get_metric_with_label_values(&[tag, "decrypt_data_nanos"]) + .unwrap() + .inc_by(ctx.decrypt_data_nanos); + } +} + +pub trait PerfContextFields: Debug + Clone + Copy + Sub + slog::KV { + fn capture() -> Self; +} + +// TODO: PerfStatisticsInstant are leaked details of the underlying engine. +// It's better to clean up direct usages of it in TiKV except in tests. +// Switch to use the perf context of the engine_trait. +// +/// Store statistics we need. Data comes from RocksDB's `PerfContext`. +/// This statistics store instant values. +#[derive(Debug, Clone)] +pub struct PerfStatisticsInstant { + inner: P, + // The phantom is to make this type !Send and !Sync + _phantom: PhantomData<*const ()>, +} + +pub type ReadPerfInstant = PerfStatisticsInstant; +pub type WritePerfInstant = PerfStatisticsInstant; + +impl PerfStatisticsInstant

{ + pub fn new() -> Self { + Self { + inner: P::capture(), + _phantom: PhantomData, + } + } + + pub fn delta(&self) -> P { + P::capture() - self.inner + } +} + +impl Default for PerfStatisticsInstant

{ + fn default() -> Self { + Self::new() + } +} + +impl slog::KV for PerfStatisticsInstant

{ + fn serialize( + &self, + record: &::slog::Record<'_>, + serializer: &mut dyn slog::Serializer, + ) -> slog::Result { + slog::KV::serialize(&self.inner, record, serializer) + } +} + +impl PerfContextFields for ReadPerfContext { + fn capture() -> Self { + let perf_context = PerfContext::get(); + ReadPerfContext { + user_key_comparison_count: perf_context.user_key_comparison_count(), + block_cache_hit_count: perf_context.block_cache_hit_count(), + block_read_count: perf_context.block_read_count(), + block_read_byte: perf_context.block_read_byte(), + block_read_time: perf_context.block_read_time(), + block_cache_index_hit_count: perf_context.block_cache_index_hit_count(), + index_block_read_count: perf_context.index_block_read_count(), + block_cache_filter_hit_count: perf_context.block_cache_filter_hit_count(), + filter_block_read_count: perf_context.filter_block_read_count(), + block_checksum_time: perf_context.block_checksum_time(), + block_decompress_time: perf_context.block_decompress_time(), + get_read_bytes: perf_context.get_read_bytes(), + iter_read_bytes: perf_context.iter_read_bytes(), + internal_key_skipped_count: perf_context.internal_key_skipped_count(), + internal_delete_skipped_count: perf_context.internal_delete_skipped_count(), + internal_recent_skipped_count: perf_context.internal_recent_skipped_count(), + get_snapshot_time: perf_context.get_snapshot_time(), + get_from_memtable_time: perf_context.get_from_memtable_time(), + get_from_memtable_count: perf_context.get_from_memtable_count(), + get_post_process_time: perf_context.get_post_process_time(), + get_from_output_files_time: perf_context.get_from_output_files_time(), + seek_on_memtable_time: perf_context.seek_on_memtable_time(), + seek_on_memtable_count: perf_context.seek_on_memtable_count(), + next_on_memtable_count: perf_context.next_on_memtable_count(), + prev_on_memtable_count: perf_context.prev_on_memtable_count(), + seek_child_seek_time: perf_context.seek_child_seek_time(), + 
seek_child_seek_count: perf_context.seek_child_seek_count(), + seek_min_heap_time: perf_context.seek_min_heap_time(), + seek_max_heap_time: perf_context.seek_max_heap_time(), + seek_internal_seek_time: perf_context.seek_internal_seek_time(), + db_mutex_lock_nanos: perf_context.db_mutex_lock_nanos(), + db_condition_wait_nanos: perf_context.db_condition_wait_nanos(), + read_index_block_nanos: perf_context.read_index_block_nanos(), + read_filter_block_nanos: perf_context.read_filter_block_nanos(), + new_table_block_iter_nanos: perf_context.new_table_block_iter_nanos(), + new_table_iterator_nanos: perf_context.new_table_iterator_nanos(), + block_seek_nanos: perf_context.block_seek_nanos(), + find_table_nanos: perf_context.find_table_nanos(), + bloom_memtable_hit_count: perf_context.bloom_memtable_hit_count(), + bloom_memtable_miss_count: perf_context.bloom_memtable_miss_count(), + bloom_sst_hit_count: perf_context.bloom_sst_hit_count(), + bloom_sst_miss_count: perf_context.bloom_sst_miss_count(), + get_cpu_nanos: perf_context.get_cpu_nanos(), + iter_next_cpu_nanos: perf_context.iter_next_cpu_nanos(), + iter_prev_cpu_nanos: perf_context.iter_prev_cpu_nanos(), + iter_seek_cpu_nanos: perf_context.iter_seek_cpu_nanos(), + encrypt_data_nanos: perf_context.encrypt_data_nanos(), + decrypt_data_nanos: perf_context.decrypt_data_nanos(), + } + } +} + +impl PerfContextFields for WritePerfContext { + fn capture() -> Self { + let perf_context = PerfContext::get(); + WritePerfContext { + write_wal_time: perf_context.write_wal_time(), + pre_and_post_process: perf_context.write_pre_and_post_process_time(), + write_memtable_time: perf_context.write_memtable_time(), + write_thread_wait: perf_context.write_thread_wait_nanos(), + db_mutex_lock_nanos: perf_context.db_mutex_lock_nanos(), + write_scheduling_flushes_compactions_time: perf_context + .write_scheduling_flushes_compactions_time(), + db_condition_wait_nanos: perf_context.db_condition_wait_nanos(), + write_delay_time: 
perf_context.write_delay_time(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_field_operations() { + let f1 = ReadPerfContext { + internal_key_skipped_count: 1, + internal_delete_skipped_count: 2, + block_cache_hit_count: 3, + block_read_count: 4, + block_read_byte: 5, + ..Default::default() + }; + let f2 = ReadPerfContext { + internal_key_skipped_count: 2, + internal_delete_skipped_count: 3, + block_cache_hit_count: 5, + block_read_count: 7, + block_read_byte: 11, + ..Default::default() + }; + let f3 = f1 + f2; + assert_eq!(f3.internal_key_skipped_count, 3); + assert_eq!(f3.block_cache_hit_count, 8); + assert_eq!(f3.block_read_byte, 16); + + let mut f3 = f1; + f3 += f2; + assert_eq!(f3.internal_key_skipped_count, 3); + assert_eq!(f3.block_cache_hit_count, 8); + assert_eq!(f3.block_read_byte, 16); + + let f3 = f2 - f1; + assert_eq!(f3.internal_key_skipped_count, 1); + assert_eq!(f3.block_cache_hit_count, 2); + assert_eq!(f3.block_read_byte, 6); + + let mut f3 = f2; + f3 -= f1; + assert_eq!(f3.internal_key_skipped_count, 1); + assert_eq!(f3.block_cache_hit_count, 2); + assert_eq!(f3.block_read_byte, 6); + } + + #[test] + fn test_deref() { + let mut stats = ReadPerfContext { + internal_key_skipped_count: 1, + internal_delete_skipped_count: 2, + block_cache_hit_count: 3, + block_read_count: 4, + block_read_byte: 5, + ..Default::default() + }; + assert_eq!(stats.block_cache_hit_count, 3); + stats.block_cache_hit_count = 6; + assert_eq!(stats.block_cache_hit_count, 6); + } +} diff --git a/components/engine_tirocks/src/properties/mod.rs b/components/engine_tirocks/src/properties/mod.rs new file mode 100644 index 00000000000..967273aae3a --- /dev/null +++ b/components/engine_tirocks/src/properties/mod.rs @@ -0,0 +1,164 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +mod mvcc; +mod range; +mod table; +mod ttl; + +use std::{ + cmp, + collections::BTreeMap, + io::Read, + ops::{Deref, DerefMut}, +}; + +use codec::{ + number::NumberCodec, + prelude::{NumberDecoder, NumberEncoder}, +}; +use collections::HashMap; +use tirocks::properties::table::user::UserCollectedProperties; + +pub use self::{ + mvcc::MvccPropertiesCollectorFactory, + range::{RangeProperties, RangePropertiesCollectorFactory}, + table::{RocksTablePropertiesCollection, RocksUserCollectedProperties}, + ttl::TtlPropertiesCollectorFactory, +}; + +/// A struct to help collect properties. +/// +/// The properties of a file can be collected by ranges. Every range will be +/// referenced by a `PropIndex`. +#[derive(Clone, Debug, Default)] +pub struct PropIndex { + /// The properties calculated from the range. The range starts from + /// `offset` of previous `PropIndex` to this `offset`. How large the range + /// is depends on the implementation. + pub prop: u64, + /// The offset in the file. Offsets are not necessary the size of file. It + /// only makes sense to the implementations. 
+ pub offset: u64, +} + +#[derive(Debug, Default)] +pub struct PropIndexes(BTreeMap, PropIndex>); + +impl Deref for PropIndexes { + type Target = BTreeMap, PropIndex>; + fn deref(&self) -> &BTreeMap, PropIndex> { + &self.0 + } +} + +impl DerefMut for PropIndexes { + fn deref_mut(&mut self) -> &mut BTreeMap, PropIndex> { + &mut self.0 + } +} + +impl PropIndexes { + pub fn new() -> PropIndexes { + PropIndexes(BTreeMap::new()) + } + + pub fn into_map(self) -> BTreeMap, PropIndex> { + self.0 + } + + pub fn add(&mut self, key: Vec, index: PropIndex) { + self.0.insert(key, index); + } + + // Format: | klen | k | v.size | v.offset | + pub fn encode(&self) -> Vec { + let cap = cmp::min((8 * 3 + 24) * self.0.len(), 1024); + let mut buf = Vec::with_capacity(cap); + for (k, v) in &self.0 { + buf.write_u64(k.len() as u64).unwrap(); + buf.extend(k); + buf.write_u64(v.prop).unwrap(); + buf.write_u64(v.offset).unwrap(); + } + buf + } + + pub fn decode(mut buf: &[u8]) -> codec::Result { + let mut res = BTreeMap::new(); + while !buf.is_empty() { + let klen = buf.read_u64()?; + let mut k = vec![0; klen as usize]; + buf.read_exact(&mut k)?; + let v = PropIndex { + prop: buf.read_u64()?, + offset: buf.read_u64()?, + }; + res.insert(k, v); + } + Ok(PropIndexes(res)) + } +} + +trait EncodeProperties { + fn encode(&mut self, name: &str, value: &[u8]); + + #[inline] + fn encode_u64(&mut self, name: &str, value: u64) { + let mut buf = [0; 8]; + NumberCodec::encode_u64(&mut buf, value); + self.encode(name, &buf); + } + + #[inline] + fn encode_indexes(&mut self, name: &str, indexes: &PropIndexes) { + self.encode(name, &indexes.encode()); + } +} + +impl EncodeProperties for UserCollectedProperties { + #[inline] + fn encode(&mut self, name: &str, value: &[u8]) { + self.add(name.as_bytes(), value); + } +} + +impl EncodeProperties for HashMap, Vec> { + #[inline] + fn encode(&mut self, name: &str, value: &[u8]) { + self.insert(name.as_bytes().to_owned(), value.to_owned()); + } +} + +trait 
DecodeProperties { + fn decode(&self, k: &str) -> codec::Result<&[u8]>; + + #[inline] + fn decode_u64(&self, k: &str) -> codec::Result { + let mut buf = self.decode(k)?; + buf.read_u64() + } + + #[inline] + fn decode_indexes(&self, k: &str) -> codec::Result { + let buf = self.decode(k)?; + PropIndexes::decode(buf) + } +} + +impl DecodeProperties for UserCollectedProperties { + #[inline] + fn decode(&self, k: &str) -> codec::Result<&[u8]> { + self.get(k.as_bytes()) + .ok_or_else(|| codec::ErrorInner::KeyNotFound.into()) + } +} + +impl DecodeProperties for HashMap, Vec> { + #[inline] + fn decode(&self, k: &str) -> codec::Result<&[u8]> { + match self.get(k.as_bytes()) { + Some(v) => Ok(v.as_slice()), + None => Err(codec::ErrorInner::KeyNotFound.into()), + } + } +} diff --git a/components/engine_tirocks/src/properties/mvcc.rs b/components/engine_tirocks/src/properties/mvcc.rs new file mode 100644 index 00000000000..1ca170f33d5 --- /dev/null +++ b/components/engine_tirocks/src/properties/mvcc.rs @@ -0,0 +1,364 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{cmp, ffi::CStr}; + +use api_version::{ApiV2, KeyMode, KvFormat}; +use engine_traits::{raw_ttl::ttl_current_ts, MvccProperties}; +use tirocks::properties::table::user::{ + Context, EntryType, SequenceNumber, TablePropertiesCollector, TablePropertiesCollectorFactory, + UserCollectedProperties, +}; +use txn_types::{Key, TimeStamp, Write, WriteType}; + +use super::{DecodeProperties, EncodeProperties, PropIndex, PropIndexes}; +use crate::RocksEngine; + +pub const PROP_NUM_ERRORS: &str = "tikv.num_errors"; +pub const PROP_MIN_TS: &str = "tikv.min_ts"; +pub const PROP_MAX_TS: &str = "tikv.max_ts"; +pub const PROP_NUM_ROWS: &str = "tikv.num_rows"; +pub const PROP_NUM_PUTS: &str = "tikv.num_puts"; +pub const PROP_NUM_DELETES: &str = "tikv.num_deletes"; +pub const PROP_NUM_VERSIONS: &str = "tikv.num_versions"; +pub const PROP_MAX_ROW_VERSIONS: &str = "tikv.max_row_versions"; +pub const PROP_ROWS_INDEX: &str = "tikv.rows_index"; +pub const PROP_ROWS_INDEX_DISTANCE: u64 = 10000; + +/// Can be used for write CF in TiDB & TxnKV scenario, or be used for default CF +/// in RawKV scenario. +pub struct MvccPropertiesCollector { + name: &'static CStr, + props: MvccProperties, + last_row: Vec, + num_errors: u64, + row_versions: u64, + cur_prop_index: PropIndex, + row_prop_indexes: PropIndexes, + key_mode: KeyMode, // Use KeyMode::Txn for both TiDB & TxnKV, KeyMode::Raw for RawKV. + current_ts: u64, +} + +impl MvccPropertiesCollector { + fn new(name: &'static CStr, key_mode: KeyMode) -> MvccPropertiesCollector { + MvccPropertiesCollector { + name, + props: MvccProperties::new(), + last_row: Vec::new(), + num_errors: 0, + row_versions: 0, + cur_prop_index: PropIndex::default(), + row_prop_indexes: PropIndexes::new(), + key_mode, + current_ts: ttl_current_ts(), + } + } + + fn finish(&mut self, properties: &mut impl EncodeProperties) { + // Insert last handle. 
+ if self.cur_prop_index.prop > 0 { + self.row_prop_indexes + .insert(self.last_row.clone(), self.cur_prop_index.clone()); + } + encode_mvcc(&self.props, properties); + properties.encode_u64(PROP_NUM_ERRORS, self.num_errors); + properties.encode_indexes(PROP_ROWS_INDEX, &self.row_prop_indexes); + } +} + +impl TablePropertiesCollector for MvccPropertiesCollector { + fn name(&self) -> &CStr { + self.name + } + + fn add( + &mut self, + key: &[u8], + value: &[u8], + entry_type: EntryType, + _: SequenceNumber, + _: u64, + ) -> tirocks::Result<()> { + // TsFilter filters sst based on max_ts and min_ts during iterating. + // To prevent seeing outdated (GC) records, we should consider + // RocksDB delete entry type. + if entry_type != EntryType::kEntryPut && entry_type != EntryType::kEntryDelete { + return Ok(()); + } + + if !keys::validate_data_key(key) { + self.num_errors += 1; + return Ok(()); + } + + let (k, ts) = match Key::split_on_ts_for(key) { + Ok((k, ts)) => (k, ts), + Err(_) => { + self.num_errors += 1; + return Ok(()); + } + }; + + self.props.min_ts = cmp::min(self.props.min_ts, ts); + self.props.max_ts = cmp::max(self.props.max_ts, ts); + if entry_type == EntryType::kEntryDelete { + // Empty value for delete entry type, skip following properties. 
+ return Ok(()); + } + + self.props.num_versions += 1; + + if k != self.last_row.as_slice() { + self.props.num_rows += 1; + self.row_versions = 1; + self.last_row.clear(); + self.last_row.extend(k); + } else { + self.row_versions += 1; + } + if self.row_versions > self.props.max_row_versions { + self.props.max_row_versions = self.row_versions; + } + + if self.key_mode == KeyMode::Raw { + let decode_raw_value = ApiV2::decode_raw_value(value); + match decode_raw_value { + Ok(raw_value) => { + if raw_value.is_valid(self.current_ts) { + self.props.num_puts += 1; + } else { + self.props.num_deletes += 1; + } + } + Err(_) => { + self.num_errors += 1; + } + } + } else { + let write_type = match Write::parse_type(value) { + Ok(v) => v, + Err(_) => { + self.num_errors += 1; + return Ok(()); + } + }; + + match write_type { + WriteType::Put => self.props.num_puts += 1, + WriteType::Delete => self.props.num_deletes += 1, + _ => {} + } + } + + // Add new row. + if self.row_versions == 1 { + self.cur_prop_index.prop += 1; + self.cur_prop_index.offset += 1; + if self.cur_prop_index.offset == 1 + || self.cur_prop_index.prop >= PROP_ROWS_INDEX_DISTANCE + { + self.row_prop_indexes + .insert(self.last_row.clone(), self.cur_prop_index.clone()); + self.cur_prop_index.prop = 0; + } + } + Ok(()) + } + + fn finish(&mut self, properties: &mut UserCollectedProperties) -> tirocks::Result<()> { + self.finish(properties); + Ok(()) + } +} + +/// Can be used for write CF of TiDB/TxnKV, default CF of RawKV. 
+pub struct MvccPropertiesCollectorFactory { + name: &'static CStr, + key_mode: KeyMode, +} + +impl Default for MvccPropertiesCollectorFactory { + fn default() -> Self { + Self { + name: CStr::from_bytes_with_nul(b"tikv.mvcc-properties-collector\0").unwrap(), + key_mode: KeyMode::Txn, + } + } +} + +impl MvccPropertiesCollectorFactory { + pub fn rawkv() -> Self { + Self { + name: CStr::from_bytes_with_nul(b"tikv.rawkv-mvcc-properties-collector\0").unwrap(), + key_mode: KeyMode::Raw, + } + } +} + +impl TablePropertiesCollectorFactory for MvccPropertiesCollectorFactory { + type Collector = MvccPropertiesCollector; + + fn name(&self) -> &CStr { + self.name + } + + fn create_table_properties_collector(&self, _: Context) -> Self::Collector { + MvccPropertiesCollector::new(self.name, self.key_mode) + } +} + +fn encode_mvcc(mvcc_props: &MvccProperties, props: &mut impl EncodeProperties) { + props.encode_u64(PROP_MIN_TS, mvcc_props.min_ts.into_inner()); + props.encode_u64(PROP_MAX_TS, mvcc_props.max_ts.into_inner()); + props.encode_u64(PROP_NUM_ROWS, mvcc_props.num_rows); + props.encode_u64(PROP_NUM_PUTS, mvcc_props.num_puts); + props.encode_u64(PROP_NUM_DELETES, mvcc_props.num_deletes); + props.encode_u64(PROP_NUM_VERSIONS, mvcc_props.num_versions); + props.encode_u64(PROP_MAX_ROW_VERSIONS, mvcc_props.max_row_versions); +} + +pub(super) fn decode_mvcc(props: &impl DecodeProperties) -> codec::Result { + let mut res = MvccProperties::new(); + res.min_ts = props.decode_u64(PROP_MIN_TS)?.into(); + res.max_ts = props.decode_u64(PROP_MAX_TS)?.into(); + res.num_rows = props.decode_u64(PROP_NUM_ROWS)?; + res.num_puts = props.decode_u64(PROP_NUM_PUTS)?; + res.num_versions = props.decode_u64(PROP_NUM_VERSIONS)?; + // To be compatible with old versions. 
+ res.num_deletes = props + .decode_u64(PROP_NUM_DELETES) + .unwrap_or(res.num_versions - res.num_puts); + res.max_row_versions = props.decode_u64(PROP_MAX_ROW_VERSIONS)?; + Ok(res) +} + +impl engine_traits::MvccPropertiesExt for RocksEngine { + fn get_mvcc_properties_cf( + &self, + cf: &str, + safe_point: TimeStamp, + start_key: &[u8], + end_key: &[u8], + ) -> Option { + let collection = match self.range_properties(cf, start_key, end_key) { + Ok(c) if !c.is_empty() => c, + _ => return None, + }; + let mut props = MvccProperties::new(); + for (_, v) in &*collection { + let mvcc = match decode_mvcc(v.user_collected_properties()) { + Ok(m) => m, + Err(_) => return None, + }; + // Filter out properties after safe_point. + if mvcc.min_ts > safe_point { + continue; + } + props.add(&mvcc); + } + Some(props) + } +} + +#[cfg(test)] +mod tests { + use api_version::RawValue; + use collections::HashMap; + use test::Bencher; + use txn_types::{Key, Write, WriteType}; + + use super::*; + + #[test] + fn test_mvcc_properties() { + let cases = [ + ("ab", 2, WriteType::Put, EntryType::kEntryPut), + ("ab", 1, WriteType::Delete, EntryType::kEntryPut), + ("ab", 1, WriteType::Delete, EntryType::kEntryDelete), + ("cd", 5, WriteType::Delete, EntryType::kEntryPut), + ("cd", 4, WriteType::Put, EntryType::kEntryPut), + ("cd", 3, WriteType::Put, EntryType::kEntryPut), + ("ef", 6, WriteType::Put, EntryType::kEntryPut), + ("ef", 6, WriteType::Put, EntryType::kEntryDelete), + ("gh", 7, WriteType::Delete, EntryType::kEntryPut), + ]; + let mut collector = + MvccPropertiesCollector::new(CStr::from_bytes_with_nul(b"\0").unwrap(), KeyMode::Txn); + for &(key, ts, write_type, entry_type) in &cases { + let ts = ts.into(); + let k = Key::from_raw(key.as_bytes()).append_ts(ts); + let k = keys::data_key(k.as_encoded()); + let v = Write::new(write_type, ts, None).as_ref().to_bytes(); + collector.add(&k, &v, entry_type, 0, 0).unwrap(); + } + let mut result = HashMap::default(); + collector.finish(&mut 
result); + + let props = decode_mvcc(&result).unwrap(); + assert_eq!(props.min_ts, 1.into()); + assert_eq!(props.max_ts, 7.into()); + assert_eq!(props.num_rows, 4); + assert_eq!(props.num_puts, 4); + assert_eq!(props.num_versions, 7); + assert_eq!(props.max_row_versions, 3); + } + + #[test] + fn test_mvcc_properties_rawkv_mode() { + let test_raws = vec![ + (b"r\0a", 1, false, u64::MAX), + (b"r\0a", 5, false, u64::MAX), + (b"r\0a", 7, false, u64::MAX), + (b"r\0b", 1, false, u64::MAX), + (b"r\0b", 1, true, u64::MAX), + (b"r\0c", 1, true, 10), + (b"r\0d", 1, true, 10), + ]; + + let mut collector = + MvccPropertiesCollector::new(CStr::from_bytes_with_nul(b"\0").unwrap(), KeyMode::Raw); + for &(key, ts, is_delete, expire_ts) in &test_raws { + let encode_key = ApiV2::encode_raw_key(key, Some(ts.into())); + let k = keys::data_key(encode_key.as_encoded()); + let v = ApiV2::encode_raw_value(RawValue { + user_value: &[0; 10][..], + expire_ts: Some(expire_ts), + is_delete, + }); + collector.add(&k, &v, EntryType::kEntryPut, 0, 0).unwrap(); + } + + let mut result = HashMap::default(); + collector.finish(&mut result); + + let props = decode_mvcc(&result).unwrap(); + assert_eq!(props.min_ts, 1.into()); + assert_eq!(props.max_ts, 7.into()); + assert_eq!(props.num_rows, 4); + assert_eq!(props.num_deletes, 3); + assert_eq!(props.num_puts, 4); + assert_eq!(props.num_versions, 7); + assert_eq!(props.max_row_versions, 3); + } + + #[bench] + fn bench_mvcc_properties(b: &mut Bencher) { + let ts = 1.into(); + let num_entries = 100; + let mut entries = Vec::new(); + for i in 0..num_entries { + let s = format!("{:032}", i); + let k = Key::from_raw(s.as_bytes()).append_ts(ts); + let k = keys::data_key(k.as_encoded()); + let w = Write::new(WriteType::Put, ts, Some(s.as_bytes().to_owned())); + entries.push((k, w.as_ref().to_bytes())); + } + + let mut collector = + MvccPropertiesCollector::new(CStr::from_bytes_with_nul(b"\0").unwrap(), KeyMode::Txn); + b.iter(|| { + for &(ref k, ref v) in 
&entries { + collector.add(k, v, EntryType::kEntryPut, 0, 0).unwrap(); + } + }); + } +} diff --git a/components/engine_tirocks/src/properties/range.rs b/components/engine_tirocks/src/properties/range.rs new file mode 100644 index 00000000000..59b9e68a6bb --- /dev/null +++ b/components/engine_tirocks/src/properties/range.rs @@ -0,0 +1,803 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ffi::CStr, io::Read, path::Path}; + +use codec::prelude::{NumberDecoder, NumberEncoder}; +use engine_traits::{MvccProperties, Range, Result, CF_DEFAULT, CF_LOCK, CF_WRITE, LARGE_CFS}; +use tikv_util::{box_err, box_try, debug, info}; +use tirocks::{ + properties::table::user::{ + Context, EntryType, SequenceNumber, TablePropertiesCollector, + TablePropertiesCollectorFactory, UserCollectedProperties, + }, + titan::TitanBlobIndex, +}; + +use super::{mvcc::decode_mvcc, DecodeProperties, EncodeProperties, PropIndexes}; +use crate::RocksEngine; + +const PROP_TOTAL_SIZE: &str = "tikv.total_size"; +const PROP_SIZE_INDEX: &str = "tikv.size_index"; +const PROP_RANGE_INDEX: &str = "tikv.range_index"; +pub const DEFAULT_PROP_SIZE_INDEX_DISTANCE: u64 = 4 * 1024 * 1024; +pub const DEFAULT_PROP_KEYS_INDEX_DISTANCE: u64 = 40 * 1024; + +// Deprecated. Only for compatible issue from v2.0 or older version. 
+#[derive(Debug, Default)] +pub struct SizeProperties { + pub total_size: u64, + pub prop_indexes: PropIndexes, +} + +impl SizeProperties { + fn decode(props: &impl DecodeProperties) -> codec::Result { + Ok(SizeProperties { + total_size: props.decode_u64(PROP_TOTAL_SIZE)?, + prop_indexes: props.decode_indexes(PROP_SIZE_INDEX)?, + }) + } +} + +#[derive(Debug, Default, Clone, Copy)] +pub struct RangeOffsets { + pub size: u64, + pub keys: u64, +} + +#[derive(Debug, Default)] +pub struct RangeProperties { + pub offsets: Vec<(Vec, RangeOffsets)>, +} + +impl RangeProperties { + pub fn get(&self, key: &[u8]) -> &RangeOffsets { + let idx = self + .offsets + .binary_search_by_key(&key, |&(ref k, _)| k) + .unwrap(); + &self.offsets[idx].1 + } + + fn encode(&self, props: &mut impl EncodeProperties) { + let mut buf = Vec::with_capacity(1024); + for (k, offsets) in &self.offsets { + buf.write_u64(k.len() as u64).unwrap(); + buf.extend(k); + buf.write_u64(offsets.size).unwrap(); + buf.write_u64(offsets.keys).unwrap(); + } + props.encode(PROP_RANGE_INDEX, &buf); + } + + pub(super) fn decode(props: &impl DecodeProperties) -> codec::Result { + match RangeProperties::decode_from_range_properties(props) { + Ok(res) => return Ok(res), + Err(e) => info!( + "decode to RangeProperties failed with err: {:?}, try to decode to SizeProperties, maybe upgrade from v2.0 or older version?", + e + ), + } + SizeProperties::decode(props).map(|res| res.into()) + } + + fn decode_from_range_properties( + props: &impl DecodeProperties, + ) -> codec::Result { + let mut res = RangeProperties::default(); + let mut buf = props.decode(PROP_RANGE_INDEX)?; + while !buf.is_empty() { + let klen = buf.read_u64()?; + let mut k = vec![0; klen as usize]; + buf.read_exact(&mut k)?; + let offsets = RangeOffsets { + size: buf.read_u64()?, + keys: buf.read_u64()?, + }; + res.offsets.push((k, offsets)); + } + Ok(res) + } + + pub fn get_approximate_size_in_range(&self, start: &[u8], end: &[u8]) -> u64 { + 
self.get_approximate_distance_in_range(start, end).0 + } + + pub fn get_approximate_keys_in_range(&self, start: &[u8], end: &[u8]) -> u64 { + self.get_approximate_distance_in_range(start, end).1 + } + + /// Returns `size` and `keys`. + pub fn get_approximate_distance_in_range(&self, start: &[u8], end: &[u8]) -> (u64, u64) { + assert!(start <= end); + if start == end { + return (0, 0); + } + let start_offset = match self.offsets.binary_search_by_key(&start, |&(ref k, _)| k) { + Ok(idx) => Some(idx), + Err(next_idx) => next_idx.checked_sub(1), + }; + let end_offset = match self.offsets.binary_search_by_key(&end, |&(ref k, _)| k) { + Ok(idx) => Some(idx), + Err(next_idx) => next_idx.checked_sub(1), + }; + let start = start_offset.map_or_else(|| Default::default(), |x| self.offsets[x].1); + let end = end_offset.map_or_else(|| Default::default(), |x| self.offsets[x].1); + assert!(end.size >= start.size && end.keys >= start.keys); + (end.size - start.size, end.keys - start.keys) + } + + // equivalent to range(Excluded(start_key), Excluded(end_key)) + pub fn take_excluded_range( + mut self, + start_key: &[u8], + end_key: &[u8], + ) -> Vec<(Vec, RangeOffsets)> { + let start_offset = match self + .offsets + .binary_search_by_key(&start_key, |&(ref k, _)| k) + { + Ok(idx) => { + if idx == self.offsets.len() - 1 { + return vec![]; + } else { + idx + 1 + } + } + Err(next_idx) => next_idx, + }; + + let end_offset = match self.offsets.binary_search_by_key(&end_key, |&(ref k, _)| k) { + Ok(idx) => { + if idx == 0 { + return vec![]; + } else { + idx - 1 + } + } + Err(next_idx) => { + if next_idx == 0 { + return vec![]; + } else { + next_idx - 1 + } + } + }; + + if start_offset > end_offset { + return vec![]; + } + + self.offsets.drain(start_offset..=end_offset).collect() + } + + pub fn smallest_key(&self) -> Option> { + self.offsets.first().map(|(k, _)| k.to_owned()) + } + + pub fn largest_key(&self) -> Option> { + self.offsets.last().map(|(k, _)| k.to_owned()) + } +} + +impl From 
for RangeProperties { + fn from(p: SizeProperties) -> RangeProperties { + let mut res = RangeProperties::default(); + for (key, size_index) in p.prop_indexes.into_map() { + let range = RangeOffsets { + // For SizeProperties, the offset is accumulation of the size. + size: size_index.offset, + ..Default::default() + }; + res.offsets.push((key, range)); + } + res + } +} + +fn range_properties_collector_name() -> &'static CStr { + CStr::from_bytes_with_nul(b"tikv.range-properties-collector\0").unwrap() +} + +pub struct RangePropertiesCollector { + props: RangeProperties, + last_offsets: RangeOffsets, + last_key: Vec, + cur_offsets: RangeOffsets, + prop_size_index_distance: u64, + prop_keys_index_distance: u64, +} + +impl Default for RangePropertiesCollector { + fn default() -> Self { + RangePropertiesCollector { + props: RangeProperties::default(), + last_offsets: RangeOffsets::default(), + last_key: vec![], + cur_offsets: RangeOffsets::default(), + prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, + prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, + } + } +} + +impl RangePropertiesCollector { + pub fn new(prop_size_index_distance: u64, prop_keys_index_distance: u64) -> Self { + RangePropertiesCollector { + prop_size_index_distance, + prop_keys_index_distance, + ..Default::default() + } + } + + #[inline] + fn size_in_last_range(&self) -> u64 { + self.cur_offsets.size - self.last_offsets.size + } + + #[inline] + fn keys_in_last_range(&self) -> u64 { + self.cur_offsets.keys - self.last_offsets.keys + } + + #[inline] + fn insert_new_point(&mut self, key: Vec) { + self.last_offsets = self.cur_offsets; + self.props.offsets.push((key, self.cur_offsets)); + } + + #[inline] + fn finish(&mut self, props: &mut impl EncodeProperties) { + if self.size_in_last_range() > 0 || self.keys_in_last_range() > 0 { + let key = self.last_key.clone(); + self.insert_new_point(key); + } + self.props.encode(props); + } +} + +impl TablePropertiesCollector for 
RangePropertiesCollector { + #[inline] + fn name(&self) -> &CStr { + range_properties_collector_name() + } + + #[inline] + fn add( + &mut self, + key: &[u8], + value: &[u8], + entry_type: EntryType, + _: SequenceNumber, + _: u64, + ) -> tirocks::Result<()> { + // size + let entry_size = match entry_type { + EntryType::kEntryPut => value.len() as u64, + EntryType::kEntryBlobIndex => match TitanBlobIndex::decode(value) { + Ok(index) => index.blob_size + value.len() as u64, + // Perhaps should panic? + Err(_) => return Ok(()), + }, + _ => return Ok(()), + }; + self.cur_offsets.size += entry_size + key.len() as u64; + // keys + self.cur_offsets.keys += 1; + // Add the start key for convenience. + if self.last_key.is_empty() + || self.size_in_last_range() >= self.prop_size_index_distance + || self.keys_in_last_range() >= self.prop_keys_index_distance + { + self.insert_new_point(key.to_owned()); + } + self.last_key.clear(); + self.last_key.extend_from_slice(key); + Ok(()) + } + + #[inline] + fn finish(&mut self, prop: &mut UserCollectedProperties) -> tirocks::Result<()> { + self.finish(prop); + Ok(()) + } +} + +pub struct RangePropertiesCollectorFactory { + pub prop_size_index_distance: u64, + pub prop_keys_index_distance: u64, +} + +impl Default for RangePropertiesCollectorFactory { + #[inline] + fn default() -> Self { + RangePropertiesCollectorFactory { + prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, + prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, + } + } +} + +impl TablePropertiesCollectorFactory for RangePropertiesCollectorFactory { + type Collector = RangePropertiesCollector; + + #[inline] + fn name(&self) -> &CStr { + range_properties_collector_name() + } + + #[inline] + fn create_table_properties_collector(&self, _: Context) -> RangePropertiesCollector { + RangePropertiesCollector::new(self.prop_size_index_distance, self.prop_keys_index_distance) + } +} + +fn get_range_entries_and_versions( + engine: &crate::RocksEngine, + cf: &str, + 
start: &[u8], + end: &[u8], +) -> Option<(u64, u64)> { + let collection = match engine.properties_of_tables_in_range(cf, &[(start, end)]) { + Ok(v) => v, + Err(_) => return None, + }; + + if collection.is_empty() { + return None; + } + + // Aggregate total MVCC properties and total number entries. + let mut props = MvccProperties::new(); + let mut num_entries = 0; + for (_, v) in &*collection { + let mvcc = match decode_mvcc(v.user_collected_properties()) { + Ok(v) => v, + Err(_) => return None, + }; + num_entries += v.num_entries(); + props.add(&mvcc); + } + + Some((num_entries, props.num_versions)) +} + +impl engine_traits::RangePropertiesExt for RocksEngine { + fn get_range_approximate_keys(&self, range: Range<'_>, large_threshold: u64) -> Result { + // try to get from RangeProperties first. + match self.get_range_approximate_keys_cf(CF_WRITE, range, large_threshold) { + Ok(v) => { + return Ok(v); + } + Err(e) => debug!( + "failed to get keys from RangeProperties"; + "err" => ?e, + ), + } + + let start = &range.start_key; + let end = &range.end_key; + let (_, keys) = + get_range_entries_and_versions(self, CF_WRITE, start, end).unwrap_or_default(); + Ok(keys) + } + + fn get_range_approximate_keys_cf( + &self, + cfname: &str, + range: Range<'_>, + large_threshold: u64, + ) -> Result { + let start_key = &range.start_key; + let end_key = &range.end_key; + let mut total_keys = 0; + let (mem_keys, _) = + self.approximate_memtable_stats(cfname, range.start_key, range.end_key)?; + total_keys += mem_keys; + + let collection = box_try!(self.range_properties(cfname, start_key, end_key)); + for (_, v) in &*collection { + let props = box_try!(RangeProperties::decode(v.user_collected_properties())); + total_keys += props.get_approximate_keys_in_range(start_key, end_key); + } + + if large_threshold != 0 && total_keys > large_threshold { + let ssts = collection + .into_iter() + .map(|(k, v)| { + let props = RangeProperties::decode(v.user_collected_properties()).unwrap(); + let 
keys = props.get_approximate_keys_in_range(start_key, end_key); + let p = std::str::from_utf8(k).unwrap(); + format!( + "{}:{}", + Path::new(p) + .file_name() + .map(|f| f.to_str().unwrap()) + .unwrap_or(p), + keys + ) + }) + .collect::>() + .join(", "); + info!( + "range contains too many keys"; + "start" => log_wrappers::Value::key(range.start_key), + "end" => log_wrappers::Value::key(range.end_key), + "total_keys" => total_keys, + "memtable" => mem_keys, + "ssts_keys" => ssts, + "cf" => cfname, + ) + } + Ok(total_keys) + } + + fn get_range_approximate_size(&self, range: Range<'_>, large_threshold: u64) -> Result { + let mut size = 0; + for cf in LARGE_CFS { + size += self + .get_range_approximate_size_cf(cf, range, large_threshold) + // CF_LOCK doesn't have RangeProperties until v4.0, so we swallow the error for + // backward compatibility. + .or_else(|e| if cf == &CF_LOCK { Ok(0) } else { Err(e) })?; + } + Ok(size) + } + + fn get_range_approximate_size_cf( + &self, + cf: &str, + range: Range<'_>, + large_threshold: u64, + ) -> Result { + let start_key = &range.start_key; + let end_key = &range.end_key; + let mut total_size = 0; + let (_, mem_size) = self.approximate_memtable_stats(cf, range.start_key, range.end_key)?; + total_size += mem_size; + + let collection = box_try!(self.range_properties(cf, start_key, end_key)); + for (_, v) in &*collection { + let props = box_try!(RangeProperties::decode(v.user_collected_properties())); + total_size += props.get_approximate_size_in_range(start_key, end_key); + } + + if large_threshold != 0 && total_size > large_threshold { + let ssts = collection + .into_iter() + .map(|(k, v)| { + let props = RangeProperties::decode(v.user_collected_properties()).unwrap(); + let size = props.get_approximate_size_in_range(start_key, end_key); + let p = std::str::from_utf8(k).unwrap(); + format!( + "{}:{}", + Path::new(p) + .file_name() + .map(|f| f.to_str().unwrap()) + .unwrap_or(p), + size + ) + }) + .collect::>() + .join(", "); + 
info!( + "range size is too large"; + "start" => log_wrappers::Value::key(range.start_key), + "end" => log_wrappers::Value::key(range.end_key), + "total_size" => total_size, + "memtable" => mem_size, + "ssts_size" => ssts, + "cf" => cf, + ) + } + Ok(total_size) + } + + fn get_range_approximate_split_keys( + &self, + range: Range<'_>, + key_count: usize, + ) -> Result>> { + let get_cf_size = |cf: &str| self.get_range_approximate_size_cf(cf, range, 0); + let cfs = [ + (CF_DEFAULT, box_try!(get_cf_size(CF_DEFAULT))), + (CF_WRITE, box_try!(get_cf_size(CF_WRITE))), + // CF_LOCK doesn't have RangeProperties until v4.0, so we swallow the error for + // backward compatibility. + (CF_LOCK, get_cf_size(CF_LOCK).unwrap_or(0)), + ]; + + let total_size: u64 = cfs.iter().map(|(_, s)| s).sum(); + if total_size == 0 { + return Err(box_err!("all CFs are empty")); + } + + let (cf, _) = cfs.iter().max_by_key(|(_, s)| s).unwrap(); + + self.get_range_approximate_split_keys_cf(cf, range, key_count) + } + + fn get_range_approximate_split_keys_cf( + &self, + cfname: &str, + range: Range<'_>, + key_count: usize, + ) -> Result>> { + let start_key = &range.start_key; + let end_key = &range.end_key; + let collection = box_try!(self.range_properties(cfname, start_key, end_key)); + + let mut keys = vec![]; + for (_, v) in &*collection { + let props = box_try!(RangeProperties::decode(v.user_collected_properties())); + keys.extend( + props + .take_excluded_range(start_key, end_key) + .into_iter() + .map(|(k, _)| k), + ); + } + + if keys.is_empty() { + return Ok(vec![]); + } + + const SAMPLING_THRESHOLD: usize = 20000; + const SAMPLE_RATIO: usize = 1000; + // If there are too many keys, reduce its amount before sorting, or it may take + // too much time to sort the keys. + if keys.len() > SAMPLING_THRESHOLD { + let len = keys.len(); + keys = keys.into_iter().step_by(len / SAMPLE_RATIO).collect(); + } + keys.sort(); + + // If the keys are too few, return them directly. 
+ if keys.len() <= key_count { + return Ok(keys); + } + + // Find `key_count` keys which divides the whole range into `parts` parts + // evenly. + let mut res = Vec::with_capacity(key_count); + let section_len = (keys.len() as f64) / ((key_count + 1) as f64); + for i in 1..=key_count { + res.push(keys[(section_len * (i as f64)) as usize].clone()) + } + res.dedup(); + Ok(res) + } +} + +#[cfg(test)] +mod tests { + use collections::HashMap; + use engine_traits::{SyncMutable, CF_WRITE, LARGE_CFS}; + use rand::Rng; + use tempfile::Builder; + use tirocks::properties::table::user::SysTablePropertiesCollectorFactory; + use txn_types::Key; + + use super::*; + use crate::{ + cf_options::RocksCfOptions, db_options::RocksDbOptions, + properties::mvcc::MvccPropertiesCollectorFactory, + }; + + #[allow(clippy::many_single_char_names)] + #[test] + fn test_range_properties() { + let cases = [ + ("a", 0, 1), + // handle "a": size(size = 1, offset = 1),keys(1,1) + ("b", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8, 1), + ("c", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 4, 1), + ("d", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2, 1), + ("e", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8, 1), + // handle "e": size(size = DISTANCE + 4, offset = DISTANCE + 5),keys(4,5) + ("f", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 4, 1), + ("g", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2, 1), + ("h", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8, 1), + ("i", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 4, 1), + // handle "i": size(size = DISTANCE / 8 * 9 + 4, offset = DISTANCE / 8 * 17 + + // 9),keys(4,5) + ("j", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2, 1), + ("k", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2, 1), + // handle "k": size(size = DISTANCE + 2, offset = DISTANCE / 8 * 25 + 11),keys(2,11) + ("l", 0, DEFAULT_PROP_KEYS_INDEX_DISTANCE / 2), + ("m", 0, DEFAULT_PROP_KEYS_INDEX_DISTANCE / 2), + // handle "m": keys = DEFAULT_PROP_KEYS_INDEX_DISTANCE,offset = + // 11+DEFAULT_PROP_KEYS_INDEX_DISTANCE + ("n", 1, DEFAULT_PROP_KEYS_INDEX_DISTANCE), + // handle "n": keys = 
DEFAULT_PROP_KEYS_INDEX_DISTANCE, offset = + // 11+2*DEFAULT_PROP_KEYS_INDEX_DISTANCE + ("o", 1, 1), + // handle "o": keys = 1, offset = 12 + 2*DEFAULT_PROP_KEYS_INDEX_DISTANCE + ]; + + let mut collector = RangePropertiesCollector::default(); + for &(k, vlen, count) in &cases { + let v = vec![0; vlen as usize]; + for _ in 0..count { + collector + .add(k.as_bytes(), &v, EntryType::kEntryPut, 0, 0) + .unwrap(); + } + } + for &(k, vlen, _) in &cases { + let v = vec![0; vlen as usize]; + collector + .add(k.as_bytes(), &v, EntryType::kEntryOther, 0, 0) + .unwrap(); + } + let mut result = HashMap::default(); + collector.finish(&mut result); + + let props = RangeProperties::decode(&result).unwrap(); + assert_eq!(props.smallest_key().unwrap(), cases[0].0.as_bytes()); + assert_eq!( + props.largest_key().unwrap(), + cases[cases.len() - 1].0.as_bytes() + ); + assert_eq!( + props.get_approximate_size_in_range(b"", b"k"), + DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8 * 25 + 11 + ); + assert_eq!(props.get_approximate_keys_in_range(b"", b"k"), 11_u64); + + assert_eq!(props.offsets.len(), 7); + let a = props.get(b"a"); + assert_eq!(a.size, 1); + let e = props.get(b"e"); + assert_eq!(e.size, DEFAULT_PROP_SIZE_INDEX_DISTANCE + 5); + let i = props.get(b"i"); + assert_eq!(i.size, DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8 * 17 + 9); + let k = props.get(b"k"); + assert_eq!(k.size, DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8 * 25 + 11); + let m = props.get(b"m"); + assert_eq!(m.keys, 11 + DEFAULT_PROP_KEYS_INDEX_DISTANCE); + let n = props.get(b"n"); + assert_eq!(n.keys, 11 + 2 * DEFAULT_PROP_KEYS_INDEX_DISTANCE); + let o = props.get(b"o"); + assert_eq!(o.keys, 12 + 2 * DEFAULT_PROP_KEYS_INDEX_DISTANCE); + let empty = RangeOffsets::default(); + let cases = [ + (" ", "k", k, &empty, 3), + (" ", " ", &empty, &empty, 0), + ("k", "k", k, k, 0), + ("a", "k", k, a, 2), + ("a", "i", i, a, 1), + ("e", "h", e, e, 0), + ("b", "h", e, a, 1), + ("g", "g", i, i, 0), + ]; + for &(start, end, end_idx, start_idx, count) in 
&cases { + let props = RangeProperties::decode(&result).unwrap(); + let size = end_idx.size - start_idx.size; + assert_eq!( + props.get_approximate_size_in_range(start.as_bytes(), end.as_bytes()), + size + ); + let keys = end_idx.keys - start_idx.keys; + assert_eq!( + props.get_approximate_keys_in_range(start.as_bytes(), end.as_bytes()), + keys + ); + assert_eq!( + props + .take_excluded_range(start.as_bytes(), end.as_bytes()) + .len(), + count + ); + } + } + + #[test] + fn test_range_properties_with_blob_index() { + let cases = [ + ("a", 0), + // handle "a": size(size = 1, offset = 1),keys(1,1) + ("b", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8), + ("c", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 4), + ("d", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2), + ("e", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8), + // handle "e": size(size = DISTANCE + 4, offset = DISTANCE + 5),keys(4,5) + ("f", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 4), + ("g", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2), + ("h", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8), + ("i", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 4), + // handle "i": size(size = DISTANCE / 8 * 9 + 4, offset = DISTANCE / 8 * 17 + + // 9),keys(4,5) + ("j", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2), + ("k", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2), + // handle "k": size(size = DISTANCE + 2, offset = DISTANCE / 8 * 25 + 11),keys(2,11) + ]; + + let handles = ["a", "e", "i", "k"]; + + let mut rng = rand::thread_rng(); + let mut collector = RangePropertiesCollector::default(); + let mut extra_value_size: u64 = 0; + for &(k, vlen) in &cases { + if handles.contains(&k) || rng.gen_range(0..2) == 0 { + let v = vec![0; vlen as usize - extra_value_size as usize]; + extra_value_size = 0; + collector + .add(k.as_bytes(), &v, EntryType::kEntryPut, 0, 0) + .unwrap(); + } else { + let blob_index = TitanBlobIndex::new(0, vlen - extra_value_size, 0); + let v = blob_index.encode(); + extra_value_size = v.len() as u64; + collector + .add(k.as_bytes(), &v, EntryType::kEntryBlobIndex, 0, 0) + .unwrap(); + } 
+ } + let mut result = HashMap::default(); + collector.finish(&mut result); + + let props = RangeProperties::decode(&result).unwrap(); + assert_eq!(props.smallest_key().unwrap(), cases[0].0.as_bytes()); + assert_eq!( + props.largest_key().unwrap(), + cases[cases.len() - 1].0.as_bytes() + ); + assert_eq!( + props.get_approximate_size_in_range(b"e", b"i"), + DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8 * 9 + 4 + ); + assert_eq!( + props.get_approximate_size_in_range(b"", b"k"), + DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8 * 25 + 11 + ); + } + + #[test] + fn test_get_range_entries_and_versions() { + let path = Builder::new() + .prefix("_test_get_range_entries_and_versions") + .tempdir() + .unwrap(); + let db_opts = RocksDbOptions::default(); + let cfs_opts = LARGE_CFS + .iter() + .map(|cf| { + let mut cf_opts = RocksCfOptions::default(); + cf_opts + .set_level0_file_num_compaction_trigger(10) + .add_table_properties_collector_factory( + &SysTablePropertiesCollectorFactory::new( + MvccPropertiesCollectorFactory::default(), + ), + ); + (*cf, cf_opts) + }) + .collect(); + let db = crate::util::new_engine_opt(path.path(), db_opts, cfs_opts).unwrap(); + + let cases = ["a", "b", "c"]; + for &key in &cases { + let k1 = keys::data_key( + Key::from_raw(key.as_bytes()) + .append_ts(2.into()) + .as_encoded(), + ); + db.put_cf(CF_WRITE, &k1, b"v1").unwrap(); + db.delete_cf(CF_WRITE, &k1).unwrap(); + let key = keys::data_key( + Key::from_raw(key.as_bytes()) + .append_ts(3.into()) + .as_encoded(), + ); + db.put_cf(CF_WRITE, &key, b"v2").unwrap(); + db.flush(CF_WRITE, true).unwrap(); + } + + let start_keys = keys::data_key(&[]); + let end_keys = keys::data_end_key(&[]); + let (entries, versions) = + get_range_entries_and_versions(&db, CF_WRITE, &start_keys, &end_keys).unwrap(); + assert_eq!(entries, (cases.len() * 2) as u64); + assert_eq!(versions, cases.len() as u64); + } +} diff --git a/components/engine_tirocks/src/properties/table.rs b/components/engine_tirocks/src/properties/table.rs new 
file mode 100644 index 00000000000..84998bbeb88 --- /dev/null +++ b/components/engine_tirocks/src/properties/table.rs @@ -0,0 +1,96 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::mem; + +use engine_traits::{Range, Result}; +use tirocks::properties::table::{ + builtin::OwnedTablePropertiesCollection, user::UserCollectedProperties, +}; + +use super::range::RangeProperties; +use crate::{r2e, RocksEngine}; + +#[repr(transparent)] +pub struct RocksUserCollectedProperties(UserCollectedProperties); + +impl RocksUserCollectedProperties { + #[inline] + fn from_rocks(v: &UserCollectedProperties) -> &Self { + unsafe { mem::transmute(v) } + } +} + +impl engine_traits::UserCollectedProperties for RocksUserCollectedProperties { + #[inline] + fn get(&self, index: &[u8]) -> Option<&[u8]> { + self.0.get(index) + } + + #[inline] + fn approximate_size_and_keys(&self, start: &[u8], end: &[u8]) -> Option<(usize, usize)> { + let rp = RangeProperties::decode(&self.0).ok()?; + let x = rp.get_approximate_distance_in_range(start, end); + Some((x.0 as usize, x.1 as usize)) + } +} + +#[repr(transparent)] +pub struct RocksTablePropertiesCollection(OwnedTablePropertiesCollection); + +impl engine_traits::TablePropertiesCollection for RocksTablePropertiesCollection { + type UserCollectedProperties = RocksUserCollectedProperties; + + #[inline] + fn iter_user_collected_properties(&self, mut f: F) + where + F: FnMut(&Self::UserCollectedProperties) -> bool, + { + for (_, props) in &*self.0 { + let props = props.user_collected_properties(); + if !f(RocksUserCollectedProperties::from_rocks(props)) { + break; + } + } + } +} + +impl engine_traits::TablePropertiesExt for RocksEngine { + type TablePropertiesCollection = RocksTablePropertiesCollection; + + fn table_properties_collection( + &self, + cf: &str, + ranges: &[Range<'_>], + ) -> Result { + // FIXME: extra allocation + let ranges: Vec<_> = ranges.iter().map(|r| (r.start_key, r.end_key)).collect(); + let 
collection = self.properties_of_tables_in_range(cf, &ranges)?; + Ok(RocksTablePropertiesCollection(collection)) + } +} + +impl RocksEngine { + #[inline] + pub(crate) fn properties_of_tables_in_range( + &self, + cf: &str, + ranges: &[(&[u8], &[u8])], + ) -> Result { + let handle = self.cf(cf)?; + let mut c = OwnedTablePropertiesCollection::default(); + self.as_inner() + .properties_of_tables_in_range(handle, ranges, &mut c) + .map_err(r2e)?; + Ok(c) + } + + #[inline] + pub fn range_properties( + &self, + cf: &str, + start_key: &[u8], + end_key: &[u8], + ) -> Result { + self.properties_of_tables_in_range(cf, &[(start_key, end_key)]) + } +} diff --git a/components/engine_tirocks/src/properties/ttl.rs b/components/engine_tirocks/src/properties/ttl.rs new file mode 100644 index 00000000000..c4190fe59bd --- /dev/null +++ b/components/engine_tirocks/src/properties/ttl.rs @@ -0,0 +1,225 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ffi::CStr, marker::PhantomData}; + +use api_version::{KeyMode, KvFormat, RawValue}; +use engine_traits::{Result, TtlProperties, TtlPropertiesExt}; +use tikv_util::error; +use tirocks::properties::table::user::{ + Context, EntryType, SequenceNumber, TablePropertiesCollector, TablePropertiesCollectorFactory, + UserCollectedProperties, +}; + +use super::{DecodeProperties, EncodeProperties}; +use crate::RocksEngine; + +const PROP_MAX_EXPIRE_TS: &str = "tikv.max_expire_ts"; +const PROP_MIN_EXPIRE_TS: &str = "tikv.min_expire_ts"; + +fn encode_ttl(ttl_props: &TtlProperties, props: &mut impl EncodeProperties) { + props.encode_u64(PROP_MAX_EXPIRE_TS, ttl_props.max_expire_ts); + props.encode_u64(PROP_MIN_EXPIRE_TS, ttl_props.min_expire_ts); +} + +pub(super) fn decode_ttl(props: &impl DecodeProperties) -> codec::Result { + let res = TtlProperties { + max_expire_ts: props.decode_u64(PROP_MAX_EXPIRE_TS)?, + min_expire_ts: props.decode_u64(PROP_MIN_EXPIRE_TS)?, + }; + Ok(res) +} + +impl TtlPropertiesExt for RocksEngine { 
+ fn get_range_ttl_properties_cf( + &self, + cf: &str, + start_key: &[u8], + end_key: &[u8], + ) -> Result> { + let collection = self.properties_of_tables_in_range(cf, &[(start_key, end_key)])?; + if collection.is_empty() { + return Ok(vec![]); + } + + let mut res = Vec::new(); + for (file_name, v) in &*collection { + let prop = match decode_ttl(v.user_collected_properties()) { + Ok(v) => v, + Err(_) => continue, + }; + res.push((std::str::from_utf8(file_name).unwrap().to_string(), prop)); + } + Ok(res) + } +} + +/// Can only be used for default CF. +pub struct TtlPropertiesCollector { + prop: TtlProperties, + _phantom: PhantomData, +} + +impl TtlPropertiesCollector { + fn finish(&mut self, properties: &mut impl EncodeProperties) { + if self.prop.max_expire_ts == 0 && self.prop.min_expire_ts == 0 { + return; + } + encode_ttl(&self.prop, properties); + } +} + +impl TablePropertiesCollector for TtlPropertiesCollector { + fn name(&self) -> &CStr { + ttl_properties_collector_name() + } + + fn add( + &mut self, + key: &[u8], + value: &[u8], + entry_type: EntryType, + _: SequenceNumber, + _: u64, + ) -> tirocks::Result<()> { + if entry_type != EntryType::kEntryPut { + return Ok(()); + } + // Only consider data keys. + if !key.starts_with(keys::DATA_PREFIX_KEY) { + return Ok(()); + } + // Only consider raw keys. + if F::parse_key_mode(&key[keys::DATA_PREFIX_KEY.len()..]) != KeyMode::Raw { + return Ok(()); + } + + match F::decode_raw_value(value) { + Ok(RawValue { + expire_ts: Some(expire_ts), + .. 
+ }) => { + self.prop.max_expire_ts = std::cmp::max(self.prop.max_expire_ts, expire_ts); + if self.prop.min_expire_ts == 0 { + self.prop.min_expire_ts = expire_ts; + } else { + self.prop.min_expire_ts = std::cmp::min(self.prop.min_expire_ts, expire_ts); + } + } + Err(err) => { + error!( + "failed to get expire ts"; + "key" => log_wrappers::Value::key(key), + "value" => log_wrappers::Value::value(value), + "err" => %err, + ); + } + _ => {} + } + Ok(()) + } + + fn finish(&mut self, properties: &mut UserCollectedProperties) -> tirocks::Result<()> { + self.finish(properties); + Ok(()) + } +} + +fn ttl_properties_collector_name() -> &'static CStr { + CStr::from_bytes_with_nul(b"tikv.ttl-properties-collector\0").unwrap() +} + +#[derive(Default)] +pub struct TtlPropertiesCollectorFactory { + _phantom: PhantomData, +} + +impl TablePropertiesCollectorFactory for TtlPropertiesCollectorFactory { + type Collector = TtlPropertiesCollector; + + fn name(&self) -> &CStr { + ttl_properties_collector_name() + } + + fn create_table_properties_collector(&self, _: Context) -> TtlPropertiesCollector { + TtlPropertiesCollector { + prop: Default::default(), + _phantom: PhantomData, + } + } +} + +#[cfg(test)] +mod tests { + use api_version::test_kv_format_impl; + use collections::HashMap; + use kvproto::kvrpcpb::ApiVersion; + use tikv_util::time::UnixSecs; + + use super::*; + + #[test] + fn test_ttl_properties() { + test_kv_format_impl!(test_ttl_properties_impl); + } + + fn test_ttl_properties_impl() { + let get_properties = |case: &[(&'static str, u64)]| -> codec::Result { + let mut collector = TtlPropertiesCollector:: { + prop: Default::default(), + _phantom: PhantomData, + }; + for &(k, ts) in case { + let v = RawValue { + user_value: &[0; 10][..], + expire_ts: Some(ts), + is_delete: false, + }; + collector + .add( + k.as_bytes(), + &F::encode_raw_value(v), + EntryType::kEntryPut, + 0, + 0, + ) + .unwrap(); + } + for &(k, _) in case { + let v = vec![0; 10]; + collector + 
.add(k.as_bytes(), &v, EntryType::kEntryOther, 0, 0) + .unwrap(); + } + let mut result = HashMap::default(); + collector.finish(&mut result); + decode_ttl(&result) + }; + + let case1 = [ + ("zr\0a", 0), + ("zr\0b", UnixSecs::now().into_inner()), + ("zr\0c", 1), + ("zr\0d", u64::MAX), + ("zr\0e", 0), + ]; + let props = get_properties(&case1).unwrap(); + assert_eq!(props.max_expire_ts, u64::MAX); + match F::TAG { + ApiVersion::V1 => unreachable!(), + ApiVersion::V1ttl => assert_eq!(props.min_expire_ts, 1), + // expire_ts = 0 is no longer a special case in API V2 + ApiVersion::V2 => assert_eq!(props.min_expire_ts, 0), + } + + let case2 = [("zr\0a", 0)]; + get_properties(&case2).unwrap_err(); + + let case3 = []; + get_properties(&case3).unwrap_err(); + + let case4 = [("zr\0a", 1)]; + let props = get_properties(&case4).unwrap(); + assert_eq!(props.max_expire_ts, 1); + assert_eq!(props.min_expire_ts, 1); + } +} diff --git a/components/engine_tirocks/src/snapshot.rs b/components/engine_tirocks/src/snapshot.rs new file mode 100644 index 00000000000..2eef78fc0e5 --- /dev/null +++ b/components/engine_tirocks/src/snapshot.rs @@ -0,0 +1,84 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + fmt::{self, Debug, Formatter}, + sync::Arc, +}; + +use engine_traits::Result; +use tirocks::{db::RawCfHandle, option::ReadOptions, Db, Iterator, Snapshot}; + +use crate::{db_vector::RocksPinSlice, engine_iterator, r2e, util, RocksSnapIterator}; + +pub struct RocksSnapshot(Arc>>); + +impl RocksSnapshot { + #[inline] + pub(crate) fn new(db: Arc) -> Self { + Self(Arc::new(Snapshot::new(db))) + } + + #[inline] + fn get( + &self, + opts: &engine_traits::ReadOptions, + handle: &RawCfHandle, + key: &[u8], + ) -> Result> { + let mut opt = ReadOptions::default(); + opt.set_fill_cache(opts.fill_cache()); + // TODO: reuse slice. 
+ let mut slice = RocksPinSlice::default(); + match self.0.get_pinned(&mut opt, handle, key, &mut slice.0) { + Ok(true) => Ok(Some(slice)), + Ok(false) => Ok(None), + Err(s) => Err(r2e(s)), + } + } +} + +impl engine_traits::Snapshot for RocksSnapshot {} + +impl Debug for RocksSnapshot { + fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result { + write!(fmt, "tirocks Snapshot Impl") + } +} + +impl engine_traits::Iterable for RocksSnapshot { + type Iterator = RocksSnapIterator; + + fn iterator_opt(&self, cf: &str, opts: engine_traits::IterOptions) -> Result { + let opt = engine_iterator::to_tirocks_opt(opts); + let handle = util::cf_handle(self.0.db(), cf)?; + Ok(RocksSnapIterator::from_raw(Iterator::new( + self.0.clone(), + opt, + handle, + ))) + } +} + +impl engine_traits::Peekable for RocksSnapshot { + type DbVector = RocksPinSlice; + + #[inline] + fn get_value_opt( + &self, + opts: &engine_traits::ReadOptions, + key: &[u8], + ) -> Result> { + self.get(opts, self.0.db().default_cf(), key) + } + + #[inline] + fn get_value_cf_opt( + &self, + opts: &engine_traits::ReadOptions, + cf: &str, + key: &[u8], + ) -> Result> { + let handle = util::cf_handle(self.0.db(), cf)?; + self.get(opts, handle, key) + } +} diff --git a/components/engine_tirocks/src/status.rs b/components/engine_tirocks/src/status.rs new file mode 100644 index 00000000000..13ae730562f --- /dev/null +++ b/components/engine_tirocks/src/status.rs @@ -0,0 +1,123 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +pub fn to_engine_trait_status(s: tirocks::Status) -> engine_traits::Status { + let code = match s.code() { + tirocks::Code::kOk => engine_traits::Code::Ok, + tirocks::Code::kNotFound => engine_traits::Code::NotFound, + tirocks::Code::kCorruption => engine_traits::Code::Corruption, + tirocks::Code::kNotSupported => engine_traits::Code::NotSupported, + tirocks::Code::kInvalidArgument => engine_traits::Code::InvalidArgument, + tirocks::Code::kIOError => engine_traits::Code::IoError, + tirocks::Code::kMergeInProgress => engine_traits::Code::MergeInProgress, + tirocks::Code::kIncomplete => engine_traits::Code::Incomplete, + tirocks::Code::kShutdownInProgress => engine_traits::Code::ShutdownInProgress, + tirocks::Code::kTimedOut => engine_traits::Code::TimedOut, + tirocks::Code::kAborted => engine_traits::Code::Aborted, + tirocks::Code::kBusy => engine_traits::Code::Busy, + tirocks::Code::kExpired => engine_traits::Code::Expired, + tirocks::Code::kTryAgain => engine_traits::Code::TryAgain, + tirocks::Code::kCompactionTooLarge => engine_traits::Code::CompactionTooLarge, + tirocks::Code::kColumnFamilyDropped => engine_traits::Code::ColumnFamilyDropped, + tirocks::Code::kMaxCode => unreachable!(), + }; + let sev = match s.severity() { + tirocks::Severity::kNoError => engine_traits::Severity::NoError, + tirocks::Severity::kSoftError => engine_traits::Severity::SoftError, + tirocks::Severity::kHardError => engine_traits::Severity::HardError, + tirocks::Severity::kFatalError => engine_traits::Severity::FatalError, + tirocks::Severity::kUnrecoverableError => engine_traits::Severity::UnrecoverableError, + tirocks::Severity::kMaxSeverity => unreachable!(), + }; + let sub_code = match s.sub_code() { + tirocks::SubCode::kNone => engine_traits::SubCode::None, + tirocks::SubCode::kMutexTimeout => engine_traits::SubCode::MutexTimeout, + tirocks::SubCode::kLockTimeout => engine_traits::SubCode::LockTimeout, + tirocks::SubCode::kLockLimit => engine_traits::SubCode::LockLimit, +
tirocks::SubCode::kNoSpace => engine_traits::SubCode::NoSpace, + tirocks::SubCode::kDeadlock => engine_traits::SubCode::Deadlock, + tirocks::SubCode::kStaleFile => engine_traits::SubCode::StaleFile, + tirocks::SubCode::kMemoryLimit => engine_traits::SubCode::MemoryLimit, + tirocks::SubCode::kSpaceLimit => engine_traits::SubCode::SpaceLimit, + tirocks::SubCode::kPathNotFound => engine_traits::SubCode::PathNotFound, + tirocks::SubCode::KMergeOperandsInsufficientCapacity => { + engine_traits::SubCode::MergeOperandsInsufficientCapacity + } + tirocks::SubCode::kManualCompactionPaused => engine_traits::SubCode::ManualCompactionPaused, + tirocks::SubCode::kOverwritten => engine_traits::SubCode::Overwritten, + tirocks::SubCode::kTxnNotPrepared => engine_traits::SubCode::TxnNotPrepared, + tirocks::SubCode::kIOFenced => engine_traits::SubCode::IoFenced, + tirocks::SubCode::kMaxSubCode => unreachable!(), + }; + let mut es = match s.state().map(|s| String::from_utf8_lossy(s).into_owned()) { + Some(msg) => engine_traits::Status::with_error(code, msg), + None => engine_traits::Status::with_code(code), + }; + es.set_severity(sev).set_sub_code(sub_code); + es +} + +/// A function that will transform a rocksdb error to engine trait error. +/// +/// r stands for rocksdb, e stands for engine_trait. +pub fn r2e(s: tirocks::Status) -> engine_traits::Error { + engine_traits::Error::Engine(to_engine_trait_status(s)) +} + +/// A function that will transform an engine trait error to rocksdb error. +/// +/// r stands for rocksdb, e stands for engine_trait. +pub fn e2r(s: engine_traits::Error) -> tirocks::Status { + let s = match s { + engine_traits::Error::Engine(s) => s, + // Any better options than IOError?
+ _ => return tirocks::Status::with_error(tirocks::Code::kIOError, format!("{}", s)), + }; + let code = match s.code() { + engine_traits::Code::Ok => tirocks::Code::kOk, + engine_traits::Code::NotFound => tirocks::Code::kNotFound, + engine_traits::Code::Corruption => tirocks::Code::kCorruption, + engine_traits::Code::NotSupported => tirocks::Code::kNotSupported, + engine_traits::Code::InvalidArgument => tirocks::Code::kInvalidArgument, + engine_traits::Code::IoError => tirocks::Code::kIOError, + engine_traits::Code::MergeInProgress => tirocks::Code::kMergeInProgress, + engine_traits::Code::Incomplete => tirocks::Code::kIncomplete, + engine_traits::Code::ShutdownInProgress => tirocks::Code::kShutdownInProgress, + engine_traits::Code::TimedOut => tirocks::Code::kTimedOut, + engine_traits::Code::Aborted => tirocks::Code::kAborted, + engine_traits::Code::Busy => tirocks::Code::kBusy, + engine_traits::Code::Expired => tirocks::Code::kExpired, + engine_traits::Code::TryAgain => tirocks::Code::kTryAgain, + engine_traits::Code::CompactionTooLarge => tirocks::Code::kCompactionTooLarge, + engine_traits::Code::ColumnFamilyDropped => tirocks::Code::kColumnFamilyDropped, + }; + let sev = match s.severity() { + engine_traits::Severity::NoError => tirocks::Severity::kNoError, + engine_traits::Severity::SoftError => tirocks::Severity::kSoftError, + engine_traits::Severity::HardError => tirocks::Severity::kHardError, + engine_traits::Severity::FatalError => tirocks::Severity::kFatalError, + engine_traits::Severity::UnrecoverableError => tirocks::Severity::kUnrecoverableError, + }; + let sub_code = match s.sub_code() { + engine_traits::SubCode::None => tirocks::SubCode::kNone, + engine_traits::SubCode::MutexTimeout => tirocks::SubCode::kMutexTimeout, + engine_traits::SubCode::LockTimeout => tirocks::SubCode::kLockTimeout, + engine_traits::SubCode::LockLimit => tirocks::SubCode::kLockLimit, + engine_traits::SubCode::NoSpace => tirocks::SubCode::kNoSpace, +
engine_traits::SubCode::Deadlock => tirocks::SubCode::kDeadlock, + engine_traits::SubCode::StaleFile => tirocks::SubCode::kStaleFile, + engine_traits::SubCode::MemoryLimit => tirocks::SubCode::kMemoryLimit, + engine_traits::SubCode::SpaceLimit => tirocks::SubCode::kSpaceLimit, + engine_traits::SubCode::PathNotFound => tirocks::SubCode::kPathNotFound, + engine_traits::SubCode::MergeOperandsInsufficientCapacity => { + tirocks::SubCode::KMergeOperandsInsufficientCapacity + } + engine_traits::SubCode::ManualCompactionPaused => tirocks::SubCode::kManualCompactionPaused, + engine_traits::SubCode::Overwritten => tirocks::SubCode::kOverwritten, + engine_traits::SubCode::TxnNotPrepared => tirocks::SubCode::kTxnNotPrepared, + engine_traits::SubCode::IoFenced => tirocks::SubCode::kIOFenced, + }; + let mut ts = tirocks::Status::with_error(code, s.state()); + ts.set_severity(sev); + ts.set_sub_code(sub_code); + ts +} diff --git a/components/engine_tirocks/src/util.rs b/components/engine_tirocks/src/util.rs new file mode 100644 index 00000000000..54a6139cb35 --- /dev/null +++ b/components/engine_tirocks/src/util.rs @@ -0,0 +1,406 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ffi::CStr, path::Path, sync::Arc}; + +use engine_traits::{Result, CF_DEFAULT}; +use slog_global::warn; +use tirocks::{ + db::{MultiCfBuilder, MultiCfTitanBuilder, RawCfHandle}, + env::Env, + option::RawCfOptions, + perf_context::PerfLevel, + slice_transform::SliceTransform, + CfOptions, Db, OpenOptions, Statistics, +}; + +use crate::{cf_options::RocksCfOptions, db_options::RocksDbOptions, r2e, RocksEngine}; + +/// Returns a Vec of cf which is in `a` but not in `b`. +fn cfs_diff<'a>(a: &[&'a str], b: &[&str]) -> Vec<&'a str> { + a.iter() + .filter(|x| !b.iter().any(|y| *x == y)) + .cloned() + .collect() +} + +/// Turns "dynamic level size" off for the existing column family which was off +/// before. Column families are small, HashMap isn't necessary.
+fn adjust_dynamic_level_bytes( + cf_descs: &[(String, CfOptions)], + name: &str, + opt: &mut RawCfOptions, +) { + if let Some((_, exist_opt)) = cf_descs.iter().find(|(n, _)| n == name) { + let existed_dynamic_level_bytes = exist_opt.level_compaction_dynamic_level_bytes(); + if existed_dynamic_level_bytes != opt.level_compaction_dynamic_level_bytes() { + warn!( + "change dynamic_level_bytes for existing column family is danger"; + "old_value" => existed_dynamic_level_bytes, + "new_value" => opt.level_compaction_dynamic_level_bytes(), + ); + } + opt.set_level_compaction_dynamic_level_bytes(existed_dynamic_level_bytes); + } +} + +fn new_sanitized( + path: &Path, + db_opt: RocksDbOptions, + cf_opts: Vec<(&str, RocksCfOptions)>, +) -> Result { + if !db_opt.is_titan() { + let mut builder = MultiCfBuilder::new(db_opt.into_rocks()); + for (name, opt) in cf_opts { + builder.add_cf(name, opt.into_rocks()); + } + builder.open(path.as_ref()).map_err(r2e) + } else { + let mut builder = MultiCfTitanBuilder::new(db_opt.into_titan()); + for (name, opt) in cf_opts { + builder.add_cf(name, opt.into_titan()); + } + builder.open(path.as_ref()).map_err(r2e) + } +} + +pub fn new_engine(path: &Path, cfs: &[&str]) -> Result { + let mut db_opts = RocksDbOptions::default(); + db_opts.set_statistics(&Statistics::default()); + let cf_opts = cfs.iter().map(|name| (*name, Default::default())).collect(); + new_engine_opt(path, db_opts, cf_opts) +} + +pub fn new_engine_opt( + path: &Path, + mut db_opt: RocksDbOptions, + cf_opts: Vec<(&str, RocksCfOptions)>, +) -> Result { + let is_titan = db_opt.is_titan(); + for (_, opt) in &cf_opts { + // It's possible to convert non-titan to titan. But in our usage, they can't + // be mixed used. So assert to detect bugs.
+ assert_eq!(is_titan, opt.is_titan(), "Must pass the same option type"); + } + if cf_opts.iter().all(|(name, _)| *name != CF_DEFAULT) { + return Err(engine_traits::Error::Engine( + engine_traits::Status::with_error( + engine_traits::Code::InvalidArgument, + "default cf must be specified", + ), + )); + } + if !RocksEngine::exists(path).unwrap_or(false) { + db_opt.set_create_if_missing(true); + db_opt.set_create_missing_column_families(true); + let db = new_sanitized(path, db_opt, cf_opts)?; + return Ok(RocksEngine::new(Arc::new(db))); + } + + db_opt.set_create_if_missing(false); + + // Lists all column families in current db. + let cfs_list = Db::list_cfs(&db_opt, path).map_err(r2e)?; + let existed: Vec<_> = cfs_list.iter().map(|v| v.as_str()).collect(); + let needed: Vec<_> = cf_opts.iter().map(|(name, _)| *name).collect(); + + let cf_descs = if !existed.is_empty() { + let res = if let Some(env) = db_opt.env() { + Db::load_latest_options(path, env, true) + } else { + Db::load_latest_options(path, &Env::default(), true) + }; + res.unwrap_or_else(|e| panic!("failed to load_latest_options {:?}", e)) + .1 + } else { + vec![] + }; + + // Lifetime hack. We need to make `&str` have smaller scope. It will be + // optimized away in release mode. + let mut cf_opts: Vec<_> = cf_opts.into_iter().collect(); + for cf in &existed { + if cf_opts.iter().all(|(name, _)| name != cf) { + if !is_titan { + cf_opts.push((cf, RocksCfOptions::default())); + } else { + cf_opts.push((cf, RocksCfOptions::default_titan())) + } + } + } + for (name, opt) in &mut cf_opts { + adjust_dynamic_level_bytes(&cf_descs, name, opt); + } + + // We have added all missing options by iterating `existed`. If two vecs still + // have same length, then they must have same column families despite their + // orders. So just open db.
+ if needed.len() == existed.len() && needed.len() == cf_opts.len() { + let db = new_sanitized(path, db_opt, cf_opts)?; + return Ok(RocksEngine::new(Arc::new(db))); + } + + // Opens db. + db_opt.set_create_missing_column_families(true); + let mut db = new_sanitized(path, db_opt, cf_opts)?; + + // Drops discarded column families. + for cf in cfs_diff(&existed, &needed) { + // We have checked it at the very beginning, so it must be needed. + assert_ne!(cf, CF_DEFAULT); + db.destroy_cf(cf).map_err(r2e)?; + } + + Ok(RocksEngine::new(Arc::new(db))) +} + +/// A slice transform that removes fixed length suffix from key. +pub struct FixedSuffixSliceTransform { + name: &'static CStr, + suffix_len: usize, +} + +impl FixedSuffixSliceTransform { + pub fn new(name: &'static CStr, suffix_len: usize) -> FixedSuffixSliceTransform { + FixedSuffixSliceTransform { name, suffix_len } + } +} + +impl SliceTransform for FixedSuffixSliceTransform { + #[inline] + fn name(&self) -> &CStr { + self.name + } + + #[inline] + fn transform<'a>(&self, key: &'a [u8]) -> &'a [u8] { + let mid = key.len() - self.suffix_len; + &key[..mid] + } + + #[inline] + fn in_domain(&self, key: &[u8]) -> bool { + key.len() >= self.suffix_len + } +} + +/// A slice transform that keeps fixed length prefix from key. +pub struct FixedPrefixSliceTransform { + name: &'static CStr, + prefix_len: usize, +} + +impl FixedPrefixSliceTransform { + pub fn new(name: &'static CStr, prefix_len: usize) -> FixedPrefixSliceTransform { + FixedPrefixSliceTransform { name, prefix_len } + } +} + +impl SliceTransform for FixedPrefixSliceTransform { + #[inline] + fn name(&self) -> &CStr { + self.name + } + + #[inline] + fn transform<'a>(&self, key: &'a [u8]) -> &'a [u8] { + &key[..self.prefix_len] + } + + #[inline] + fn in_domain(&self, key: &[u8]) -> bool { + key.len() >= self.prefix_len + } +} + +/// A slice transform that always returns identical key.
+pub struct NoopSliceTransform { + name: &'static CStr, +} + +impl Default for NoopSliceTransform { + fn default() -> Self { + Self { + name: CStr::from_bytes_with_nul(b"NoopSliceTransform\0").unwrap(), + } + } +} + +impl SliceTransform for NoopSliceTransform { + #[inline] + fn name(&self) -> &CStr { + self.name + } + + #[inline] + fn transform<'a>(&self, key: &'a [u8]) -> &'a [u8] { + key + } + + #[inline] + fn in_domain(&self, _key: &[u8]) -> bool { + true + } +} + +pub fn to_rocks_perf_level(level: engine_traits::PerfLevel) -> PerfLevel { + match level { + engine_traits::PerfLevel::Uninitialized => PerfLevel::kUninitialized, + engine_traits::PerfLevel::Disable => PerfLevel::kDisable, + engine_traits::PerfLevel::EnableCount => PerfLevel::kEnableCount, + engine_traits::PerfLevel::EnableTimeExceptForMutex => PerfLevel::kEnableTimeExceptForMutex, + engine_traits::PerfLevel::EnableTimeAndCpuTimeExceptForMutex => { + PerfLevel::kEnableTimeAndCPUTimeExceptForMutex + } + engine_traits::PerfLevel::EnableTime => PerfLevel::kEnableTime, + engine_traits::PerfLevel::OutOfBounds => PerfLevel::kOutOfBounds, + } +} + +pub fn to_engine_perf_level(level: PerfLevel) -> engine_traits::PerfLevel { + match level { + PerfLevel::kUninitialized => engine_traits::PerfLevel::Uninitialized, + PerfLevel::kDisable => engine_traits::PerfLevel::Disable, + PerfLevel::kEnableCount => engine_traits::PerfLevel::EnableCount, + PerfLevel::kEnableTimeExceptForMutex => engine_traits::PerfLevel::EnableTimeExceptForMutex, + PerfLevel::kEnableTimeAndCPUTimeExceptForMutex => { + engine_traits::PerfLevel::EnableTimeAndCpuTimeExceptForMutex + } + PerfLevel::kEnableTime => engine_traits::PerfLevel::EnableTime, + PerfLevel::kOutOfBounds => engine_traits::PerfLevel::OutOfBounds, + } +} + +pub fn cf_handle<'a>(db: &'a Db, cf: &str) -> Result<&'a RawCfHandle> { + db.cf(cf).ok_or_else(|| { + engine_traits::Error::Engine(engine_traits::Status::with_error( + engine_traits::Code::InvalidArgument, + format!("cf {} 
not found", cf), + )) + }) +} + +#[cfg(test)] +mod tests { + use engine_traits::CF_DEFAULT; + use tempfile::Builder; + use tirocks::option::{ReadOptions, WriteOptions}; + + use super::*; + + #[test] + fn test_cfs_diff() { + let a = vec!["1", "2", "3"]; + let a_diff_a = cfs_diff(&a, &a); + assert!(a_diff_a.is_empty()); + let b = vec!["4"]; + assert_eq!(a, cfs_diff(&a, &b)); + let c = vec!["4", "5", "3", "6"]; + assert_eq!(vec!["1", "2"], cfs_diff(&a, &c)); + assert_eq!(vec!["4", "5", "6"], cfs_diff(&c, &a)); + let d = vec!["1", "2", "3", "4"]; + let a_diff_d = cfs_diff(&a, &d); + assert!(a_diff_d.is_empty()); + assert_eq!(vec!["4"], cfs_diff(&d, &a)); + } + + #[test] + fn test_new_engine_opt() { + let temp = Builder::new() + .prefix("_util_rocksdb_test_check_column_families") + .tempdir() + .unwrap(); + let path = temp.path(); + + // create db when db not exist + let mut cfs_opts = vec![(CF_DEFAULT, RocksCfOptions::default())]; + let build_cf_opt = || { + let mut opts = RocksCfOptions::default(); + opts.set_level_compaction_dynamic_level_bytes(true); + opts + }; + cfs_opts.push(("cf_dynamic_level_bytes", build_cf_opt())); + let db = new_engine_opt(path, RocksDbOptions::default(), cfs_opts).unwrap(); + column_families_must_eq(path, vec![CF_DEFAULT, "cf_dynamic_level_bytes"]); + check_dynamic_level_bytes(&db); + drop(db); + + // add cf1. + let cfs_opts = vec![ + (CF_DEFAULT, build_cf_opt()), + ("cf_dynamic_level_bytes", build_cf_opt()), + ("cf1", build_cf_opt()), + ]; + let db = new_engine_opt(path, RocksDbOptions::default(), cfs_opts).unwrap(); + column_families_must_eq(path, vec![CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"]); + check_dynamic_level_bytes(&db); + for name in &[CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"] { + let handle = db.cf(name).unwrap(); + db.as_inner() + .put(&WriteOptions::default(), handle, b"k", b"v") + .unwrap(); + } + drop(db); + + // change order should not cause data corruption. 
+ let cfs_opts = vec![ + ("cf_dynamic_level_bytes", build_cf_opt()), + ("cf1", build_cf_opt()), + (CF_DEFAULT, build_cf_opt()), + ]; + let db = new_engine_opt(path, RocksDbOptions::default(), cfs_opts).unwrap(); + column_families_must_eq(path, vec![CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"]); + check_dynamic_level_bytes(&db); + let read_opt = ReadOptions::default(); + for name in &[CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"] { + let handle = db.cf(name).unwrap(); + assert_eq!( + db.as_inner().get(&read_opt, handle, b"k").unwrap().unwrap(), + b"v" + ); + } + drop(db); + + // drop cf1. + let cfs = vec![CF_DEFAULT, "cf_dynamic_level_bytes"]; + let db = new_engine(path, &cfs).unwrap(); + column_families_must_eq(path, cfs); + check_dynamic_level_bytes(&db); + drop(db); + + // drop all cfs. + new_engine(path, &[CF_DEFAULT]).unwrap(); + column_families_must_eq(path, vec![CF_DEFAULT]); + + // not specifying default cf should error. + new_engine(path, &[]).unwrap_err(); + column_families_must_eq(path, vec![CF_DEFAULT]); + } + + fn column_families_must_eq(path: &Path, excepted: Vec<&str>) { + let opts = RocksDbOptions::default(); + let cfs_list = Db::list_cfs(&opts, path).unwrap(); + + let mut cfs_existed: Vec<&str> = cfs_list.iter().map(|v| v.as_str()).collect(); + let mut cfs_excepted: Vec<&str> = excepted.clone(); + cfs_existed.sort_unstable(); + cfs_excepted.sort_unstable(); + assert_eq!(cfs_existed, cfs_excepted); + } + + fn check_dynamic_level_bytes(db: &RocksEngine) { + let mut handle = db.cf(CF_DEFAULT).unwrap(); + let mut tmp_cf_opts = db.as_inner().cf_options(handle); + assert!( + !tmp_cf_opts + .cf_options() + .level_compaction_dynamic_level_bytes() + ); + handle = db.cf("cf_dynamic_level_bytes").unwrap(); + tmp_cf_opts = db.as_inner().cf_options(handle); + assert!( + tmp_cf_opts + .cf_options() + .level_compaction_dynamic_level_bytes() + ); + } +} diff --git a/components/engine_tirocks/src/write_batch.rs b/components/engine_tirocks/src/write_batch.rs new file 
mode 100644 index 00000000000..1671e686917 --- /dev/null +++ b/components/engine_tirocks/src/write_batch.rs @@ -0,0 +1,383 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{Result, WriteBatchExt as _}; +use tirocks::{option::WriteOptions, WriteBatch}; + +use crate::{r2e, RocksEngine}; + +const WRITE_BATCH_MAX_BATCH_NUM: usize = 16; +const WRITE_BATCH_MAX_KEY_NUM: usize = 16; + +impl engine_traits::WriteBatchExt for RocksEngine { + type WriteBatch = RocksWriteBatchVec; + + const WRITE_BATCH_MAX_KEYS: usize = 256; + + #[inline] + fn write_batch(&self) -> RocksWriteBatchVec { + self.write_batch_with_cap(1) + } + + #[inline] + fn write_batch_with_cap(&self, cap: usize) -> RocksWriteBatchVec { + RocksWriteBatchVec::with_unit_capacity(self, cap) + } +} + +/// `RocksWriteBatchVec` is for method `MultiBatchWrite` of RocksDB, which +/// splits a large WriteBatch into many smaller ones and then any thread could +/// help to deal with these small WriteBatch when it is calling +/// `MultiBatchCommit` and wait the front writer to finish writing. +/// `MultiBatchWrite` will perform much better than traditional +/// `pipelined_write` when TiKV writes very large data into RocksDB. +/// We will remove this feature when `unordered_write` of RocksDB becomes more +/// stable and becomes compatible with Titan. +pub struct RocksWriteBatchVec { + engine: RocksEngine, + wbs: Vec, + save_points: Vec, + index: usize, +} + +impl RocksWriteBatchVec { + pub fn with_unit_capacity(engine: &RocksEngine, cap: usize) -> RocksWriteBatchVec { + let wb = WriteBatch::with_capacity(cap); + RocksWriteBatchVec { + engine: engine.clone(), + wbs: vec![wb], + save_points: vec![], + index: 0, + } + } + + /// `check_switch_batch` will split a large WriteBatch into many smaller + /// ones. This is to avoid a large WriteBatch blocking write_thread too + /// long. 
+ #[inline(always)] + fn check_switch_batch(&mut self) { + if self.engine.multi_batch_write() + && self.wbs[self.index].count() >= WRITE_BATCH_MAX_KEY_NUM + { + self.index += 1; + if self.index >= self.wbs.len() { + self.wbs.push(WriteBatch::default()); + } + } + } +} + +/// Converts engine_traits options to tirocks write options. +pub fn to_tirocks_opt(opt: &engine_traits::WriteOptions) -> WriteOptions { + let mut r = WriteOptions::default(); + r.set_sync(opt.sync()) + .set_no_slowdown(opt.no_slowdown()) + .set_disable_wal(opt.disable_wal()) + + // TODO: enable it. + .set_memtable_insert_hint_per_batch(false); + r +} + +impl engine_traits::WriteBatch for RocksWriteBatchVec { + fn write_opt(&mut self, opts: &engine_traits::WriteOptions) -> Result { + let opts = to_tirocks_opt(opts); + if self.engine.multi_batch_write() { + self.engine + .as_inner() + .write_multi(&opts, &mut self.wbs[..=self.index]) + .map_err(r2e) + } else { + self.engine + .as_inner() + .write(&opts, &mut self.wbs[0]) + .map_err(r2e) + } + } + + fn data_size(&self) -> usize { + let mut size = 0; + for w in &self.wbs[..=self.index] { + size += w.as_bytes().len(); + } + size + } + + fn count(&self) -> usize { + let mut size = 0; + for w in &self.wbs[..=self.index] { + size += w.count(); + } + size + } + + fn is_empty(&self) -> bool { + self.wbs[0].as_bytes().is_empty() + } + + #[inline] + fn should_write_to_engine(&self) -> bool { + if self.engine.multi_batch_write() { + self.index >= WRITE_BATCH_MAX_BATCH_NUM + } else { + self.wbs[0].count() > RocksEngine::WRITE_BATCH_MAX_KEYS + } + } + + fn clear(&mut self) { + for i in 0..=self.index { + self.wbs[i].clear(); + } + self.save_points.clear(); + // Avoid making the wbs too big at one time, then the memory will be kept + // after reusing + if self.index > WRITE_BATCH_MAX_BATCH_NUM { + self.wbs.shrink_to(WRITE_BATCH_MAX_BATCH_NUM); + } + self.index = 0; + } + + fn set_save_point(&mut self) { + self.wbs[self.index].set_save_point(); + 
self.save_points.push(self.index); + } + + fn pop_save_point(&mut self) -> Result<()> { + if let Some(x) = self.save_points.pop() { + return self.wbs[x].pop_save_point().map_err(r2e); + } + Err(engine_traits::Error::Engine( + engine_traits::Status::with_error( + engine_traits::Code::InvalidArgument, + "no save point", + ), + )) + } + + fn rollback_to_save_point(&mut self) -> Result<()> { + if let Some(x) = self.save_points.pop() { + for i in x + 1..=self.index { + self.wbs[i].clear(); + } + self.index = x; + return self.wbs[x].rollback_to_save_point().map_err(r2e); + } + Err(engine_traits::Error::Engine( + engine_traits::Status::with_error( + engine_traits::Code::InvalidArgument, + "no save point", + ), + )) + } + + fn merge(&mut self, mut other: Self) -> Result<()> { + if !self.engine.multi_batch_write() { + let self_wb = &mut self.wbs[0]; + for wb in &other.wbs[..=other.index] { + self_wb.append(wb).map_err(r2e)?; + } + return Ok(()); + } + let self_wb = &mut self.wbs[self.index]; + let mut other_start = 0; + if self_wb.count() < WRITE_BATCH_MAX_KEY_NUM { + self_wb.append(&other.wbs[0]).map_err(r2e)?; + other_start = 1; + } + // From this point, either of following statements is true: + // - self_wb.count() >= WRITE_BATCH_MAX_KEY_NUM + // - other.index == 0 + if other.index >= other_start { + for wb in other.wbs.drain(other_start..=other.index) { + self.index += 1; + if self.wbs.len() == self.index { + self.wbs.push(wb); + } else { + self.wbs[self.index] = wb; + } + } + } + Ok(()) + } +} + +impl engine_traits::Mutable for RocksWriteBatchVec { + fn put(&mut self, key: &[u8], value: &[u8]) -> Result<()> { + self.check_switch_batch(); + let handle = self.engine.as_inner().default_cf(); + self.wbs[self.index].put(handle, key, value).map_err(r2e) + } + + fn put_cf(&mut self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { + self.check_switch_batch(); + let handle = self.engine.cf(cf)?; + self.wbs[self.index].put(handle, key, value).map_err(r2e) + } + + fn 
delete(&mut self, key: &[u8]) -> Result<()> { + self.check_switch_batch(); + let handle = self.engine.as_inner().default_cf(); + self.wbs[self.index].delete(handle, key).map_err(r2e) + } + + fn delete_cf(&mut self, cf: &str, key: &[u8]) -> Result<()> { + self.check_switch_batch(); + let handle = self.engine.cf(cf)?; + self.wbs[self.index].delete(handle, key).map_err(r2e) + } + + fn delete_range(&mut self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + self.check_switch_batch(); + let handle = self.engine.as_inner().default_cf(); + self.wbs[self.index] + .delete_range(handle, begin_key, end_key) + .map_err(r2e) + } + + fn delete_range_cf(&mut self, cf: &str, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + self.check_switch_batch(); + let handle = self.engine.cf(cf)?; + self.wbs[self.index] + .delete_range(handle, begin_key, end_key) + .map_err(r2e) + } +} + +#[cfg(test)] +mod tests { + use std::path::Path; + + use engine_traits::{Mutable, Peekable, WriteBatch, WriteBatchExt, CF_DEFAULT}; + use tempfile::Builder; + + use super::*; + use crate::{ + cf_options::RocksCfOptions, db_options::RocksDbOptions, new_engine_opt, RocksEngine, + }; + + fn new_engine(path: &Path, multi_batch_write: bool) -> RocksEngine { + let mut db_opt = RocksDbOptions::default(); + db_opt + .set_unordered_write(false) + .set_enable_pipelined_write(!multi_batch_write) + .set_multi_batch_write(multi_batch_write); + let engine = new_engine_opt( + &path.join("db"), + db_opt, + vec![(CF_DEFAULT, RocksCfOptions::default())], + ) + .unwrap(); + assert_eq!( + engine.as_inner().db_options().multi_batch_write(), + multi_batch_write + ); + engine + } + + #[test] + fn test_should_write_to_engine_with_pipeline_write_mode() { + let path = Builder::new() + .prefix("test-should-write-to-engine") + .tempdir() + .unwrap(); + let engine = new_engine(path.path(), false); + let mut wb = engine.write_batch(); + for _ in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { + wb.put(b"aaa", b"bbb").unwrap(); + } + 
assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + wb.write().unwrap(); + + let v = engine.get_value(b"aaa").unwrap(); + + assert!(v.is_some()); + assert_eq!(v.unwrap(), b"bbb"); + let mut wb = RocksWriteBatchVec::with_unit_capacity(&engine, 1024); + for _i in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + wb.clear(); + assert!(!wb.should_write_to_engine()); + } + + #[test] + fn test_should_write_to_engine_with_multi_batch_write_mode() { + let path = Builder::new() + .prefix("test-should-write-to-engine") + .tempdir() + .unwrap(); + let engine = new_engine(path.path(), true); + let mut wb = engine.write_batch(); + for _ in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + let mut wb = RocksWriteBatchVec::with_unit_capacity(&engine, 1024); + for _ in 0..WRITE_BATCH_MAX_BATCH_NUM * WRITE_BATCH_MAX_KEY_NUM { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + wb.clear(); + assert!(!wb.should_write_to_engine()); + } + + #[test] + fn test_write_batch_merge() { + let path = Builder::new() + .prefix("test-should-write-to-engine") + .tempdir() + .unwrap(); + for multi_batch_write in &[false, true] { + let engine = new_engine(path.path(), *multi_batch_write); + let mut wb = engine.write_batch(); + for _ in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert_eq!(wb.count(), RocksEngine::WRITE_BATCH_MAX_KEYS); + + let mut wb2 = engine.write_batch(); + for _ in 0..WRITE_BATCH_MAX_KEY_NUM / 2 { + wb2.put(b"aaa", b"bbb").unwrap(); + } + assert_eq!(wb2.count(), WRITE_BATCH_MAX_KEY_NUM / 
2); + // The only batch should be moved directly. + wb.merge(wb2).unwrap(); + assert_eq!( + wb.count(), + RocksEngine::WRITE_BATCH_MAX_KEYS + WRITE_BATCH_MAX_KEY_NUM / 2 + ); + if *multi_batch_write { + assert_eq!( + wb.wbs.len(), + RocksEngine::WRITE_BATCH_MAX_KEYS / WRITE_BATCH_MAX_KEY_NUM + 1 + ); + } + + let mut wb3 = engine.write_batch(); + for _ in 0..WRITE_BATCH_MAX_KEY_NUM / 2 * 3 { + wb3.put(b"aaa", b"bbb").unwrap(); + } + assert_eq!(wb3.count(), WRITE_BATCH_MAX_KEY_NUM / 2 * 3); + // The half batch should be merged together, and then move the left one. + wb.merge(wb3).unwrap(); + assert_eq!( + wb.count(), + RocksEngine::WRITE_BATCH_MAX_KEYS + WRITE_BATCH_MAX_KEY_NUM * 2 + ); + if *multi_batch_write { + assert_eq!( + wb.wbs.len(), + RocksEngine::WRITE_BATCH_MAX_KEYS / WRITE_BATCH_MAX_KEY_NUM + 2 + ); + } + } + } +} diff --git a/components/engine_traits/Cargo.toml b/components/engine_traits/Cargo.toml index 3b8c3efa33b..664bc72afc5 100644 --- a/components/engine_traits/Cargo.toml +++ b/components/engine_traits/Cargo.toml @@ -6,23 +6,28 @@ publish = false [features] failpoints = ["fail/failpoints"] +testexport = [] [dependencies] -case_macros = { path = "../case_macros" } -error_code = { path = "../error_code", default-features = false } +case_macros = { workspace = true } +collections = { workspace = true } +error_code = { workspace = true } fail = "0.5" -file_system = { path = "../file_system", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } -log_wrappers = { path = "../log_wrappers" } +file_system = { workspace = true } +keys = { workspace = true } +kvproto = { workspace = true } +lazy_static = "1.0" +log_wrappers = { workspace = true } protobuf = "2" raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } serde = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = 
"https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } -txn_types = { path = "../txn_types", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } +tracker = { workspace = true } +txn_types = { workspace = true } [dev-dependencies] serde_derive = "1.0" diff --git a/components/engine_traits/src/cf_defs.rs b/components/engine_traits/src/cf_defs.rs index f47a63e69e3..27546dfc1c1 100644 --- a/components/engine_traits/src/cf_defs.rs +++ b/components/engine_traits/src/cf_defs.rs @@ -9,16 +9,16 @@ pub const CF_RAFT: CfName = "raft"; pub const LARGE_CFS: &[CfName] = &[CF_DEFAULT, CF_LOCK, CF_WRITE]; pub const ALL_CFS: &[CfName] = &[CF_DEFAULT, CF_LOCK, CF_WRITE, CF_RAFT]; pub const DATA_CFS: &[CfName] = &[CF_DEFAULT, CF_LOCK, CF_WRITE]; +pub const DATA_CFS_LEN: usize = DATA_CFS.len(); + +pub fn data_cf_offset(cf: &str) -> usize { + let cf = if cf.is_empty() { CF_DEFAULT } else { cf }; + DATA_CFS.iter().position(|c| *c == cf).expect(cf) +} pub fn name_to_cf(name: &str) -> Option { if name.is_empty() { return Some(CF_DEFAULT); } - for c in ALL_CFS { - if name == *c { - return Some(c); - } - } - - None + ALL_CFS.iter().copied().find(|c| name == *c) } diff --git a/components/engine_traits/src/cf_names.rs b/components/engine_traits/src/cf_names.rs index 714139c8530..c33ac11081a 100644 --- a/components/engine_traits/src/cf_names.rs +++ b/components/engine_traits/src/cf_names.rs @@ -1,5 +1,5 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-pub trait CFNamesExt { +pub trait CfNamesExt { fn cf_names(&self) -> Vec<&str>; } diff --git a/components/engine_traits/src/cf_options.rs b/components/engine_traits/src/cf_options.rs index 2e130cbf73c..5fb85aedf95 100644 --- a/components/engine_traits/src/cf_options.rs +++ b/components/engine_traits/src/cf_options.rs @@ -1,28 +1,30 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use crate::{db_options::TitanDBOptions, sst_partitioner::SstPartitionerFactory, Result}; +use crate::{db_options::TitanCfOptions, sst_partitioner::SstPartitionerFactory, Result}; /// Trait for engines with column family options -pub trait CFOptionsExt { - type ColumnFamilyOptions: ColumnFamilyOptions; +pub trait CfOptionsExt { + type CfOptions: CfOptions; - fn get_options_cf(&self, cf: &str) -> Result; + fn get_options_cf(&self, cf: &str) -> Result; fn set_options_cf(&self, cf: &str, options: &[(&str, &str)]) -> Result<()>; } -pub trait ColumnFamilyOptions { - type TitanDBOptions: TitanDBOptions; +pub trait CfOptions { + type TitanCfOptions: TitanCfOptions; fn new() -> Self; fn get_max_write_buffer_number(&self) -> u32; - fn get_level_zero_slowdown_writes_trigger(&self) -> u32; - fn get_level_zero_stop_writes_trigger(&self) -> u32; + /// Negative means no limit. + fn get_level_zero_slowdown_writes_trigger(&self) -> i32; + /// Negative means no limit. 
+ fn get_level_zero_stop_writes_trigger(&self) -> i32; fn set_level_zero_file_num_compaction_trigger(&mut self, v: i32); fn get_soft_pending_compaction_bytes_limit(&self) -> u64; fn get_hard_pending_compaction_bytes_limit(&self) -> u64; fn get_block_cache_capacity(&self) -> u64; - fn set_block_cache_capacity(&self, capacity: u64) -> std::result::Result<(), String>; - fn set_titandb_options(&mut self, opts: &Self::TitanDBOptions); + fn set_block_cache_capacity(&self, capacity: u64) -> Result<()>; + fn set_titan_cf_options(&mut self, opts: &Self::TitanCfOptions); fn get_target_file_size_base(&self) -> u64; fn set_disable_auto_compactions(&mut self, v: bool); fn get_disable_auto_compactions(&self) -> bool; diff --git a/components/engine_traits/src/checkpoint.rs b/components/engine_traits/src/checkpoint.rs new file mode 100644 index 00000000000..6b966d806fe --- /dev/null +++ b/components/engine_traits/src/checkpoint.rs @@ -0,0 +1,22 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::path::Path; + +use crate::Result; + +pub trait Checkpointable { + type Checkpointer: Checkpointer; + + fn new_checkpointer(&self) -> Result; + + fn merge(&self, dbs: &[&Self]) -> Result<()>; +} + +pub trait Checkpointer { + fn create_at( + &mut self, + db_out_dir: &Path, + titan_out_dir: Option<&Path>, + log_size_for_flush: u64, + ) -> Result<()>; +} diff --git a/components/engine_traits/src/compact.rs b/components/engine_traits/src/compact.rs index a7e8636769b..05590a1ff32 100644 --- a/components/engine_traits/src/compact.rs +++ b/components/engine_traits/src/compact.rs @@ -4,16 +4,30 @@ use std::collections::BTreeMap; -use crate::errors::Result; +use crate::{errors::Result, CfNamesExt}; -pub trait CompactExt { +pub trait CompactExt: CfNamesExt { type CompactedEvent: CompactedEvent; - /// Checks whether any column family sets `disable_auto_compactions` to `True` or not. 
+ /// Checks whether any column family sets `disable_auto_compactions` to + /// `True` or not. fn auto_compactions_is_disabled(&self) -> Result; - /// Compacts the column families in the specified range by manual or not. fn compact_range( + &self, + start_key: Option<&[u8]>, + end_key: Option<&[u8]>, + exclusive_manual: bool, + max_subcompactions: u32, + ) -> Result<()> { + for cf in self.cf_names() { + self.compact_range_cf(cf, start_key, end_key, exclusive_manual, max_subcompactions)?; + } + Ok(()) + } + + /// Compacts the column families in the specified range by manual or not. + fn compact_range_cf( &self, cf: &str, start_key: Option<&[u8]>, @@ -24,16 +38,23 @@ pub trait CompactExt { /// Compacts files in the range and above the output level. /// Compacts all files if the range is not specified. - /// Compacts all files to the bottommost level if the output level is not specified. + /// Compacts all files to the bottommost level if the output level is not + /// specified. fn compact_files_in_range( &self, start: Option<&[u8]>, end: Option<&[u8]>, output_level: Option, - ) -> Result<()>; + ) -> Result<()> { + for cf in self.cf_names() { + self.compact_files_in_range_cf(cf, start, end, output_level)?; + } + Ok(()) + } - /// Compacts files in the range and above the output level of the given column family. - /// Compacts all files to the bottommost level if the output level is not specified. + /// Compacts files in the range and above the output level of the given + /// column family. Compacts all files to the bottommost level if the + /// output level is not specified. 
fn compact_files_in_range_cf( &self, cf: &str, diff --git a/components/engine_traits/src/db_options.rs b/components/engine_traits/src/db_options.rs index 7a6042d3db4..2c6e9c3d4e8 100644 --- a/components/engine_traits/src/db_options.rs +++ b/components/engine_traits/src/db_options.rs @@ -3,16 +3,16 @@ use crate::errors::Result; /// A trait for engines that support setting global options -pub trait DBOptionsExt { - type DBOptions: DBOptions; +pub trait DbOptionsExt { + type DbOptions: DbOptions; - fn get_db_options(&self) -> Self::DBOptions; + fn get_db_options(&self) -> Self::DbOptions; fn set_db_options(&self, options: &[(&str, &str)]) -> Result<()>; } /// A handle to a database's options -pub trait DBOptions { - type TitanDBOptions: TitanDBOptions; +pub trait DbOptions { + type TitanDbOptions: TitanCfOptions; fn new() -> Self; fn get_max_background_jobs(&self) -> i32; @@ -20,11 +20,13 @@ pub trait DBOptions { fn set_rate_bytes_per_sec(&mut self, rate_bytes_per_sec: i64) -> Result<()>; fn get_rate_limiter_auto_tuned(&self) -> Option; fn set_rate_limiter_auto_tuned(&mut self, rate_limiter_auto_tuned: bool) -> Result<()>; - fn set_titandb_options(&mut self, opts: &Self::TitanDBOptions); + fn set_flush_size(&mut self, f: usize) -> Result<()>; + fn set_flush_oldest_first(&mut self, f: bool) -> Result<()>; + fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions); } /// Titan-specefic options -pub trait TitanDBOptions { +pub trait TitanCfOptions { fn new() -> Self; fn set_min_blob_size(&mut self, size: u64); } diff --git a/components/engine_traits/src/db_vector.rs b/components/engine_traits/src/db_vector.rs index 9caf55d9e22..08bea9f11e5 100644 --- a/components/engine_traits/src/db_vector.rs +++ b/components/engine_traits/src/db_vector.rs @@ -6,4 +6,4 @@ use std::{fmt::Debug, ops::Deref}; /// /// The database may optimize this type to be a view into /// its own cache. 
-pub trait DBVector: Debug + Deref + for<'a> PartialEq<&'a [u8]> {} +pub trait DbVector: Debug + Deref + for<'a> PartialEq<&'a [u8]> {} diff --git a/components/engine_traits/src/encryption.rs b/components/engine_traits/src/encryption.rs index 51b19c05907..16f29d16d75 100644 --- a/components/engine_traits/src/encryption.rs +++ b/components/engine_traits/src/encryption.rs @@ -12,7 +12,7 @@ pub trait EncryptionKeyManager: Sync + Send { fn link_file(&self, src_fname: &str, dst_fname: &str) -> Result<()>; } -#[derive(Clone, PartialEq, Eq)] +#[derive(Clone, PartialEq)] pub struct FileEncryptionInfo { pub method: EncryptionMethod, pub key: Vec, @@ -46,11 +46,12 @@ impl FileEncryptionInfo { } } -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Debug, PartialEq)] pub enum EncryptionMethod { Unknown = 0, Plaintext = 1, Aes128Ctr = 2, Aes192Ctr = 3, Aes256Ctr = 4, + Sm4Ctr = 5, } diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index c4dad67e3c5..aa90c23b429 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -1,6 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::fmt::Debug; +use std::{fmt::Debug, str}; use crate::*; @@ -15,9 +15,9 @@ pub trait KvEngine: + SyncMutable + Iterable + WriteBatchExt - + DBOptionsExt - + CFNamesExt - + CFOptionsExt + + DbOptionsExt + + CfNamesExt + + CfOptionsExt + ImportExt + SstExt + CompactExt @@ -32,6 +32,7 @@ pub trait KvEngine: + Clone + Debug + Unpin + + Checkpointable + 'static { /// A consistent read-only snapshot of the database @@ -46,22 +47,28 @@ pub trait KvEngine: /// Flush metrics to prometheus /// /// `instance` is the label of the metric to flush. 
- fn flush_metrics(&self, _instance: &str) {} - - /// Reset internal statistics - fn reset_statistics(&self) {} + fn flush_metrics(&self, instance: &str) { + let mut reporter = Self::StatisticsReporter::new(instance); + reporter.collect(self); + reporter.flush(); + } /// Cast to a concrete engine type /// /// This only exists as a temporary hack during refactoring. /// It cannot be used forever. fn bad_downcast(&self) -> &T; -} -/// A factory trait to create new engine. -/// -// It should be named as `EngineFactory` for consistency, but we are about to rename -// engine to tablet, so always use tablet for new traits/types. -pub trait TabletFactory { - fn create_tablet(&self) -> Result; + /// Returns false if KvEngine can't apply snapshot for this region now. + /// Some KvEngines need to do some transforms before apply data from + /// snapshot. These procedures can be batched in background if there are + /// more than one incoming snapshots, thus not blocking applying thread. + fn can_apply_snapshot(&self, _is_timeout: bool, _new_batch: bool, _region_id: u64) -> bool { + true + } + + /// A method for test to expose inner db refcount in order to make sure a + /// full release of engine. + #[cfg(any(test, feature = "testexport"))] + fn inner_refcount(&self) -> usize; } diff --git a/components/engine_traits/src/engines.rs b/components/engine_traits/src/engines.rs index fd0fa961c06..569648f3c30 100644 --- a/components/engine_traits/src/engines.rs +++ b/components/engine_traits/src/engines.rs @@ -7,6 +7,7 @@ use crate::{ #[derive(Clone, Debug)] pub struct Engines { + // kv can be either global kv store, or the tablet in multirocks version. 
pub kv: K, pub raft: R, } @@ -19,11 +20,11 @@ impl Engines { } } - pub fn write_kv(&self, wb: &K::WriteBatch) -> Result<()> { + pub fn write_kv(&self, wb: &mut K::WriteBatch) -> Result { wb.write() } - pub fn write_kv_opt(&self, wb: &K::WriteBatch, opts: &WriteOptions) -> Result<()> { + pub fn write_kv_opt(&self, wb: &mut K::WriteBatch, opts: &WriteOptions) -> Result { wb.write_opt(opts) } diff --git a/components/engine_traits/src/errors.rs b/components/engine_traits/src/errors.rs index 12104e14a5c..6ef46ff7a70 100644 --- a/components/engine_traits/src/errors.rs +++ b/components/engine_traits/src/errors.rs @@ -6,11 +6,124 @@ use error_code::{self, ErrorCode, ErrorCodeExt}; use raft::{Error as RaftError, StorageError}; use thiserror::Error; +#[repr(u8)] +#[derive(Debug, Copy, Clone, Hash, PartialEq)] +pub enum Code { + Ok = 0, + NotFound = 1, + Corruption = 2, + NotSupported = 3, + InvalidArgument = 4, + IoError = 5, + MergeInProgress = 6, + Incomplete = 7, + ShutdownInProgress = 8, + TimedOut = 9, + Aborted = 10, + Busy = 11, + Expired = 12, + TryAgain = 13, + CompactionTooLarge = 14, + ColumnFamilyDropped = 15, +} + +#[repr(u8)] +#[derive(Debug, Copy, Clone, Hash, PartialEq)] +pub enum SubCode { + None = 0, + MutexTimeout = 1, + LockTimeout = 2, + LockLimit = 3, + NoSpace = 4, + Deadlock = 5, + StaleFile = 6, + MemoryLimit = 7, + SpaceLimit = 8, + PathNotFound = 9, + MergeOperandsInsufficientCapacity = 10, + ManualCompactionPaused = 11, + Overwritten = 12, + TxnNotPrepared = 13, + IoFenced = 14, +} + +#[repr(u8)] +#[derive(Debug, Copy, Clone, Hash, PartialEq)] +pub enum Severity { + NoError = 0, + SoftError = 1, + HardError = 2, + FatalError = 3, + UnrecoverableError = 4, +} + +#[repr(C)] +#[derive(Debug, Error)] +#[error("[{:?}] {:?}-{:?} {}", .code, .sub_code, .sev, .state)] +pub struct Status { + code: Code, + sub_code: SubCode, + sev: Severity, + state: String, +} + +impl Status { + pub fn with_code(code: Code) -> Status { + Self { + code, + sub_code: 
SubCode::None, + sev: Severity::NoError, + state: String::new(), + } + } + + pub fn with_error(code: Code, error: impl Into) -> Self { + Self { + code, + sub_code: SubCode::None, + sev: Severity::NoError, + state: error.into(), + } + } + + #[inline] + pub fn set_sub_code(&mut self, sub_code: SubCode) -> &mut Self { + self.sub_code = sub_code; + self + } + + #[inline] + pub fn set_severity(&mut self, sev: Severity) -> &mut Self { + self.sev = sev; + self + } + + #[inline] + pub fn code(&self) -> Code { + self.code + } + + #[inline] + pub fn sub_code(&self) -> SubCode { + self.sub_code + } + + #[inline] + pub fn severity(&self) -> Severity { + self.sev + } + + #[inline] + pub fn state(&self) -> &str { + &self.state + } +} + #[derive(Debug, Error)] pub enum Error { // Engine uses plain string as the error. - #[error("Storage Engine {0}")] - Engine(String), + #[error("Storage Engine {0:?}")] + Engine(#[from] Status), // FIXME: It should not know Region. #[error( "Key {} is out of [region {}] [{}, {})", @@ -29,7 +142,7 @@ pub enum Error { #[error("{0:?}")] Other(#[from] Box), #[error("CF {0} not found")] - CFName(String), + CfName(String), #[error("Codec {0}")] Codec(#[from] tikv_util::codec::Error), #[error("The entries of region is unavailable")] @@ -38,12 +151,6 @@ pub enum Error { EntriesCompacted, } -impl From for Error { - fn from(err: String) -> Self { - Error::Engine(err) - } -} - pub type Result = result::Result; impl ErrorCodeExt for Error { @@ -53,7 +160,7 @@ impl ErrorCodeExt for Error { Error::NotInRange { .. 
} => error_code::engine::NOT_IN_RANGE, Error::Protobuf(_) => error_code::engine::PROTOBUF, Error::Io(_) => error_code::engine::IO, - Error::CFName(_) => error_code::engine::CF_NAME, + Error::CfName(_) => error_code::engine::CF_NAME, Error::Codec(_) => error_code::engine::CODEC, Error::Other(_) => error_code::UNKNOWN, Error::EntriesUnavailable => error_code::engine::DATALOSS, diff --git a/components/engine_traits/src/file_system.rs b/components/engine_traits/src/file_system.rs index 9022aeb7dc2..51911b1f58e 100644 --- a/components/engine_traits/src/file_system.rs +++ b/components/engine_traits/src/file_system.rs @@ -2,15 +2,17 @@ use std::sync::Arc; -use file_system::{get_io_rate_limiter, get_io_type, IOOp, IORateLimiter}; +use file_system::{get_io_rate_limiter, get_io_type, IoOp, IoRateLimiter}; + +use crate::Result; pub trait FileSystemInspector: Sync + Send { - fn read(&self, len: usize) -> Result; - fn write(&self, len: usize) -> Result; + fn read(&self, len: usize) -> Result; + fn write(&self, len: usize) -> Result; } pub struct EngineFileSystemInspector { - limiter: Option>, + limiter: Option>, } impl EngineFileSystemInspector { @@ -21,7 +23,7 @@ impl EngineFileSystemInspector { } } - pub fn from_limiter(limiter: Option>) -> Self { + pub fn from_limiter(limiter: Option>) -> Self { EngineFileSystemInspector { limiter } } } @@ -33,19 +35,19 @@ impl Default for EngineFileSystemInspector { } impl FileSystemInspector for EngineFileSystemInspector { - fn read(&self, len: usize) -> Result { + fn read(&self, len: usize) -> Result { if let Some(limiter) = &self.limiter { let io_type = get_io_type(); - Ok(limiter.request(io_type, IOOp::Read, len)) + Ok(limiter.request(io_type, IoOp::Read, len)) } else { Ok(len) } } - fn write(&self, len: usize) -> Result { + fn write(&self, len: usize) -> Result { if let Some(limiter) = &self.limiter { let io_type = get_io_type(); - Ok(limiter.request(io_type, IOOp::Write, len)) + Ok(limiter.request(io_type, IoOp::Write, len)) } else { 
Ok(len) } diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs new file mode 100644 index 00000000000..8b0566f2cfb --- /dev/null +++ b/components/engine_traits/src/flush.rs @@ -0,0 +1,217 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! A helper class to detect flush event and trace apply index. +//! +//! The whole idea is when all CFs have flushed to disk, then the apply index +//! should be able to be advanced to the latest. The implementation depends on +//! the assumption that memtable/write buffer is frozen one by one and flushed +//! one by one. +//! +//! Because apply index can be arbitrary value after restart, so apply related +//! states like `RaftApplyState` and `RegionLocalState` are mapped to index. +//! Once apply index is confirmed, the latest states before apply index should +//! be used as the start state. + +use std::{ + collections::LinkedList, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, +}; + +use slog_global::info; +use tikv_util::set_panic_mark; + +use crate::{data_cf_offset, RaftEngine, RaftLogBatch, DATA_CFS_LEN}; + +#[derive(Debug)] +pub struct ApplyProgress { + cf: String, + apply_index: u64, + smallest_seqno: u64, +} + +impl ApplyProgress { + fn merge(&mut self, pr: ApplyProgress) { + debug_assert_eq!(self.cf, pr.cf); + debug_assert!(self.apply_index <= pr.apply_index); + self.apply_index = pr.apply_index; + } + + pub fn applied_index(&self) -> u64 { + self.apply_index + } + + pub fn cf(&self) -> &str { + &self.cf + } +} + +#[derive(Default, Debug)] +struct FlushProgress { + prs: LinkedList, + last_flushed: [u64; DATA_CFS_LEN], +} + +/// A shared state between raftstore and underlying engine. +/// +/// raftstore will update state changes and corresponding apply index, when +/// flush, `PersistenceListener` will query states related to the memtable +/// and persist the relation to raft engine.
+#[derive(Debug)] +pub struct FlushState { + applied_index: AtomicU64, +} + +impl FlushState { + pub fn new(applied_index: u64) -> Self { + Self { + applied_index: AtomicU64::new(applied_index), + } + } + + /// Set the latest applied index. + #[inline] + pub fn set_applied_index(&self, index: u64) { + self.applied_index.store(index, Ordering::Release); + } + + /// Query the applied index. + #[inline] + pub fn applied_index(&self) -> u64 { + self.applied_index.load(Ordering::Acquire) + } +} + +/// A helper trait to avoid exposing `RaftEngine` to `TabletFactory`. +pub trait StateStorage: Sync + Send { + fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: ApplyProgress); +} + +/// A flush listener that maps memtable to apply index and persists the relation +/// to raft engine. +pub struct PersistenceListener { + region_id: u64, + tablet_index: u64, + state: Arc, + progress: Mutex, + storage: Arc, +} + +impl PersistenceListener { + pub fn new( + region_id: u64, + tablet_index: u64, + state: Arc, + storage: Arc, + ) -> Self { + Self { + region_id, + tablet_index, + state, + progress: Mutex::new(FlushProgress::default()), + storage, + } + } +} + +impl PersistenceListener { + pub fn flush_state(&self) -> &Arc { + &self.state + } + + /// Called when memtable is frozen. + /// + /// `smallest_seqno` should be the smallest seqno of the memtable. + pub fn on_memtable_sealed(&self, cf: String, smallest_seqno: u64) { + // The correctness relies on the assumption that there will be only one + // thread writing to the DB and increasing apply index. + // Apply index will be set within DB lock, so it's correct even with manual + // flush.
+ let offset = data_cf_offset(&cf); + let apply_index = self.state.applied_index.load(Ordering::SeqCst); + let mut prs = self.progress.lock().unwrap(); + let flushed = prs.last_flushed[offset]; + if flushed > smallest_seqno { + panic!( + "sealed seqno has been flushed {} {} {} <= {}", + cf, apply_index, smallest_seqno, flushed + ); + } + prs.prs.push_back(ApplyProgress { + cf, + apply_index, + smallest_seqno, + }); + } + + /// Called when a memtable has finished flushing. + /// + /// `largest_seqno` should be the largest seqno of the generated file. + pub fn on_flush_completed(&self, cf: &str, largest_seqno: u64, file_no: u64) { + // Maybe we should hook the compaction to avoid the file is compacted before + // being recorded. + let offset = data_cf_offset(cf); + let pr = { + let mut prs = self.progress.lock().unwrap(); + let flushed = prs.last_flushed[offset]; + if flushed >= largest_seqno { + // According to facebook/rocksdb#11183, it's possible OnFlushCompleted can be + // called out of order. But it's guaranteed files are installed in order.
+ info!("flush complete reorder found"; "flushed" => flushed, "largest_seqno" => largest_seqno, "file_no" => file_no, "cf" => cf); + return; + } + prs.last_flushed[offset] = largest_seqno; + let mut cursor = prs.prs.cursor_front_mut(); + let mut flushed_pr = None; + while let Some(pr) = cursor.current() { + if pr.cf != cf { + cursor.move_next(); + continue; + } + if pr.smallest_seqno <= largest_seqno { + match &mut flushed_pr { + None => flushed_pr = cursor.remove_current(), + Some(flushed_pr) => { + flushed_pr.merge(cursor.remove_current().unwrap()); + } + } + continue; + } + break; + } + match flushed_pr { + Some(pr) => pr, + None => { + set_panic_mark(); + panic!( + "[region_id={}] [tablet_index={}] {} {} {} not found in {:?}", + self.region_id, self.tablet_index, cf, largest_seqno, file_no, prs + ) + } + } + }; + self.storage + .persist_progress(self.region_id, self.tablet_index, pr); + } +} + +impl StateStorage for R { + fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: ApplyProgress) { + if pr.apply_index == 0 { + return; + } + let mut batch = self.log_batch(1); + // TODO: It's possible that flush succeeds but fails to call + // `on_flush_completed` before exit. In this case the flushed data will + // be replayed again after restarted. To solve the problem, we need to + // (1) persist flushed file numbers in `on_flush_begin` and (2) check + // the file number in `on_compaction_begin`. After restart, (3) check if the + // file exists. If (1) && ((2) || (3)), then we don't need to replay the data. 
+ batch + .put_flushed_index(region_id, &pr.cf, tablet_index, pr.apply_index) + .unwrap(); + self.consume(&mut batch, true).unwrap(); + } +} diff --git a/components/engine_traits/src/iterable.rs b/components/engine_traits/src/iterable.rs index a6dbdd2d03f..50fcfc2344b 100644 --- a/components/engine_traits/src/iterable.rs +++ b/components/engine_traits/src/iterable.rs @@ -31,13 +31,6 @@ use tikv_util::keybuilder::KeyBuilder; use crate::*; -/// A token indicating where an iterator "seek" operation should stop. -pub enum SeekKey<'a> { - Start, - End, - Key(&'a [u8]), -} - /// An iterator over a consistent set of keys and values. /// /// Iterators are implemented for `KvEngine`s and for `Snapshot`s. They see a @@ -56,15 +49,8 @@ pub enum SeekKey<'a> { pub trait Iterator: Send { /// Move the iterator to a specific key. /// - /// When `key` is `SeekKey::Start` or `SeekKey::End`, - /// `seek` and `seek_for_prev` behave identically. - /// The difference between the two functions is how they - /// behave for `SeekKey::Key`, and only when an exactly - /// matching keys is not found: - /// - /// When seeking with `SeekKey::Key`, and an exact match is not found, - /// `seek` sets the iterator to the next key greater than that - /// specified as `key`, if such a key exists; + /// When an exact match is not found, `seek` sets the iterator to the next + /// key greater than that specified as `key`, if such a key exists; /// `seek_for_prev` sets the iterator to the previous key less than /// that specified as `key`, if such a key exists. /// @@ -72,7 +58,7 @@ pub trait Iterator: Send { /// /// `true` if seeking succeeded and the iterator is valid, /// `false` if seeking failed and the iterator is invalid. - fn seek(&mut self, key: SeekKey<'_>) -> Result; + fn seek(&mut self, key: &[u8]) -> Result; /// Move the iterator to a specific key. 
/// @@ -83,79 +69,66 @@ pub trait Iterator: Send { /// /// `true` if seeking succeeded and the iterator is valid, /// `false` if seeking failed and the iterator is invalid. - fn seek_for_prev(&mut self, key: SeekKey<'_>) -> Result; + fn seek_for_prev(&mut self, key: &[u8]) -> Result; - /// Short for `seek(SeekKey::Start)`. - fn seek_to_first(&mut self) -> Result { - self.seek(SeekKey::Start) - } + /// Seek to the first key in the engine. + fn seek_to_first(&mut self) -> Result; - /// Short for `seek(SeekKey::End)`. - fn seek_to_last(&mut self) -> Result { - self.seek(SeekKey::End) - } + /// Seek to the last key in the database. + fn seek_to_last(&mut self) -> Result; /// Move a valid iterator to the previous key. /// /// # Panics /// - /// If the iterator is invalid + /// If the iterator is invalid, the iterator may panic or abort. fn prev(&mut self) -> Result; /// Move a valid iterator to the next key. /// /// # Panics /// - /// If the iterator is invalid + /// If the iterator is invalid, the iterator may panic or abort. fn next(&mut self) -> Result; /// Retrieve the current key. /// /// # Panics /// - /// If the iterator is invalid + /// If the iterator is invalid, the iterator may panic or abort. fn key(&self) -> &[u8]; /// Retrieve the current value. /// /// # Panics /// - /// If the iterator is invalid + /// If the iterator is invalid, the iterator may panic or abort. fn value(&self) -> &[u8]; /// Returns `true` if the iterator points to a `key`/`value` pair.
fn valid(&self) -> Result; } +pub trait RefIterable { + type Iterator<'a>: Iterator + where + Self: 'a; + + fn iter(&self, opts: IterOptions) -> Result>; +} + pub trait Iterable { type Iterator: Iterator; - fn iterator_opt(&self, opts: IterOptions) -> Result; - fn iterator_cf_opt(&self, cf: &str, opts: IterOptions) -> Result; - - fn iterator(&self) -> Result { - self.iterator_opt(IterOptions::default()) - } + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result; - fn iterator_cf(&self, cf: &str) -> Result { - self.iterator_cf_opt(cf, IterOptions::default()) + fn iterator(&self, cf: &str) -> Result { + self.iterator_opt(cf, IterOptions::default()) } /// scan the key between start_key(inclusive) and end_key(exclusive), /// the upper bound is omitted if end_key is empty - fn scan(&self, start_key: &[u8], end_key: &[u8], fill_cache: bool, f: F) -> Result<()> - where - F: FnMut(&[u8], &[u8]) -> Result, - { - let start = KeyBuilder::from_slice(start_key, DATA_KEY_PREFIX_LEN, 0); - let end = - (!end_key.is_empty()).then(|| KeyBuilder::from_slice(end_key, DATA_KEY_PREFIX_LEN, 0)); - let iter_opt = IterOptions::new(Some(start), end, fill_cache); - scan_impl(self.iterator_opt(iter_opt)?, start_key, f) - } - - // like `scan`, only on a specific column family. - fn scan_cf( + fn scan( &self, cf: &str, start_key: &[u8], @@ -166,27 +139,14 @@ pub trait Iterable { where F: FnMut(&[u8], &[u8]) -> Result, { - let start = KeyBuilder::from_slice(start_key, DATA_KEY_PREFIX_LEN, 0); - let end = - (!end_key.is_empty()).then(|| KeyBuilder::from_slice(end_key, DATA_KEY_PREFIX_LEN, 0)); - let iter_opt = IterOptions::new(Some(start), end, fill_cache); - scan_impl(self.iterator_cf_opt(cf, iter_opt)?, start_key, f) + let iter_opt = iter_option(start_key, end_key, fill_cache); + scan_impl(self.iterator_opt(cf, iter_opt)?, start_key, f) } // Seek the first key >= given key, if not found, return None. 
- fn seek(&self, key: &[u8]) -> Result, Vec)>> { - let mut iter = self.iterator()?; - if iter.seek(SeekKey::Key(key))? { - let (k, v) = (iter.key().to_vec(), iter.value().to_vec()); - return Ok(Some((k, v))); - } - Ok(None) - } - - // Seek the first key >= given key, if not found, return None. - fn seek_cf(&self, cf: &str, key: &[u8]) -> Result, Vec)>> { - let mut iter = self.iterator_cf(cf)?; - if iter.seek(SeekKey::Key(key))? { + fn seek(&self, cf: &str, key: &[u8]) -> Result, Vec)>> { + let mut iter = self.iterator(cf)?; + if iter.seek(key)? { return Ok(Some((iter.key().to_vec(), iter.value().to_vec()))); } Ok(None) @@ -198,19 +158,13 @@ where Iter: Iterator, F: FnMut(&[u8], &[u8]) -> Result, { - let mut remained = it.seek(SeekKey::Key(start_key))?; + let mut remained = it.seek(start_key)?; while remained { remained = f(it.key(), it.value())? && it.next()?; } Ok(()) } -impl<'a> From<&'a [u8]> for SeekKey<'a> { - fn from(bs: &'a [u8]) -> SeekKey<'a> { - SeekKey::Key(bs) - } -} - /// Collect all items of `it` into a vector, generally used for tests. /// /// # Panics @@ -226,3 +180,15 @@ pub fn collect(mut it: I) -> Vec<(Vec, Vec)> { } v } + +/// Build an `IterOptions` using giving data key bound. Empty upper bound will +/// be ignored. +pub fn iter_option(lower_bound: &[u8], upper_bound: &[u8], fill_cache: bool) -> IterOptions { + let lower_bound = Some(KeyBuilder::from_slice(lower_bound, 0, 0)); + let upper_bound = if upper_bound.is_empty() { + None + } else { + Some(KeyBuilder::from_slice(upper_bound, 0, 0)) + }; + IterOptions::new(lower_bound, upper_bound, fill_cache) +} diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index c5b09fe59e1..45a3d18fa7a 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -60,14 +60,15 @@ //! - [`SyncMutable`] and [`Mutable`] - types to which single key/value pairs //! can be written. This includes engines and write batches. //! -//! 
- [`WriteBatch`] - types that can commit multiple key/value pairs in batches. -//! A `WriteBatchExt::WriteBtach` commits all pairs in one atomic transaction. -//! A `WriteBatchExt::WriteBatchVec` does not (FIXME: is this correct?). +//! - [`WriteBatch`] - types that can commit multiple key/value pairs in +//! batches. A `WriteBatchExt::WriteBatch` commits all pairs in one atomic +//! transaction. A `WriteBatchExt::WriteBatchVec` does not (FIXME: is this +//! correct?). //! //! The `KvEngine` instance generally acts as a factory for types that implement //! other traits in the crate. These factory methods, associated types, and -//! other associated methods are defined in "extension" traits. For example, methods -//! on engines related to batch writes are in the `WriteBatchExt` trait. +//! other associated methods are defined in "extension" traits. For example, +//! methods on engines related to batch writes are in the `WriteBatchExt` trait. //! //! //! # Design notes @@ -75,19 +76,19 @@ //! - `KvEngine` is the main engine trait. It requires many other traits, which //! have many other associated types that implement yet more traits. //! -//! - Features should be grouped into their own modules with their own -//! traits. A common pattern is to have an associated type that implements -//! a trait, and an "extension" trait that associates that type with `KvEngine`, -//! which is part of `KvEngine's trait requirements. +//! - Features should be grouped into their own modules with their own traits. A +//! common pattern is to have an associated type that implements a trait, and +//! an "extension" trait that associates that type with `KvEngine`, which is +//! part of `KvEngine's trait requirements. //! //! - For now, for simplicity, all extension traits are required by `KvEngine`. //! In the future it may be feasible to separate them for engines with //! different feature sets. //! -//! - Associated types generally have the same name as the trait they -//! 
are required to implement. Engine extensions generally have the same -//! name suffixed with `Ext`. Concrete implementations usually have the -//! same name prefixed with the database name, i.e. `Rocks`. +//! - Associated types generally have the same name as the trait they are +//! required to implement. Engine extensions generally have the same name +//! suffixed with `Ext`. Concrete implementations usually have the same name +//! prefixed with the database name, i.e. `Rocks`. //! //! Example: //! @@ -121,9 +122,9 @@ //! use a standard new method). If future engines require factory methods, the //! traits can be converted then. //! -//! - Types that require a handle to the engine (or some other "parent" type) -//! do so with either Rc or Arc. An example is EngineIterator. The reason -//! for this is that associated types cannot contain lifetimes. That requires +//! - Types that require a handle to the engine (or some other "parent" type) do +//! so with either Rc or Arc. An example is EngineIterator. The reason for +//! this is that associated types cannot contain lifetimes. That requires //! "generic associated types". See //! //! - @@ -190,7 +191,7 @@ //! //! At the end of this phase the `engine` crate will be deleted. //! -//! ## 3) "Pulling up" the generic abstractions through TiKv +//! ## 3) "Pulling up" the generic abstractions through TiKV //! //! With all of TiKV using the `engine_traits` traits in conjunction with the //! concrete `engine_rocks` types, we can push generic type parameters up @@ -221,15 +222,15 @@ //! `RocksDB::from_ref` and `RocksDB::as_inner` methods. //! //! - Down follow the type system too far "down the rabbit hole". When you see -//! that another subsystem is blocking you from refactoring the system you -//! are trying to refactor, stop, stash your changes, and focus on the other +//! that another subsystem is blocking you from refactoring the system you are +//! 
trying to refactor, stop, stash your changes, and focus on the other //! system instead. //! //! - You will through away branches that lead to dead ends. Learn from the //! experience and try again from a different angle. //! -//! - For now, use the same APIs as the RocksDB bindings, as methods -//! on the various engine traits, and with this crate's error type. +//! - For now, use the same APIs as the RocksDB bindings, as methods on the +//! various engine traits, and with this crate's error type. //! //! - When new types are needed from the RocksDB API, add a new module, define a //! new trait (possibly with the same name as the RocksDB type), then define a @@ -239,10 +240,6 @@ //! it in engine_traits and engine_rocks, replacing all the callers with calls //! into the traits, then delete the versions in the `engine` crate. //! -//! - Use the .c() method from engine_rocks::compat::Compat to get a -//! KvEngine reference from Arc in the fewest characters. It also -//! works on Snapshot, and can be adapted to other types. -//! //! - Use `IntoOther` to adapt between error types of dependencies that are not //! themselves interdependent. E.g. raft::Error can be created from //! engine_traits::Error even though neither `raft` tor `engine_traits` know @@ -251,11 +248,17 @@ //! - "Plain old data" types in `engine` can be moved directly into //! `engine_traits` and reexported from `engine` to ease the transition. //! Likewise `engine_rocks` can temporarily call code from inside `engine`. +#![cfg_attr(test, feature(test))] #![feature(min_specialization)] #![feature(assert_matches)] +#![feature(linked_list_cursors)] +#![feature(let_chains)] +#![feature(str_split_as_str)] #[macro_use(fail_point)] extern crate fail; +#[cfg(test)] +extern crate test; // These modules contain traits that need to be implemented by engines, either // they are required by KvEngine or are an associated type of KvEngine. 
It is @@ -277,6 +280,8 @@ mod engine; pub use crate::engine::*; mod file_system; pub use crate::file_system::*; +mod flush; +pub use flush::*; mod import; pub use import::*; mod misc; @@ -294,6 +299,8 @@ mod sst_partitioner; pub use crate::sst_partitioner::*; mod range_properties; pub use crate::{mvcc_properties::*, range_properties::*}; +mod tablet; +pub use tablet::*; mod ttl_properties; pub use crate::ttl_properties::*; mod perf_context; @@ -302,6 +309,8 @@ mod flow_control_factors; pub use crate::flow_control_factors::*; mod table_properties; pub use crate::table_properties::*; +mod checkpoint; +pub use crate::checkpoint::*; // These modules contain more general traits, some of which may be implemented // by multiple types. @@ -331,7 +340,7 @@ pub use crate::range::*; mod raft_engine; pub use raft_engine::{ - CacheStats, RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch, RaftLogGCTask, + CacheStats, RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch, RAFT_LOG_MULTI_GET_CNT, }; diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index bc2c3a2b547..c2d317f529f 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -6,31 +6,65 @@ //! FIXME: Things here need to be moved elsewhere. use crate::{ - cf_names::CFNamesExt, errors::Result, flow_control_factors::FlowControlFactorsExt, range::Range, + cf_names::CfNamesExt, errors::Result, flow_control_factors::FlowControlFactorsExt, range::Range, }; #[derive(Clone, Debug)] pub enum DeleteStrategy { - /// Delete the SST files that are fullly fit in range. However, the SST files that are partially - /// overlapped with the range will not be touched. + /// Delete the SST files that are fullly fit in range. However, the SST + /// files that are partially overlapped with the range will not be + /// touched. + /// + /// Note: + /// - After this operation, some keys in the range might still exist in + /// the database. 
+ /// - After this operation, some keys in the range might be removed from + /// existing snapshot, so you shouldn't expect to be able to read data + /// from the range using existing snapshots any more. + /// + /// Ref: DeleteFiles, /// Delete the data stored in Titan. DeleteBlobs, - /// Scan for keys and then delete. Useful when we know the keys in range are not too many. + /// Scan for keys and then delete. Useful when we know the keys in range are + /// not too many. DeleteByKey, - /// Delete by range. Note that this is experimental and you should check whether it is enbaled - /// in config before using it. + /// Delete by range. Note that this is experimental and you should check + /// whether it is enbaled in config before using it. DeleteByRange, - /// Delete by ingesting a SST file with deletions. Useful when the number of ranges is too many. + /// Delete by ingesting a SST file with deletions. Useful when the number of + /// ranges is too many. DeleteByWriter { sst_path: String }, } -pub trait MiscExt: CFNamesExt + FlowControlFactorsExt { - fn flush(&self, sync: bool) -> Result<()>; +/// `StatisticsReporter` can be used to report engine's private statistics to +/// prometheus metrics. For one single engine, using it is equivalent to calling +/// `KvEngine::flush_metrics("name")`. For multiple engines, it can aggregate +/// statistics accordingly. +/// Note that it is not responsible for managing the statistics from +/// user-provided collectors that are potentially shared between engines. +pub trait StatisticsReporter { + fn new(name: &str) -> Self; + + /// Collect statistics from one single engine. + fn collect(&mut self, engine: &T); + + /// Aggregate and report statistics to prometheus metrics counters. The + /// statistics are not cleared afterwards. + fn flush(&mut self); +} + +pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { + type StatisticsReporter: StatisticsReporter; + + /// Flush all specified column families at once. 
+ /// + /// If `cfs` is empty, it will try to flush all available column families. + fn flush_cfs(&self, cfs: &[&str], wait: bool) -> Result<()>; - fn flush_cf(&self, cf: &str, sync: bool) -> Result<()>; + fn flush_cf(&self, cf: &str, wait: bool) -> Result<()>; - fn delete_all_in_range(&self, strategy: DeleteStrategy, ranges: &[Range<'_>]) -> Result<()> { + fn delete_ranges_cfs(&self, strategy: DeleteStrategy, ranges: &[Range<'_>]) -> Result<()> { for cf in self.cf_names() { self.delete_ranges_cf(cf, strategy.clone(), ranges)?; } @@ -44,37 +78,36 @@ pub trait MiscExt: CFNamesExt + FlowControlFactorsExt { ranges: &[Range<'_>], ) -> Result<()>; - /// Return the approximate number of records and size in the range of memtables of the cf. + /// Return the approximate number of records and size in the range of + /// memtables of the cf. fn get_approximate_memtable_stats_cf(&self, cf: &str, range: &Range<'_>) -> Result<(u64, u64)>; fn ingest_maybe_slowdown_writes(&self, cf: &str) -> Result; + fn get_sst_key_ranges(&self, cf: &str, level: usize) -> Result, Vec)>>; + /// Gets total used size of rocksdb engine, including: - /// * total size (bytes) of all SST files. - /// * total size (bytes) of active and unflushed immutable memtables. - /// * total size (bytes) of all blob files. - /// + /// * total size (bytes) of all SST files. + /// * total size (bytes) of active and unflushed immutable memtables. + /// * total size (bytes) of all blob files. fn get_engine_used_size(&self) -> Result; - /// Roughly deletes files in multiple ranges. - /// - /// Note: - /// - After this operation, some keys in the range might still exist in the database. - /// - After this operation, some keys in the range might be removed from existing snapshot, - /// so you shouldn't expect to be able to read data from the range using existing snapshots - /// any more. 
- /// - /// Ref: - fn roughly_cleanup_ranges(&self, ranges: &[(Vec, Vec)]) -> Result<()>; - /// The path to the directory on the filesystem where the database is stored fn path(&self) -> &str; fn sync_wal(&self) -> Result<()>; + /// Depending on the implementation, some on-going manual compactions may be + /// aborted. + fn pause_background_work(&self) -> Result<()>; + + fn continue_background_work(&self) -> Result<()>; + /// Check whether a database exists at a given path fn exists(path: &str) -> bool; + fn locked(path: &str) -> Result; + /// Dump stats about the database into a string. /// /// For debugging. The format and content is unspecified. @@ -86,6 +119,8 @@ pub trait MiscExt: CFNamesExt + FlowControlFactorsExt { fn get_total_sst_files_size_cf(&self, cf: &str) -> Result>; + fn get_num_keys(&self) -> Result; + fn get_range_entries_and_versions( &self, cf: &str, diff --git a/components/engine_traits/src/options.rs b/components/engine_traits/src/options.rs index 563bf24f206..04500407d90 100644 --- a/components/engine_traits/src/options.rs +++ b/components/engine_traits/src/options.rs @@ -34,6 +34,7 @@ impl Default for ReadOptions { pub struct WriteOptions { sync: bool, no_slowdown: bool, + disable_wal: bool, } impl WriteOptions { @@ -41,6 +42,7 @@ impl WriteOptions { WriteOptions { sync: false, no_slowdown: false, + disable_wal: false, } } @@ -59,6 +61,14 @@ impl WriteOptions { pub fn no_slowdown(&self) -> bool { self.no_slowdown } + + pub fn set_disable_wal(&mut self, disable_wal: bool) { + self.disable_wal = disable_wal; + } + + pub fn disable_wal(&self) -> bool { + self.disable_wal + } } #[derive(Clone, PartialEq)] diff --git a/components/engine_traits/src/peekable.rs b/components/engine_traits/src/peekable.rs index 7550568396c..fe9e3600abe 100644 --- a/components/engine_traits/src/peekable.rs +++ b/components/engine_traits/src/peekable.rs @@ -10,16 +10,17 @@ use crate::*; /// to read from, or to encode the value as a protobuf message. 
pub trait Peekable { /// The byte-vector type through which the database returns read values. - type DBVector: DBVector; + type DbVector: DbVector; /// Read a value for a key, given a set of options. /// /// Reads from the default column family. /// /// Returns `None` if they key does not exist. - fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result>; + fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result>; - /// Read a value for a key from a given column family, given a set of options. + /// Read a value for a key from a given column family, given a set of + /// options. /// /// Returns `None` if the key does not exist. fn get_value_cf_opt( @@ -27,14 +28,14 @@ pub trait Peekable { opts: &ReadOptions, cf: &str, key: &[u8], - ) -> Result>; + ) -> Result>; /// Read a value for a key. /// /// Uses the default options and column family. /// /// Returns `None` if the key does not exist. - fn get_value(&self, key: &[u8]) -> Result> { + fn get_value(&self, key: &[u8]) -> Result> { self.get_value_opt(&ReadOptions::default(), key) } @@ -43,7 +44,7 @@ pub trait Peekable { /// Uses the default options. /// /// Returns `None` if the key does not exist. - fn get_value_cf(&self, cf: &str, key: &[u8]) -> Result> { + fn get_value_cf(&self, cf: &str, key: &[u8]) -> Result> { self.get_value_cf_opt(&ReadOptions::default(), cf, key) } diff --git a/components/engine_traits/src/perf_context.rs b/components/engine_traits/src/perf_context.rs index f213925ddbd..44462e3fe3c 100644 --- a/components/engine_traits/src/perf_context.rs +++ b/components/engine_traits/src/perf_context.rs @@ -1,5 +1,6 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
use tikv_util::numeric_enum_serializing_mod; +use tracker::TrackerToken; #[derive(Copy, Clone, Debug, PartialEq)] pub enum PerfLevel { @@ -7,7 +8,7 @@ pub enum PerfLevel { Disable, EnableCount, EnableTimeExceptForMutex, - EnableTimeAndCPUTimeExceptForMutex, + EnableTimeAndCpuTimeExceptForMutex, EnableTime, OutOfBounds, } @@ -17,7 +18,7 @@ numeric_enum_serializing_mod! {perf_level_serde PerfLevel { Disable = 1, EnableCount = 2, EnableTimeExceptForMutex = 3, - EnableTimeAndCPUTimeExceptForMutex = 4, + EnableTimeAndCpuTimeExceptForMutex = 4, EnableTime = 5, OutOfBounds = 6, }} @@ -36,18 +37,22 @@ numeric_enum_serializing_mod! {perf_level_serde PerfLevel { pub trait PerfContextExt { type PerfContext: PerfContext; - fn get_perf_context(&self, level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext; + fn get_perf_context(level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext; } /// The subsystem the PerfContext is being created for. /// /// This is a leaky abstraction that supports the encapsulation of metrics /// reporting by the subsystems that use PerfContext. -#[derive(Eq, PartialEq, Copy, Clone, Debug)] +#[derive(PartialEq, Copy, Clone, Debug)] pub enum PerfContextKind { RaftstoreApply, RaftstoreStore, - GenericRead, + /// Commands in tikv::storage, the inner str is the command tag. + Storage(&'static str), + /// Coprocessor requests in tikv::coprocessor, the inner str is the request + /// type. 
+ Coprocessor(&'static str), } /// Reports metrics to prometheus @@ -58,6 +63,6 @@ pub trait PerfContext: Send { /// Reinitializes statistics and the perf level fn start_observe(&mut self); - /// Reports the current collected metrics to prometheus - fn report_metrics(&mut self); + /// Reports the current collected metrics to prometheus and trackers + fn report_metrics(&mut self, trackers: &[TrackerToken]); } diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index a0697218cf7..671fed8b3cf 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -1,6 +1,11 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use kvproto::raft_serverpb::RaftLocalState; +use kvproto::{ + metapb::Region, + raft_serverpb::{ + RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent, StoreRecoverState, + }, +}; use raft::eraftpb::Entry; use crate::*; @@ -8,7 +13,28 @@ use crate::*; pub const RAFT_LOG_MULTI_GET_CNT: u64 = 8; pub trait RaftEngineReadOnly: Sync + Send + 'static { + fn is_empty(&self) -> Result; + + fn get_store_ident(&self) -> Result>; + fn get_prepare_bootstrap_region(&self) -> Result>; + fn get_raft_state(&self, raft_group_id: u64) -> Result>; + /// Get the latest region state not after the apply index. + fn get_region_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result>; + /// Get the latest apply state not after the apply index. + fn get_apply_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result>; + /// Get the flushed index of the given CF. 
+ fn get_flushed_index(&self, raft_group_id: u64, cf: &str) -> Result>; + fn get_dirty_mark(&self, raft_group_id: u64, tablet_index: u64) -> Result; + fn get_recover_state(&self) -> Result>; fn get_entry(&self, raft_group_id: u64, index: u64) -> Result>; @@ -41,7 +67,7 @@ pub trait RaftEngineDebug: RaftEngine + Sync + Send + 'static { Ok(true) }) .unwrap(); - batch.append(region_id, entries).unwrap(); + batch.append(region_id, None, entries).unwrap(); if let Some(state) = self.get_raft_state(region_id).unwrap() { batch.put_raft_state(region_id, &state).unwrap(); } @@ -49,13 +75,8 @@ pub trait RaftEngineDebug: RaftEngine + Sync + Send + 'static { } } -pub struct RaftLogGCTask { - pub raft_group_id: u64, - pub from: u64, - pub to: u64, -} - -pub trait RaftEngine: RaftEngineReadOnly + Clone + Sync + Send + 'static { +// TODO: Refactor common methods between Kv and Raft engine into a shared trait. +pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send + 'static { type LogBatch: RaftLogBatch; fn log_batch(&self, capacity: usize) -> Self::LogBatch; @@ -84,58 +105,111 @@ pub trait RaftEngine: RaftEngineReadOnly + Clone + Sync + Send + 'static { batch: &mut Self::LogBatch, ) -> Result<()>; - /// Append some log entries and return written bytes. - /// - /// Note: `RaftLocalState` won't be updated in this call. - fn append(&self, raft_group_id: u64, entries: Vec) -> Result; - - fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()>; + /// Like `cut_logs` but the range could be very large. + fn gc(&self, raft_group_id: u64, from: u64, to: u64, batch: &mut Self::LogBatch) -> Result<()>; - /// Like `cut_logs` but the range could be very large. Return the deleted count. - /// Generally, `from` can be passed in `0`. - fn gc(&self, raft_group_id: u64, from: u64, to: u64) -> Result; + /// Delete all but the latest one of states that are associated with smaller + /// apply_index. 
+ fn delete_all_but_one_states_before( + &self, + raft_group_id: u64, + apply_index: u64, + batch: &mut Self::LogBatch, + ) -> Result<()>; - fn batch_gc(&self, tasks: Vec) -> Result { - let mut total = 0; - for task in tasks { - total += self.gc(task.raft_group_id, task.from, task.to)?; - } - Ok(total) + fn need_manual_purge(&self) -> bool { + false } /// Purge expired logs files and return a set of Raft group ids /// which needs to be compacted ASAP. - fn purge_expired_files(&self) -> Result>; - - /// The `RaftEngine` has a builtin entry cache or not. - fn has_builtin_entry_cache(&self) -> bool { - false + fn manual_purge(&self) -> Result> { + unimplemented!() } - /// GC the builtin entry cache. - fn gc_entry_cache(&self, _raft_group_id: u64, _to: u64) {} - fn flush_metrics(&self, _instance: &str) {} fn flush_stats(&self) -> Option { None } - fn reset_statistics(&self) {} fn stop(&self) {} fn dump_stats(&self) -> Result; fn get_engine_size(&self) -> Result; + + /// The path to the directory on the filesystem where the raft log is stored + fn get_engine_path(&self) -> &str; + + /// Visit all available raft groups. + /// + /// If any error is returned, the iteration will stop. + fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> + where + F: FnMut(u64) -> std::result::Result<(), E>, + E: From; } pub trait RaftLogBatch: Send { - /// Note: `RaftLocalState` won't be updated in this call. - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()>; + /// Append continuous entries to the batch. + /// + /// All existing entries with same index will be overwritten. If + /// `overwrite_to` is set to a larger value, then entries in + /// `[entries.last().get_index(), overwrite_to)` will be deleted. + /// Nothing will be deleted if entries is empty. Note: `RaftLocalState` + /// won't be updated in this call. 
+ fn append( + &mut self, + raft_group_id: u64, + overwrite_to: Option, + entries: Vec, + ) -> Result<()>; + + fn put_store_ident(&mut self, ident: &StoreIdent) -> Result<()>; - /// Remove Raft logs in [`from`, `to`) which will be overwritten later. - fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64); + fn put_prepare_bootstrap_region(&mut self, region: &Region) -> Result<()>; + fn remove_prepare_bootstrap_region(&mut self) -> Result<()>; fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()>; + fn put_region_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RegionLocalState, + ) -> Result<()>; + fn put_apply_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RaftApplyState, + ) -> Result<()>; + + /// Record the flushed apply index. + /// + /// There are two types of apply index: + /// 1. Normal apply index that only related to single tablet. These apply + /// indexes are recorded using its own CF. + /// 2. Apply index that can affect other tablets, like split, merge. These + /// apply indexes are recorded using special Raft CF. + /// + /// Because a peer may have multiple tablets (only one is latest), we use + /// `tablet_index` to avoid conflicts. + fn put_flushed_index( + &mut self, + raft_group_id: u64, + cf: &str, + tablet_index: u64, + apply_index: u64, + ) -> Result<()>; + + /// Mark a tablet may contain data that is not supposed to be in its range. + fn put_dirty_mark(&mut self, raft_group_id: u64, tablet_index: u64, dirty: bool) -> Result<()>; + + /// Indicate whether region states should be recovered from raftdb and + /// replay raft logs. + /// When kvdb's write-ahead-log is disabled, the sequence number of the last + /// boot time is saved. + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()>; /// The data size of this RaftLogBatch. 
fn persist_size(&self) -> usize; diff --git a/components/engine_traits/src/range_properties.rs b/components/engine_traits/src/range_properties.rs index 8c326bd41c7..f97008dd929 100644 --- a/components/engine_traits/src/range_properties.rs +++ b/components/engine_traits/src/range_properties.rs @@ -32,7 +32,8 @@ pub trait RangePropertiesExt { large_threshold: u64, ) -> Result; - /// Get range approximate split keys to split range evenly into key_count + 1 parts . + /// Get range approximate split keys to split range evenly into key_count + + /// 1 parts . fn get_range_approximate_split_keys( &self, range: Range<'_>, diff --git a/components/engine_traits/src/snapshot.rs b/components/engine_traits/src/snapshot.rs index 93ef451209c..a5829161e25 100644 --- a/components/engine_traits/src/snapshot.rs +++ b/components/engine_traits/src/snapshot.rs @@ -2,7 +2,7 @@ use std::fmt::Debug; -use crate::{iterable::Iterable, peekable::Peekable}; +use crate::{iterable::Iterable, peekable::Peekable, CfNamesExt}; /// A consistent read-only view of the database. /// @@ -10,7 +10,6 @@ use crate::{iterable::Iterable, peekable::Peekable}; /// clonable, call `into_sync` to create a `SyncSnapshot`. pub trait Snapshot where - Self: 'static + Peekable + Iterable + Send + Sync + Sized + Debug, + Self: 'static + Peekable + Iterable + CfNamesExt + Send + Sync + Sized + Debug, { - fn cf_names(&self) -> Vec<&str>; } diff --git a/components/engine_traits/src/sst.rs b/components/engine_traits/src/sst.rs index fb37c918886..ea08df3bb50 100644 --- a/components/engine_traits/src/sst.rs +++ b/components/engine_traits/src/sst.rs @@ -4,7 +4,7 @@ use std::path::PathBuf; use kvproto::import_sstpb::SstMeta; -use crate::{errors::Result, iterable::Iterable}; +use crate::{errors::Result, RefIterable}; #[derive(Clone, Debug)] pub struct SstMetaInfo { @@ -20,17 +20,15 @@ pub trait SstExt: Sized { } /// SstReader is used to read an SST file. 
-pub trait SstReader: Iterable + Sized { +pub trait SstReader: RefIterable + Sized { fn open(path: &str) -> Result; fn verify_checksum(&self) -> Result<()>; - // FIXME: Shouldn't this me a method on Iterable? - fn iter(&self) -> Self::Iterator; } /// SstWriter is used to create sst files that can be added to database later. pub trait SstWriter: Send { type ExternalSstFileInfo: ExternalSstFileInfo; - type ExternalSstFileReader: std::io::Read; + type ExternalSstFileReader: std::io::Read + Send; /// Add key, value to currently opened file /// REQUIRES: key is after any previously added key according to comparator. diff --git a/components/engine_traits/src/sst_partitioner.rs b/components/engine_traits/src/sst_partitioner.rs index faedd4efb8b..bc6ec13a4eb 100644 --- a/components/engine_traits/src/sst_partitioner.rs +++ b/components/engine_traits/src/sst_partitioner.rs @@ -2,20 +2,20 @@ use std::ffi::CString; -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq)] pub struct SstPartitionerRequest<'a> { pub prev_user_key: &'a [u8], pub current_user_key: &'a [u8], pub current_output_file_size: u64, } -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq)] pub enum SstPartitionerResult { NotRequired, Required, } -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq)] pub struct SstPartitionerContext<'a> { pub is_full_compaction: bool, pub is_manual_compaction: bool, @@ -30,8 +30,8 @@ pub trait SstPartitioner { } pub trait SstPartitionerFactory: Sync + Send { - // Lifetime of the partitioner can be changed to be bounded by the factory's lifetime once - // generic associated types is supported. + // Lifetime of the partitioner can be changed to be bounded by the factory's + // lifetime once generic associated types is supported. 
// https://github.com/rust-lang/rfcs/blob/master/text/1598-generic_associated_types.md type Partitioner: SstPartitioner + 'static; diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs new file mode 100644 index 00000000000..14f7d186f76 --- /dev/null +++ b/components/engine_traits/src/tablet.rs @@ -0,0 +1,493 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + fmt::{self, Debug, Formatter}, + path::{Path, PathBuf}, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, +}; + +use collections::HashMap; +use kvproto::metapb::Region; +use tikv_util::box_err; + +#[cfg(any(test, feature = "testexport"))] +use crate::StateStorage; +use crate::{Error, FlushState, Result}; + +#[derive(Debug)] +struct LatestTablet { + data: Mutex>, + version: AtomicU64, +} + +/// Tablet may change during split, merge and applying snapshot. So we need a +/// shared value to reflect the latest tablet. `CachedTablet` provide cache that +/// can speed up common access. +#[derive(Clone, Debug)] +pub struct CachedTablet { + latest: Arc>, + cache: Option, + version: u64, +} + +impl CachedTablet { + fn release(&mut self) { + self.cache = None; + self.version = 0; + } +} + +impl CachedTablet { + #[inline] + fn new(data: Option) -> Self { + CachedTablet { + latest: Arc::new(LatestTablet { + data: Mutex::new(data.clone()), + version: AtomicU64::new(1), + }), + cache: data, + // We use 0 in release, so it needs to be initialized to 1. + version: 1, + } + } + + pub fn set(&mut self, data: EK) -> Option { + self.cache = Some(data.clone()); + let mut latest_data = self.latest.data.lock().unwrap(); + self.version = self.latest.version.fetch_add(1, Ordering::Relaxed) + 1; + latest_data.replace(data) + } + + /// Get the tablet from cache without checking if it's up to date. + #[inline] + pub fn cache(&self) -> Option<&EK> { + self.cache.as_ref() + } + + /// Get the latest tablet.
+ #[inline] + pub fn latest(&mut self) -> Option<&EK> { + if self.latest.version.load(Ordering::Relaxed) > self.version { + let latest_data = self.latest.data.lock().unwrap(); + self.version = self.latest.version.load(Ordering::Relaxed); + self.cache = latest_data.clone(); + } + self.cache() + } +} + +/// Context to be passed to `TabletFactory`. +#[derive(Clone)] +pub struct TabletContext { + /// ID of the tablet. It is usually the region ID. + pub id: u64, + /// Suffix of the tablet. It is usually the index at which the tablet starts + /// accepting incremental modifications. The reason to have suffix is that we can keep + /// more than one tablet for a region. + pub suffix: Option, + /// The expected start key of the tablet. The key should be in the format + /// tablet is actually stored, for example should have `z` prefix. + /// + /// Any key that is smaller than this key can be considered obsolete. + pub start_key: Box<[u8]>, + /// The expected end key of the tablet. The key should be in the format + /// tablet is actually stored, for example should have `z` prefix. + /// + /// Any key that is larger than or equal to this key can be considered + /// obsolete. + pub end_key: Box<[u8]>, + /// The states to be persisted when flush is triggered. + /// + /// If not set, apply may not be resumed correctly.
+ pub flush_state: Option>, +} + +impl Debug for TabletContext { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("TabletContext") + .field("id", &self.id) + .field("suffix", &self.suffix) + .field("start_key", &log_wrappers::Value::key(&self.start_key)) + .field("end_key", &log_wrappers::Value::key(&self.end_key)) + .finish() + } +} + +impl TabletContext { + pub fn new(region: &Region, suffix: Option) -> Self { + TabletContext { + id: region.get_id(), + suffix, + start_key: keys::data_key(region.get_start_key()).into_boxed_slice(), + end_key: keys::data_end_key(region.get_end_key()).into_boxed_slice(), + flush_state: None, + } + } + + /// Create a context that assumes there is only one region and it covers the + /// whole key space. Normally you should only use this in tests. + pub fn with_infinite_region(id: u64, suffix: Option) -> Self { + let mut region = Region::default(); + region.set_id(id); + Self::new(&region, suffix) + } +} + +/// A factory trait to create new tablet for multi-rocksdb architecture. +// It should be named as `EngineFactory` for consistency, but we are about to +// rename engine to tablet, so always use tablet for new traits/types. +pub trait TabletFactory: Send + Sync { + /// Open the tablet in `path`. + fn open_tablet(&self, ctx: TabletContext, path: &Path) -> Result; + + /// Destroy the tablet and its data + fn destroy_tablet(&self, ctx: TabletContext, path: &Path) -> Result<()>; + + /// Check if the tablet with specified path exists + fn exists(&self, path: &Path) -> bool; + + #[cfg(feature = "testexport")] + fn set_state_storage(&self, _: Arc) { + unimplemented!() + } +} + +pub struct SingletonFactory { + tablet: EK, +} + +impl SingletonFactory { + pub fn new(tablet: EK) -> Self { + SingletonFactory { tablet } + } +} + +impl TabletFactory for SingletonFactory { + /// Open the tablet in `path`. + /// + /// `id` and `suffix` is used to mark the identity of tablet.
The id is + /// likely the region Id, the suffix could be the current raft log + /// index. The reason to have suffix is that we can keep more than one + /// tablet for a region. + fn open_tablet(&self, _ctx: TabletContext, _path: &Path) -> Result { + Ok(self.tablet.clone()) + } + + /// Destroy the tablet and its data + fn destroy_tablet(&self, _ctx: TabletContext, _path: &Path) -> Result<()> { + Ok(()) + } + + /// Check if the tablet with specified path exists + fn exists(&self, _path: &Path) -> bool { + true + } +} + +/// A global registry for all tablets. +struct TabletRegistryInner { + // region_id, suffix -> tablet + tablets: Mutex>>, + factory: Box>, + root: PathBuf, +} + +pub struct TabletRegistry { + // One may consider to add cache to speed up access. But it also makes it more + // difficult to gc stale cache. + tablets: Arc>, +} + +impl Clone for TabletRegistry { + fn clone(&self) -> Self { + Self { + tablets: self.tablets.clone(), + } + } +} + +impl TabletRegistry { + pub fn new(factory: Box>, path: impl Into) -> Result { + let root = path.into(); + std::fs::create_dir_all(&root)?; + Ok(TabletRegistry { + tablets: Arc::new(TabletRegistryInner { + tablets: Mutex::new(HashMap::default()), + factory, + root, + }), + }) + } + + /// Format the name as {prefix}_{id}_{suffix}. If prefix is empty, it will + /// be format as {id}_{suffix}. + pub fn tablet_name(&self, prefix: &str, id: u64, suffix: u64) -> String { + format!( + "{}{:_(&self, path: &'a Path) -> Option<(&'a str, u64, u64)> { + let name = path.file_name().unwrap().to_str().unwrap(); + let mut parts = name.rsplit('_'); + let suffix = parts.next()?.parse().ok()?; + let id = parts.next()?.parse().ok()?; + let prefix = parts.as_str(); + Some((prefix, id, suffix)) + } + + pub fn tablet_root(&self) -> &Path { + &self.tablets.root + } + + pub fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf { + let name = self.tablet_name("", id, suffix); + self.tablets.root.join(name) + } + + /// Gets a tablet. 
+ pub fn get(&self, id: u64) -> Option> + where + EK: Clone, + { + let tablets = self.tablets.tablets.lock().unwrap(); + tablets.get(&id).cloned() + } + + /// Gets a tablet, create a default one if it doesn't exist. + pub fn get_or_default(&self, id: u64) -> CachedTablet + where + EK: Clone, + { + let mut tablets = self.tablets.tablets.lock().unwrap(); + tablets + .entry(id) + .or_insert_with(|| CachedTablet::new(None)) + .clone() + } + + pub fn tablet_factory(&self) -> &dyn TabletFactory { + self.tablets.factory.as_ref() + } + + pub fn remove(&self, id: u64) { + self.tablets.tablets.lock().unwrap().remove(&id); + } + + /// Load the tablet and set it as the latest. + /// + /// If the tablet doesn't exist, it will create an empty one. + pub fn load(&self, ctx: TabletContext, create: bool) -> Result> + where + EK: Clone, + { + assert!(ctx.suffix.is_some()); + let id = ctx.id; + let path = self.tablet_path(id, ctx.suffix.unwrap()); + if !create && !self.tablets.factory.exists(&path) { + return Err(Error::Other(box_err!( + "tablet ({}, {:?}) doesn't exist", + id, + ctx.suffix + ))); + } + // TODO: use compaction filter to trim range. + let tablet = self.tablets.factory.open_tablet(ctx, &path)?; + let mut cached = self.get_or_default(id); + cached.set(tablet); + Ok(cached) + } + + /// Loop over all opened tablets. Note, it's possible that the visited + /// tablet is not the latest one. If latest one is required, you may + /// either: + /// - loop several times to make it likely to visit all tablets. + /// - send commands to fsms instead, which can guarantee latest tablet is + /// visited.
+ pub fn for_each_opened_tablet(&self, mut f: impl FnMut(u64, &mut CachedTablet) -> bool) { + let mut tablets = self.tablets.tablets.lock().unwrap(); + for (id, tablet) in tablets.iter_mut() { + if !f(*id, tablet) { + tablet.release(); + return; + } + tablet.release(); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_cached_tablet() { + let mut cached_tablet = CachedTablet::new(None); + assert_eq!(cached_tablet.cache(), None); + assert_eq!(cached_tablet.latest(), None); + + cached_tablet = CachedTablet::new(Some(1)); + assert_eq!(cached_tablet.cache().cloned(), Some(1)); + assert_eq!(cached_tablet.latest().cloned(), Some(1)); + + // Setting tablet will refresh cache immediately. + cached_tablet.set(2); + assert_eq!(cached_tablet.cache().cloned(), Some(2)); + + // Test `latest()` will use cache. + // Unsafe modify the data. + let old_data = *cached_tablet.latest.data.lock().unwrap(); + *cached_tablet.latest.data.lock().unwrap() = Some(0); + assert_eq!(cached_tablet.latest().cloned(), old_data); + // Restore the data. + *cached_tablet.latest.data.lock().unwrap() = old_data; + + let mut cloned = cached_tablet.clone(); + // Clone should reuse cache. + assert_eq!(cloned.cache().cloned(), Some(2)); + cloned.set(1); + assert_eq!(cloned.cache().cloned(), Some(1)); + assert_eq!(cloned.latest().cloned(), Some(1)); + + // Local cache won't be refreshed until querying latest. 
+ assert_eq!(cached_tablet.cache().cloned(), Some(2)); + assert_eq!(cached_tablet.latest().cloned(), Some(1)); + assert_eq!(cached_tablet.cache().cloned(), Some(1)); + } + + #[test] + fn test_singleton_factory() { + let tablet = Arc::new(1); + let singleton = SingletonFactory::new(tablet.clone()); + let registry = TabletRegistry::new(Box::new(singleton), "").unwrap(); + let mut ctx = TabletContext::with_infinite_region(1, Some(1)); + registry.load(ctx.clone(), true).unwrap(); + let mut cached = registry.get(1).unwrap(); + assert_eq!(cached.latest().cloned(), Some(tablet.clone())); + + ctx.id = 2; + registry.load(ctx.clone(), true).unwrap(); + let mut count = 0; + registry.for_each_opened_tablet(|id, cached| { + assert!(&[1, 2].contains(&id), "{}", id); + assert_eq!(cached.latest().cloned(), Some(tablet.clone())); + count += 1; + true + }); + assert_eq!(count, 2); + + // Destroy should be ignored. + registry + .tablet_factory() + .destroy_tablet(ctx.clone(), &registry.tablet_path(2, 1)) + .unwrap(); + + // Exist check should always succeed.
+ ctx.id = 3; + registry.load(ctx, false).unwrap(); + let mut cached = registry.get(3).unwrap(); + assert_eq!(cached.latest().cloned(), Some(tablet)); + } + + type Record = Arc<(u64, u64)>; + + struct MemoryTablet { + tablet: Mutex>, + } + + impl TabletFactory for MemoryTablet { + fn open_tablet(&self, ctx: TabletContext, path: &Path) -> Result { + let mut tablet = self.tablet.lock().unwrap(); + if tablet.contains_key(path) { + return Err(Error::Other(box_err!("tablet is opened"))); + } + tablet.insert(path.to_owned(), Arc::new((ctx.id, ctx.suffix.unwrap_or(0)))); + Ok(tablet[path].clone()) + } + + fn exists(&self, path: &Path) -> bool { + let tablet = self.tablet.lock().unwrap(); + tablet.contains_key(path) + } + + fn destroy_tablet(&self, ctx: TabletContext, path: &Path) -> Result<()> { + let prev = self.tablet.lock().unwrap().remove(path).unwrap(); + assert_eq!((ctx.id, ctx.suffix.unwrap_or(0)), *prev); + Ok(()) + } + } + + #[test] + fn test_tablet_registry() { + let factory = MemoryTablet { + tablet: Mutex::new(HashMap::default()), + }; + let registry = TabletRegistry::new(Box::new(factory), "").unwrap(); + + let mut ctx = TabletContext::with_infinite_region(1, Some(10)); + let mut tablet_1_10 = registry.load(ctx.clone(), true).unwrap(); + // It's open already, load it twice should report lock error. + registry.load(ctx.clone(), true).unwrap_err(); + let mut cached = registry.get(1).unwrap(); + assert_eq!(cached.latest(), tablet_1_10.latest()); + + let tablet_path = registry.tablet_path(1, 10); + assert!(registry.tablet_factory().exists(&tablet_path)); + + let tablet_path = registry.tablet_path(1, 11); + assert!(!registry.tablet_factory().exists(&tablet_path)); + // Not exist tablet should report error. + ctx.suffix = Some(11); + registry.load(ctx.clone(), false).unwrap_err(); + assert!(registry.get(2).is_none()); + // Though path not exist, but we should be able to create an empty one. 
+ assert_eq!(registry.get_or_default(2).latest(), None); + assert!(!registry.tablet_factory().exists(&tablet_path)); + + // Load new suffix should update cache. + registry.load(ctx, true).unwrap(); + assert_ne!(cached.latest(), tablet_1_10.cache()); + let tablet_path = registry.tablet_path(1, 11); + assert!(registry.tablet_factory().exists(&tablet_path)); + + let mut count = 0; + registry.for_each_opened_tablet(|_, _| { + count += 1; + true + }); + assert_eq!(count, 2); + + registry.remove(2); + assert!(registry.get(2).is_none()); + count = 0; + registry.for_each_opened_tablet(|_, _| { + count += 1; + true + }); + assert_eq!(count, 1); + + let name = registry.tablet_name("prefix", 12, 30); + assert_eq!(name, "prefix_12_30"); + let normal_name = registry.tablet_name("", 20, 15); + let normal_tablet_path = registry.tablet_path(20, 15); + assert_eq!(registry.tablet_root().join(normal_name), normal_tablet_path); + + let full_prefix_path = registry.tablet_root().join(name); + let res = registry.parse_tablet_name(&full_prefix_path); + assert_eq!(res, Some(("prefix", 12, 30))); + let res = registry.parse_tablet_name(&normal_tablet_path); + assert_eq!(res, Some(("", 20, 15))); + let invalid_path = registry.tablet_root().join("invalid_12"); + let res = registry.parse_tablet_name(&invalid_path); + assert_eq!(res, None); + } +} diff --git a/components/engine_traits/src/util.rs b/components/engine_traits/src/util.rs index dc1c187d3cb..a947ac3fe5b 100644 --- a/components/engine_traits/src/util.rs +++ b/components/engine_traits/src/util.rs @@ -1,5 +1,11 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. +use std::{ + cmp, + collections::{BTreeMap, VecDeque}, + sync::atomic::{AtomicU64, Ordering}, +}; + use super::{Error, Result}; /// Check if key in range [`start_key`, `end_key`). @@ -21,3 +27,227 @@ pub fn check_key_in_range( }) } } + +/// An auxiliary counter to determine write order. 
Unlike sequence number, it is +/// guaranteed to be allocated contiguously. +static WRITE_COUNTER_ALLOCATOR: AtomicU64 = AtomicU64::new(0); +/// Every time the active memtable is switched, this version should be increased. +static MEMTABLE_VERSION_COUNTER_ALLOCATOR: AtomicU64 = AtomicU64::new(0); +/// Max sequence number that was synced and persisted. +static MAX_SYNCED_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0); + +pub fn max_synced_sequence_number() -> u64 { + MAX_SYNCED_SEQUENCE_NUMBER.load(Ordering::SeqCst) +} + +pub fn current_memtable_version() -> u64 { + MEMTABLE_VERSION_COUNTER_ALLOCATOR.load(Ordering::SeqCst) +} + +pub trait MemtableEventNotifier: Send { + fn notify_memtable_sealed(&self, seqno: u64); + fn notify_memtable_flushed(&self, seqno: u64); +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct SequenceNumber { + number: u64, + // Version is actually the counter of memtables flushed. Once the version increased, indicates + // a memtable was flushed, then relations of a region buffered in memory could be merged into + // one and persisted into raftdb. + memtable_version: u64, + // start_counter is an identity of a write. It's used to check if all seqno of writes were + // received in the receiving end. + start_counter: u64, + // end_counter is the value of sequence number counter after a write. It's used for finding + // corresponding seqno of a counter. The corresponding seqno may be smaller or equal to the + // latest seqno at the time of end_counter generated.
+ end_counter: u64, +} + +impl SequenceNumber { + pub fn pre_write() -> Self { + SequenceNumber { + number: 0, + start_counter: WRITE_COUNTER_ALLOCATOR.fetch_add(1, Ordering::SeqCst) + 1, + end_counter: 0, + memtable_version: 0, + } + } + + pub fn post_write(&mut self, number: u64) { + self.number = number; + self.end_counter = WRITE_COUNTER_ALLOCATOR.load(Ordering::SeqCst); + } + + pub fn max(left: Self, right: Self) -> Self { + cmp::max_by_key(left, right, |s| s.number) + } + + pub fn get_number(&self) -> u64 { + self.number + } + + pub fn get_version(&self) -> u64 { + self.memtable_version + } +} + +/// Receive all seqno and their counters, check the last committed seqno (a +/// seqno is considered committed if all `start_counter` before its +/// `end_counter` was received), and return the largest sequence number +/// received. +#[derive(Default)] +pub struct SequenceNumberWindow { + // Status of writes with start_counter starting from ack_start_counter+1. + write_status_window: VecDeque, + // writes with start_counter <= ack_start_counter are all committed. + ack_start_counter: u64, + // (end_counter, sequence number) + pending_sequence: BTreeMap, + // max corresponding sequence number before ack_start_counter. + committed_seqno: u64, + max_received_seqno: u64, +} + +impl SequenceNumberWindow { + pub fn push(&mut self, sn: SequenceNumber) { + // start_delta - 1 is the index of `write_status_window`. + let start_delta = match sn.start_counter.checked_sub(self.ack_start_counter) { + Some(delta) if delta > 0 => delta as usize, + _ => { + assert!(sn.number <= self.max_received_seqno); + return; + } + }; + self.max_received_seqno = u64::max(sn.number, self.max_received_seqno); + // Increase the length of `write_status_window` + if start_delta > self.write_status_window.len() { + self.write_status_window.resize(start_delta, false); + } + // Insert the seqno of `pending_sequence`. 
Because an `end_counter` + // may correspond to multiple seqno, we only keep the max seqno. + self.pending_sequence + .entry(sn.end_counter) + .and_modify(|value| { + *value = SequenceNumber::max(*value, sn); + }) + .or_insert(sn); + self.write_status_window[start_delta - 1] = true; + if start_delta != 1 { + return; + } + // Commit seqno of the counter which all smaller counter were received. + let mut acks = 0; + for received in self.write_status_window.iter() { + if *received { + acks += 1; + } else { + break; + } + } + self.write_status_window.drain(..acks); + self.ack_start_counter += acks as u64; + let mut sequences = self + .pending_sequence + .split_off(&(self.ack_start_counter + 1)); + std::mem::swap(&mut sequences, &mut self.pending_sequence); + if let Some(sequence) = sequences.values().max() { + assert!( + self.committed_seqno <= sequence.number, + "committed_seqno {}, seqno{}", + self.committed_seqno, + sequence.number + ); + self.committed_seqno = sequence.number; + } + } + + pub fn committed_seqno(&self) -> u64 { + self.committed_seqno + } + + pub fn pending_count(&self) -> usize { + self.write_status_window.len() + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use test::Bencher; + + use super::*; + + #[test] + fn test_sequence_number_window() { + let mut window = SequenceNumberWindow::default(); + let mut sn1 = SequenceNumber::pre_write(); + sn1.post_write(1); + window.push(sn1); + assert_eq!(window.committed_seqno(), 1); + let mut sn2 = SequenceNumber::pre_write(); + let mut sn3 = SequenceNumber::pre_write(); + let mut sn4 = SequenceNumber::pre_write(); + let mut sn5 = SequenceNumber::pre_write(); + sn5.post_write(3); + sn2.post_write(5); + sn3.post_write(2); + sn4.post_write(4); + window.push(sn2); + assert_eq!(window.committed_seqno(), 1); + window.push(sn5); + assert_eq!(window.committed_seqno(), 1); + window.push(sn3); + assert_eq!(window.committed_seqno(), 1); + window.push(sn4); + assert_eq!(window.committed_seqno(), 5); + let 
mut sn6 = SequenceNumber::pre_write(); + let mut sn7 = SequenceNumber::pre_write(); + sn6.post_write(7); + sn7.post_write(6); + let mut sn8 = SequenceNumber::pre_write(); + sn8.post_write(8); + window.push(sn6); + assert_eq!(window.committed_seqno(), 5); + window.push(sn7); + assert_eq!(window.committed_seqno(), 7); + window.push(sn8); + assert_eq!(window.committed_seqno(), 8); + } + + #[bench] + fn bench_sequence_number_window(b: &mut Bencher) { + fn produce_random_seqno(producer: usize, number: usize) -> Vec { + let mock_seqno_allocator = Arc::new(AtomicU64::new(1)); + let (tx, rx) = std::sync::mpsc::sync_channel(number); + let handles: Vec<_> = (0..producer) + .map(|_| { + let allocator = mock_seqno_allocator.clone(); + let count = number / producer; + let tx = tx.clone(); + std::thread::spawn(move || { + for _ in 0..count { + let mut sn = SequenceNumber::pre_write(); + sn.post_write(allocator.fetch_add(1, Ordering::AcqRel)); + tx.send(sn).unwrap(); + } + }) + }) + .collect(); + for h in handles { + h.join().unwrap(); + } + (0..number).map(|_| rx.recv().unwrap()).collect() + } + + let seqno = produce_random_seqno(16, 100000); + b.iter(|| { + let mut window = SequenceNumberWindow::default(); + for sn in &seqno { + window.push(*sn); + } + }) + } +} diff --git a/components/engine_traits/src/write_batch.rs b/components/engine_traits/src/write_batch.rs index 5d6824a7207..8a92ac7c382 100644 --- a/components/engine_traits/src/write_batch.rs +++ b/components/engine_traits/src/write_batch.rs @@ -71,10 +71,17 @@ pub trait Mutable: Send { /// save point, and pops the save point from the stack. pub trait WriteBatch: Mutable { /// Commit the WriteBatch to disk with the given options - fn write_opt(&self, opts: &WriteOptions) -> Result<()>; + fn write_opt(&mut self, opts: &WriteOptions) -> Result; + + // TODO: it should be `FnOnce`. 
+ fn write_callback_opt(&mut self, opts: &WriteOptions, mut cb: impl FnMut()) -> Result { + let seq = self.write_opt(opts)?; + cb(); + Ok(seq) + } /// Commit the WriteBatch to disk atomically - fn write(&self) -> Result<()> { + fn write(&mut self) -> Result { self.write_opt(&WriteOptions::default()) } diff --git a/components/engine_traits_tests/Cargo.toml b/components/engine_traits_tests/Cargo.toml index a011b1cc281..516135a86d2 100644 --- a/components/engine_traits_tests/Cargo.toml +++ b/components/engine_traits_tests/Cargo.toml @@ -25,8 +25,12 @@ test-engines-panic = [ ] [dependencies] -engine_test = { path = "../engine_test", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -panic_hook = { path = "../panic_hook" } +encryption = { workspace = true } +encryption_export = { workspace = true } +engine_test = { workspace = true } +engine_traits = { workspace = true } +kvproto = { workspace = true } +panic_hook = { workspace = true } tempfile = "3.0" -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } +test_util = { workspace = true } diff --git a/components/engine_traits_tests/src/basic_read_write.rs b/components/engine_traits_tests/src/basic_read_write.rs index d5104ba57e3..38a1921dd85 100644 --- a/components/engine_traits_tests/src/basic_read_write.rs +++ b/components/engine_traits_tests/src/basic_read_write.rs @@ -2,7 +2,7 @@ //! 
Reading and writing -use engine_traits::{Peekable, SyncMutable, ALL_CFS, CF_DEFAULT, CF_WRITE}; +use engine_traits::{Peekable, SyncMutable, ALL_CFS, CF_DEFAULT}; use super::engine_cfs; @@ -17,16 +17,3 @@ fn non_cf_methods_are_default_cf() { let value = value.expect("value"); assert_eq!(b"bar", &*value); } - -// CF_DEFAULT always exists -#[test] -fn non_cf_methods_implicit_default_cf() { - let db = engine_cfs(&[CF_WRITE]); - db.engine.put(b"foo", b"bar").unwrap(); - let value = db.engine.get_value(b"foo").unwrap(); - let value = value.expect("value"); - assert_eq!(b"bar", &*value); - let value = db.engine.get_value_cf(CF_DEFAULT, b"foo").unwrap(); - let value = value.expect("value"); - assert_eq!(b"bar", &*value); -} diff --git a/components/engine_traits_tests/src/cf_names.rs b/components/engine_traits_tests/src/cf_names.rs index 187df39a081..f85c2f5df97 100644 --- a/components/engine_traits_tests/src/cf_names.rs +++ b/components/engine_traits_tests/src/cf_names.rs @@ -1,6 +1,6 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::{CFNamesExt, KvEngine, Snapshot, ALL_CFS, CF_DEFAULT, CF_WRITE}; +use engine_traits::{CfNamesExt, ALL_CFS, CF_DEFAULT}; use super::{default_engine, engine_cfs}; @@ -21,40 +21,3 @@ fn cf_names() { assert!(names.contains(cf)); } } - -#[test] -fn implicit_default_cf() { - let db = engine_cfs(&[CF_WRITE]); - let names = db.engine.cf_names(); - assert_eq!(names.len(), 2); - assert!(names.contains(&CF_DEFAULT)); -} - -#[test] -fn default_names_snapshot() { - let db = default_engine(); - let snapshot = db.engine.snapshot(); - let names = snapshot.cf_names(); - assert_eq!(names.len(), 1); - assert_eq!(names[0], CF_DEFAULT); -} - -#[test] -fn cf_names_snapshot() { - let db = engine_cfs(ALL_CFS); - let snapshot = db.engine.snapshot(); - let names = snapshot.cf_names(); - assert_eq!(names.len(), ALL_CFS.len()); - for cf in ALL_CFS { - assert!(names.contains(cf)); - } -} - -#[test] -fn implicit_default_cf_snapshot() { - let db = engine_cfs(&[CF_WRITE]); - let snapshot = db.engine.snapshot(); - let names = snapshot.cf_names(); - assert_eq!(names.len(), 2); - assert!(names.contains(&CF_DEFAULT)); -} diff --git a/components/engine_traits_tests/src/checkpoint.rs b/components/engine_traits_tests/src/checkpoint.rs new file mode 100644 index 00000000000..ad85b8f85ed --- /dev/null +++ b/components/engine_traits_tests/src/checkpoint.rs @@ -0,0 +1,49 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! 
Checkpoint tests + +use std::sync::Arc; + +use encryption_export::data_key_manager_from_config; +use engine_test::{ + ctor::{CfOptions, DbOptions, KvEngineConstructorExt}, + kv::KvTestEngine, +}; +use engine_traits::{ + Checkpointable, Checkpointer, KvEngine, Peekable, SyncMutable, ALL_CFS, CF_DEFAULT, +}; + +use super::tempdir; + +#[test] +fn test_encrypted_checkpoint() { + let dir = tempdir(); + let root_path = dir.path(); + + let encryption_cfg = test_util::new_file_security_config(root_path); + let key_manager = Arc::new( + data_key_manager_from_config(&encryption_cfg, root_path.to_str().unwrap()) + .unwrap() + .unwrap(), + ); + + let mut db_opts = DbOptions::default(); + db_opts.set_key_manager(Some(key_manager)); + let cf_opts: Vec<_> = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + + let path1 = root_path.join("1").to_str().unwrap().to_owned(); + let db1 = KvTestEngine::new_kv_engine_opt(&path1, db_opts.clone(), cf_opts.clone()).unwrap(); + db1.put(b"foo", b"bar").unwrap(); + db1.sync().unwrap(); + + let path2 = root_path.join("2"); + let mut checkpointer = db1.new_checkpointer().unwrap(); + checkpointer.create_at(&path2, None, 0).unwrap(); + let db2 = + KvTestEngine::new_kv_engine_opt(path2.to_str().unwrap(), db_opts.clone(), cf_opts.clone()) + .unwrap(); + assert_eq!( + db2.get_value_cf(CF_DEFAULT, b"foo").unwrap().unwrap(), + b"bar" + ); +} diff --git a/components/engine_traits_tests/src/ctor.rs b/components/engine_traits_tests/src/ctor.rs index b3338a46367..dce6a64dff2 100644 --- a/components/engine_traits_tests/src/ctor.rs +++ b/components/engine_traits_tests/src/ctor.rs @@ -4,11 +4,12 @@ use std::fs; +use encryption_export::data_key_manager_from_config; use engine_test::{ - ctor::{CFOptions, ColumnFamilyOptions, DBOptions, KvEngineConstructorExt}, + ctor::{CfOptions, DbOptions, KvEngineConstructorExt}, kv::KvTestEngine, }; -use engine_traits::{KvEngine, SyncMutable, ALL_CFS}; +use engine_traits::{EncryptionKeyManager, KvEngine, Peekable, 
SyncMutable, ALL_CFS, CF_DEFAULT}; use super::tempdir; @@ -16,18 +17,15 @@ use super::tempdir; fn new_engine_basic() { let dir = tempdir(); let path = dir.path().to_str().unwrap(); - let _db = KvTestEngine::new_kv_engine(path, None, ALL_CFS, None).unwrap(); + let _db = KvTestEngine::new_kv_engine(path, ALL_CFS).unwrap(); } #[test] fn new_engine_opt_basic() { let dir = tempdir(); let path = dir.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let cf_opts = ALL_CFS - .iter() - .map(|cf| CFOptions::new(cf, ColumnFamilyOptions::new())) - .collect(); + let db_opts = DbOptions::default(); + let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); let _db = KvTestEngine::new_kv_engine_opt(path, db_opts, cf_opts).unwrap(); } @@ -37,7 +35,7 @@ fn new_engine_missing_dir() { let dir = tempdir(); let path = dir.path(); let path = path.join("missing").to_str().unwrap().to_owned(); - let db = KvTestEngine::new_kv_engine(&path, None, ALL_CFS, None).unwrap(); + let db = KvTestEngine::new_kv_engine(&path, ALL_CFS).unwrap(); db.put(b"foo", b"bar").unwrap(); db.sync().unwrap(); } @@ -47,11 +45,8 @@ fn new_engine_opt_missing_dir() { let dir = tempdir(); let path = dir.path(); let path = path.join("missing").to_str().unwrap().to_owned(); - let db_opts = DBOptions::default(); - let cf_opts = ALL_CFS - .iter() - .map(|cf| CFOptions::new(cf, ColumnFamilyOptions::new())) - .collect(); + let db_opts = DbOptions::default(); + let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); let db = KvTestEngine::new_kv_engine_opt(&path, db_opts, cf_opts).unwrap(); db.put(b"foo", b"bar").unwrap(); db.sync().unwrap(); @@ -71,9 +66,9 @@ fn new_engine_readonly_dir() { fs::set_permissions(&path, perms).unwrap(); let path = path.to_str().unwrap(); - let err = KvTestEngine::new_kv_engine(path, None, ALL_CFS, None); + let err = KvTestEngine::new_kv_engine(path, ALL_CFS); - assert!(err.is_err()); + err.unwrap_err(); } #[test] @@ -90,12 +85,46 @@ fn 
new_engine_opt_readonly_dir() { fs::set_permissions(&path, perms).unwrap(); let path = path.to_str().unwrap(); - let db_opts = DBOptions::default(); - let cf_opts = ALL_CFS - .iter() - .map(|cf| CFOptions::new(cf, ColumnFamilyOptions::new())) - .collect(); + let db_opts = DbOptions::default(); + let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); let err = KvTestEngine::new_kv_engine_opt(path, db_opts, cf_opts); - assert!(err.is_err()); + err.unwrap_err(); +} + +#[test] +fn new_engine_opt_renamed_dir() { + use std::sync::Arc; + let dir = tempdir(); + let root_path = dir.path(); + + let encryption_cfg = test_util::new_file_security_config(root_path); + let key_manager = Arc::new( + data_key_manager_from_config(&encryption_cfg, root_path.to_str().unwrap()) + .unwrap() + .unwrap(), + ); + + let mut db_opts = DbOptions::default(); + db_opts.set_key_manager(Some(key_manager.clone())); + let cf_opts: Vec<_> = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + + let path = root_path.join("missing").to_str().unwrap().to_owned(); + { + let db = KvTestEngine::new_kv_engine_opt(&path, db_opts.clone(), cf_opts.clone()).unwrap(); + db.put(b"foo", b"bar").unwrap(); + db.sync().unwrap(); + } + let new_path = root_path.join("new").to_str().unwrap().to_owned(); + key_manager.link_file(&path, &new_path).unwrap(); + fs::rename(&path, &new_path).unwrap(); + key_manager.delete_file(&path).unwrap(); + { + let db = + KvTestEngine::new_kv_engine_opt(&new_path, db_opts.clone(), cf_opts.clone()).unwrap(); + assert_eq!( + db.get_value_cf(CF_DEFAULT, b"foo").unwrap().unwrap(), + b"bar" + ); + } } diff --git a/components/engine_traits_tests/src/delete_range.rs b/components/engine_traits_tests/src/delete_range.rs index c2b87395d6a..bdfba737048 100644 --- a/components/engine_traits_tests/src/delete_range.rs +++ b/components/engine_traits_tests/src/delete_range.rs @@ -8,10 +8,8 @@ use super::default_engine; #[test] fn delete_range_cf_bad_cf() { let db = 
default_engine(); - assert!( - recover_safe(|| { - db.engine.delete_range_cf("bogus", b"a", b"b").unwrap(); - }) - .is_err() - ); + recover_safe(|| { + db.engine.delete_range_cf("bogus", b"a", b"b").unwrap(); + }) + .unwrap_err(); } diff --git a/components/engine_traits_tests/src/iterator.rs b/components/engine_traits_tests/src/iterator.rs index 00f7a974b52..714ca4cb0b4 100644 --- a/components/engine_traits_tests/src/iterator.rs +++ b/components/engine_traits_tests/src/iterator.rs @@ -1,6 +1,6 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::{Iterable, Iterator, KvEngine, SeekKey}; +use engine_traits::{Iterable, Iterator, KvEngine, CF_DEFAULT}; use panic_hook::recover_safe; use super::default_engine; @@ -15,39 +15,33 @@ where assert_eq!(iter.valid().unwrap(), false); - assert!(iter.prev().is_err()); - assert!(iter.next().is_err()); - assert!( - recover_safe(|| { - iter.key(); - }) - .is_err() - ); - assert!( - recover_safe(|| { - iter.value(); - }) - .is_err() - ); - - assert_eq!(iter.seek(SeekKey::Start).unwrap(), false); - assert_eq!(iter.seek(SeekKey::End).unwrap(), false); - assert_eq!(iter.seek(SeekKey::Key(b"foo")).unwrap(), false); - assert_eq!(iter.seek_for_prev(SeekKey::Start).unwrap(), false); - assert_eq!(iter.seek_for_prev(SeekKey::End).unwrap(), false); - assert_eq!(iter.seek_for_prev(SeekKey::Key(b"foo")).unwrap(), false); + iter.prev().unwrap_err(); + iter.next().unwrap_err(); + recover_safe(|| { + iter.key(); + }) + .unwrap_err(); + recover_safe(|| { + iter.value(); + }) + .unwrap_err(); + + assert_eq!(iter.seek_to_first().unwrap(), false); + assert_eq!(iter.seek_to_last().unwrap(), false); + assert_eq!(iter.seek(b"foo").unwrap(), false); + assert_eq!(iter.seek_for_prev(b"foo").unwrap(), false); } #[test] fn iter_empty_engine() { let db = default_engine(); - iter_empty(&db.engine, |e| e.iterator().unwrap()); + iter_empty(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn iter_empty_snapshot() { 
let db = default_engine(); - iter_empty(&db.engine, |e| e.snapshot().iterator().unwrap()); + iter_empty(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn iter_forward(e: &E, i: IF) @@ -64,7 +58,7 @@ where assert!(!iter.valid().unwrap()); - assert!(iter.seek(SeekKey::Start).unwrap()); + assert!(iter.seek_to_first().unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"a"); @@ -86,30 +80,26 @@ where assert!(!iter.valid().unwrap()); - assert!( - recover_safe(|| { - iter.key(); - }) - .is_err() - ); - assert!( - recover_safe(|| { - iter.value(); - }) - .is_err() - ); + recover_safe(|| { + iter.key(); + }) + .unwrap_err(); + recover_safe(|| { + iter.value(); + }) + .unwrap_err(); } #[test] fn iter_forward_engine() { let db = default_engine(); - iter_forward(&db.engine, |e| e.iterator().unwrap()); + iter_forward(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn iter_forward_snapshot() { let db = default_engine(); - iter_forward(&db.engine, |e| e.snapshot().iterator().unwrap()); + iter_forward(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn iter_reverse(e: &E, i: IF) @@ -126,7 +116,7 @@ where assert!(!iter.valid().unwrap()); - assert!(iter.seek(SeekKey::End).unwrap()); + assert!(iter.seek_to_last().unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"c"); @@ -148,30 +138,26 @@ where assert!(!iter.valid().unwrap()); - assert!( - recover_safe(|| { - iter.key(); - }) - .is_err() - ); - assert!( - recover_safe(|| { - iter.value(); - }) - .is_err() - ); + recover_safe(|| { + iter.key(); + }) + .unwrap_err(); + recover_safe(|| { + iter.value(); + }) + .unwrap_err(); } #[test] fn iter_reverse_engine() { let db = default_engine(); - iter_reverse(&db.engine, |e| e.iterator().unwrap()); + iter_reverse(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn iter_reverse_snapshot() { let db = default_engine(); - iter_reverse(&db.engine, |e| e.snapshot().iterator().unwrap()); + iter_reverse(&db.engine, 
|e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn seek_to_key_then_forward(e: &E, i: IF) @@ -186,7 +172,7 @@ where let mut iter = i(e); - assert!(iter.seek(SeekKey::Key(b"b")).unwrap()); + assert!(iter.seek(b"b").unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"b"); @@ -206,13 +192,13 @@ where #[test] fn seek_to_key_then_forward_engine() { let db = default_engine(); - seek_to_key_then_forward(&db.engine, |e| e.iterator().unwrap()); + seek_to_key_then_forward(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn seek_to_key_then_forward_snapshot() { let db = default_engine(); - seek_to_key_then_forward(&db.engine, |e| e.snapshot().iterator().unwrap()); + seek_to_key_then_forward(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn seek_to_key_then_reverse(e: &E, i: IF) @@ -227,7 +213,7 @@ where let mut iter = i(e); - assert!(iter.seek(SeekKey::Key(b"b")).unwrap()); + assert!(iter.seek(b"b").unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"b"); @@ -247,13 +233,13 @@ where #[test] fn seek_to_key_then_reverse_engine() { let db = default_engine(); - seek_to_key_then_reverse(&db.engine, |e| e.iterator().unwrap()); + seek_to_key_then_reverse(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn seek_to_key_then_reverse_snapshot() { let db = default_engine(); - seek_to_key_then_reverse(&db.engine, |e| e.snapshot().iterator().unwrap()); + seek_to_key_then_reverse(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn iter_forward_then_reverse(e: &E, i: IF) @@ -270,7 +256,7 @@ where assert!(!iter.valid().unwrap()); - assert!(iter.seek(SeekKey::Start).unwrap()); + assert!(iter.seek_to_first().unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"a"); @@ -308,13 +294,13 @@ where #[test] fn iter_forward_then_reverse_engine() { let db = default_engine(); - iter_forward_then_reverse(&db.engine, |e| e.iterator().unwrap()); + iter_forward_then_reverse(&db.engine, |e| 
e.iterator(CF_DEFAULT).unwrap()); } #[test] fn iter_forward_then_reverse_snapshot() { let db = default_engine(); - iter_forward_then_reverse(&db.engine, |e| e.snapshot().iterator().unwrap()); + iter_forward_then_reverse(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn iter_reverse_then_forward(e: &E, i: IF) @@ -331,7 +317,7 @@ where assert!(!iter.valid().unwrap()); - assert!(iter.seek(SeekKey::End).unwrap()); + assert!(iter.seek_to_last().unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"c"); @@ -369,13 +355,13 @@ where #[test] fn iter_reverse_then_forward_engine() { let db = default_engine(); - iter_reverse_then_forward(&db.engine, |e| e.iterator().unwrap()); + iter_reverse_then_forward(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn iter_reverse_then_forward_snapshot() { let db = default_engine(); - iter_reverse_then_forward(&db.engine, |e| e.snapshot().iterator().unwrap()); + iter_reverse_then_forward(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } // When seek finds an exact key then seek_for_prev behaves just like seek @@ -391,19 +377,19 @@ where let mut iter = i(e); - assert!(iter.seek_for_prev(SeekKey::Start).unwrap()); + assert!(iter.seek_to_first().unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"a"); assert_eq!(iter.value(), b"a"); - assert!(iter.seek_for_prev(SeekKey::End).unwrap()); + assert!(iter.seek_to_last().unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"c"); assert_eq!(iter.value(), b"c"); - assert!(iter.seek_for_prev(SeekKey::Key(b"c")).unwrap()); + assert!(iter.seek_for_prev(b"c").unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"c"); @@ -413,13 +399,13 @@ where #[test] fn seek_for_prev_engine() { let db = default_engine(); - seek_for_prev(&db.engine, |e| e.iterator().unwrap()); + seek_for_prev(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn seek_for_prev_snapshot() { let db = default_engine(); - 
seek_for_prev(&db.engine, |e| e.snapshot().iterator().unwrap()); + seek_for_prev(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } // When Seek::Key doesn't find an exact match, @@ -437,24 +423,24 @@ where assert!(!iter.valid().unwrap()); - assert!(iter.seek(SeekKey::Key(b"b")).unwrap()); + assert!(iter.seek(b"b").unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"c"); - assert!(!iter.seek(SeekKey::Key(b"d")).unwrap()); + assert!(!iter.seek(b"d").unwrap()); assert!(!iter.valid().unwrap()); } #[test] fn seek_key_miss_engine() { let db = default_engine(); - seek_key_miss(&db.engine, |e| e.iterator().unwrap()); + seek_key_miss(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn seek_key_miss_snapshot() { let db = default_engine(); - seek_key_miss(&db.engine, |e| e.snapshot().iterator().unwrap()); + seek_key_miss(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn seek_key_prev_miss(e: &E, i: IF) @@ -469,22 +455,22 @@ where assert!(!iter.valid().unwrap()); - assert!(iter.seek_for_prev(SeekKey::Key(b"d")).unwrap()); + assert!(iter.seek_for_prev(b"d").unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"c"); - assert!(!iter.seek_for_prev(SeekKey::Key(b"b")).unwrap()); + assert!(!iter.seek_for_prev(b"b").unwrap()); assert!(!iter.valid().unwrap()); } #[test] fn seek_key_prev_miss_engine() { let db = default_engine(); - seek_key_prev_miss(&db.engine, |e| e.iterator().unwrap()); + seek_key_prev_miss(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn seek_key_prev_miss_snapshot() { let db = default_engine(); - seek_key_prev_miss(&db.engine, |e| e.snapshot().iterator().unwrap()); + seek_key_prev_miss(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } diff --git a/components/engine_traits_tests/src/lib.rs b/components/engine_traits_tests/src/lib.rs index 49fe26b4f4d..1d9b6b4fa53 100644 --- a/components/engine_traits_tests/src/lib.rs +++ b/components/engine_traits_tests/src/lib.rs @@ 
-40,6 +40,7 @@ mod basic_read_write; mod cf_names; +mod checkpoint; mod ctor; mod delete_range; mod iterator; @@ -64,7 +65,30 @@ fn default_engine() -> TempDirEnginePair { let dir = tempdir(); let path = dir.path().to_str().unwrap(); - let engine = KvTestEngine::new_kv_engine(path, None, &[CF_DEFAULT], None).unwrap(); + let engine = KvTestEngine::new_kv_engine(path, &[CF_DEFAULT]).unwrap(); + TempDirEnginePair { + engine, + tempdir: dir, + } +} + +/// Create a multi batch write engine with only CF_DEFAULT +fn multi_batch_write_engine() -> TempDirEnginePair { + use engine_test::{ + ctor::{ + CfOptions as KvTestCfOptions, DbOptions as KvTestDbOptions, KvEngineConstructorExt, + }, + kv::KvTestEngine, + }; + use engine_traits::CF_DEFAULT; + + let dir = tempdir(); + let path = dir.path().to_str().unwrap(); + let mut opt = KvTestDbOptions::default(); + opt.set_enable_multi_batch_write(true); + let engine = + KvTestEngine::new_kv_engine_opt(path, opt, vec![(CF_DEFAULT, KvTestCfOptions::new())]) + .unwrap(); TempDirEnginePair { engine, tempdir: dir, @@ -77,7 +101,7 @@ fn engine_cfs(cfs: &[&str]) -> TempDirEnginePair { let dir = tempdir(); let path = dir.path().to_str().unwrap(); - let engine = KvTestEngine::new_kv_engine(path, None, cfs, None).unwrap(); + let engine = KvTestEngine::new_kv_engine(path, cfs).unwrap(); TempDirEnginePair { engine, tempdir: dir, diff --git a/components/engine_traits_tests/src/read_consistency.rs b/components/engine_traits_tests/src/read_consistency.rs index d80b6b3db7c..8c7ab50657f 100644 --- a/components/engine_traits_tests/src/read_consistency.rs +++ b/components/engine_traits_tests/src/read_consistency.rs @@ -2,7 +2,7 @@ //! 
Testing iterator and snapshot behavior in the presence of intermixed writes -use engine_traits::{Iterable, Iterator, KvEngine, Peekable, SyncMutable}; +use engine_traits::{Iterable, Iterator, KvEngine, Peekable, SyncMutable, CF_DEFAULT}; use super::default_engine; @@ -71,11 +71,11 @@ where #[test] fn iterator_with_writes_engine() { let db = default_engine(); - iterator_with_writes(&db.engine, |e| e.iterator().unwrap()); + iterator_with_writes(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn iterator_with_writes_snapshot() { let db = default_engine(); - iterator_with_writes(&db.engine, |e| e.snapshot().iterator().unwrap()); + iterator_with_writes(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } diff --git a/components/engine_traits_tests/src/scenario_writes.rs b/components/engine_traits_tests/src/scenario_writes.rs index 3e250c21198..169be158006 100644 --- a/components/engine_traits_tests/src/scenario_writes.rs +++ b/components/engine_traits_tests/src/scenario_writes.rs @@ -10,7 +10,7 @@ use panic_hook::recover_safe; use super::engine_cfs; #[allow(clippy::enum_variant_names)] -#[derive(Eq, PartialEq)] +#[derive(PartialEq)] enum WriteScenario { NoCf, DefaultCf, @@ -42,17 +42,20 @@ impl WriteScenarioEngine { WriteBatchNoCf => { let mut wb = self.db.engine.write_batch(); wb.put(key, value)?; - wb.write() + wb.write()?; + Ok(()) } WriteBatchDefaultCf => { let mut wb = self.db.engine.write_batch(); wb.put_cf(CF_DEFAULT, key, value)?; - wb.write() + wb.write()?; + Ok(()) } WriteBatchOtherCf => { let mut wb = self.db.engine.write_batch(); wb.put_cf(CF_WRITE, key, value)?; - wb.write() + wb.write()?; + Ok(()) } } } @@ -66,17 +69,20 @@ impl WriteScenarioEngine { WriteBatchNoCf => { let mut wb = self.db.engine.write_batch(); wb.delete(key)?; - wb.write() + wb.write()?; + Ok(()) } WriteBatchDefaultCf => { let mut wb = self.db.engine.write_batch(); wb.delete_cf(CF_DEFAULT, key)?; - wb.write() + wb.write()?; + Ok(()) } WriteBatchOtherCf => { let mut 
wb = self.db.engine.write_batch(); wb.delete_cf(CF_WRITE, key)?; - wb.write() + wb.write()?; + Ok(()) } } } @@ -90,22 +96,25 @@ impl WriteScenarioEngine { WriteBatchNoCf => { let mut wb = self.db.engine.write_batch(); wb.delete_range(start, end)?; - wb.write() + wb.write()?; + Ok(()) } WriteBatchDefaultCf => { let mut wb = self.db.engine.write_batch(); wb.delete_range_cf(CF_DEFAULT, start, end)?; - wb.write() + wb.write()?; + Ok(()) } WriteBatchOtherCf => { let mut wb = self.db.engine.write_batch(); wb.delete_range_cf(CF_WRITE, start, end)?; - wb.write() + wb.write()?; + Ok(()) } } } - fn get_value(&self, key: &[u8]) -> Result::DBVector>> { + fn get_value(&self, key: &[u8]) -> Result::DbVector>> { use WriteScenario::*; match self.scenario { NoCf | DefaultCf | WriteBatchNoCf | WriteBatchDefaultCf => { @@ -213,8 +222,7 @@ scenario_test! { put_get { scenario_test! { delete_none { let db = write_scenario_engine(); - let res = db.delete(b"foo"); - assert!(res.is_ok()); + db.delete(b"foo").unwrap(); }} scenario_test! { delete { @@ -280,9 +288,9 @@ scenario_test! 
{ delete_range_reverse_range { db.put(b"c", b"").unwrap(); db.put(b"d", b"").unwrap(); - assert!(recover_safe(|| { + recover_safe(|| { db.delete_range(b"d", b"b").unwrap(); - }).is_err()); + }).unwrap_err(); assert!(db.get_value(b"b").unwrap().is_some()); assert!(db.get_value(b"c").unwrap().is_some()); diff --git a/components/engine_traits_tests/src/sst.rs b/components/engine_traits_tests/src/sst.rs index 10104e752cc..26ed686aad4 100644 --- a/components/engine_traits_tests/src/sst.rs +++ b/components/engine_traits_tests/src/sst.rs @@ -6,8 +6,8 @@ use std::fs; use engine_test::kv::KvTestEngine; use engine_traits::{ - Error, ExternalSstFileInfo, Iterator, Result, SeekKey, SstExt, SstReader, SstWriter, - SstWriterBuilder, + Error, ExternalSstFileInfo, IterOptions, Iterator, RefIterable, Result, SstExt, SstReader, + SstWriter, SstWriterBuilder, }; use panic_hook::recover_safe; @@ -49,9 +49,9 @@ fn basic() -> Result<()> { sst_writer.finish()?; let sst_reader = ::SstReader::open(&sst_path)?; - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); - iter.seek(SeekKey::Start)?; + iter.seek_to_first()?; let key = iter.key(); let value = iter.value(); assert_eq!(b"k1", key); @@ -78,9 +78,9 @@ fn forward() -> Result<()> { sst_writer.finish()?; let sst_reader = ::SstReader::open(&sst_path)?; - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); - iter.seek(SeekKey::Start)?; + iter.seek_to_first()?; let key = iter.key(); let value = iter.value(); @@ -115,9 +115,9 @@ fn reverse() -> Result<()> { sst_writer.finish()?; let sst_reader = ::SstReader::open(&sst_path)?; - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); - iter.seek(SeekKey::End)?; + iter.seek_to_last()?; let key = iter.key(); let value = iter.value(); @@ -136,7 +136,7 @@ fn reverse() -> Result<()> { Ok(()) } -// todo test seek_for_prev(SeekKey::Key) +// todo test 
seek_for_prev(Key) #[test] fn delete() -> Result<()> { @@ -153,33 +153,27 @@ fn delete() -> Result<()> { sst_writer.finish()?; let sst_reader = ::SstReader::open(&sst_path)?; - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); - iter.seek(SeekKey::Start)?; + iter.seek_to_first()?; assert_eq!(iter.valid()?, false); - assert!(iter.prev().is_err()); - assert!(iter.next().is_err()); - assert!( - recover_safe(|| { - iter.key(); - }) - .is_err() - ); - assert!( - recover_safe(|| { - iter.value(); - }) - .is_err() - ); - - assert_eq!(iter.seek(SeekKey::Start)?, false); - assert_eq!(iter.seek(SeekKey::End)?, false); - assert_eq!(iter.seek(SeekKey::Key(b"foo"))?, false); - assert_eq!(iter.seek_for_prev(SeekKey::Start)?, false); - assert_eq!(iter.seek_for_prev(SeekKey::End)?, false); - assert_eq!(iter.seek_for_prev(SeekKey::Key(b"foo"))?, false); + iter.prev().unwrap_err(); + iter.next().unwrap_err(); + recover_safe(|| { + iter.key(); + }) + .unwrap_err(); + recover_safe(|| { + iter.value(); + }) + .unwrap_err(); + + assert_eq!(iter.seek_to_first()?, false); + assert_eq!(iter.seek_to_last()?, false); + assert_eq!(iter.seek(b"foo")?, false); + assert_eq!(iter.seek_for_prev(b"foo")?, false); Ok(()) } @@ -213,9 +207,9 @@ fn same_key() -> Result<()> { sst_writer.finish()?; let sst_reader = ::SstReader::open(&sst_path)?; - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); - iter.seek(SeekKey::Start)?; + iter.seek_to_first()?; let key = iter.key(); let value = iter.value(); assert_eq!(b"k1", key); @@ -255,9 +249,9 @@ fn reverse_key() -> Result<()> { sst_writer.finish()?; let sst_reader = ::SstReader::open(&sst_path)?; - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); - iter.seek(SeekKey::Start)?; + iter.seek_to_first()?; let key = iter.key(); let value = iter.value(); assert_eq!(b"k2", key); diff --git 
a/components/engine_traits_tests/src/write_batch.rs b/components/engine_traits_tests/src/write_batch.rs index 0210dee3806..f13cec0845a 100644 --- a/components/engine_traits_tests/src/write_batch.rs +++ b/components/engine_traits_tests/src/write_batch.rs @@ -4,19 +4,27 @@ use engine_test::kv::KvTestEngine; use engine_traits::{Mutable, Peekable, SyncMutable, WriteBatch, WriteBatchExt}; use panic_hook::recover_safe; -use super::{assert_engine_error, default_engine}; +use super::{assert_engine_error, default_engine, multi_batch_write_engine}; #[test] fn write_batch_none_no_commit() { let db = default_engine(); let wb = db.engine.write_batch(); drop(wb); + + let db = multi_batch_write_engine(); + let wb = db.engine.write_batch_with_cap(1024); + drop(wb); } #[test] fn write_batch_none() { let db = default_engine(); - let wb = db.engine.write_batch(); + let mut wb = db.engine.write_batch(); + wb.write().unwrap(); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); wb.write().unwrap(); } @@ -31,6 +39,28 @@ fn write_batch_put() { wb.write().unwrap(); assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"aa"); + + let db = multi_batch_write_engine(); + + let mut wb = db.engine.write_batch_with_cap(1024); + + for i in 0..128_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"aa").unwrap(); + for i in 128..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + + wb.write().unwrap(); + + assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"aa"); + for i in 0..256_usize { + let x = i.to_be_bytes(); + assert_eq!(db.engine.get_value(&x).unwrap().unwrap(), &x); + } } #[test] @@ -46,6 +76,33 @@ fn write_batch_delete() { wb.write().unwrap(); assert!(db.engine.get_value(b"a").unwrap().is_none()); + + let db = multi_batch_write_engine(); + + for i in 0..127_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + db.engine.put(b"a", b"aa").unwrap(); + for i in 
127..255_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + + let mut wb = db.engine.write_batch_with_cap(1024); + + for i in 0..255_usize { + let k = i.to_be_bytes(); + wb.delete(&k).unwrap(); + } + wb.delete(b"a").unwrap(); + + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_none()); + for i in 0..255_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -60,6 +117,25 @@ fn write_batch_write_twice_1() { wb.write().unwrap(); assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"aa"); + + let db = multi_batch_write_engine(); + + let mut wb = db.engine.write_batch_with_cap(1024); + + for i in 0..123_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"aa").unwrap(); + + wb.write().unwrap(); + wb.write().unwrap(); + + assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"aa"); + for i in 0..123_usize { + let x = i.to_be_bytes(); + assert_eq!(db.engine.get_value(&x).unwrap().unwrap(), &x); + } } #[test] @@ -78,6 +154,40 @@ fn write_batch_write_twice_2() { wb.write().unwrap(); assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"aa"); + + let db = multi_batch_write_engine(); + + let mut wb = db.engine.write_batch_with_cap(1024); + + for i in 0..128_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"aa").unwrap(); + + wb.write().unwrap(); + + db.engine.put(b"a", b"b").unwrap(); + assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"b"); + + for i in 0..128_usize { + let k = i.to_be_bytes(); + let v = (2 * i + 1).to_be_bytes(); + db.engine.put(&k, &v).unwrap(); + } + for i in 0..128_usize { + let k = i.to_be_bytes(); + let v = (2 * i + 1).to_be_bytes(); + assert_eq!(db.engine.get_value(&k).unwrap().unwrap(), &v); + } + + wb.write().unwrap(); + + assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"aa"); + for i in 0..128_usize { + let x = i.to_be_bytes(); + 
assert_eq!(db.engine.get_value(&x).unwrap().unwrap(), &x); + } } #[test] @@ -95,6 +205,37 @@ fn write_batch_write_twice_3() { assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"aa"); assert_eq!(db.engine.get_value(b"b").unwrap().unwrap(), b"bb"); + + let db = multi_batch_write_engine(); + + let mut wb = db.engine.write_batch_with_cap(1024); + + for i in 0..128_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"aa").unwrap(); + + wb.write().unwrap(); + for i in 0..128_usize { + let k = i.to_be_bytes(); + let v = (2 * i + 1).to_be_bytes(); + db.engine.put(&k, &v).unwrap(); + } + db.engine.put(b"a", b"b").unwrap(); + for i in 128..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"b", b"bb").unwrap(); + wb.write().unwrap(); + + assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"aa"); + assert_eq!(db.engine.get_value(b"b").unwrap().unwrap(), b"bb"); + for i in 0..256_usize { + let x = i.to_be_bytes(); + assert_eq!(db.engine.get_value(&x).unwrap().unwrap(), &x); + } } #[test] @@ -117,6 +258,43 @@ fn write_batch_delete_range_basic() { assert!(db.engine.get_value(b"c").unwrap().is_none()); assert!(db.engine.get_value(b"d").unwrap().is_none()); assert!(db.engine.get_value(b"e").unwrap().is_some()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"b", b"").unwrap(); + db.engine.put(b"c", b"").unwrap(); + db.engine.put(b"d", b"").unwrap(); + db.engine.put(b"e", b"").unwrap(); + + let mut wb = db.engine.write_batch_with_cap(1024); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + + wb.delete_range(b"b", b"e").unwrap(); + wb.delete_range(&32_usize.to_be_bytes(), &128_usize.to_be_bytes()) + .unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_none()); + assert!(db.engine.get_value(b"c").unwrap().is_none()); + 
assert!(db.engine.get_value(b"d").unwrap().is_none()); + assert!(db.engine.get_value(b"e").unwrap().is_some()); + for i in 0..32_usize { + let x = i.to_be_bytes(); + assert!(db.engine.get_value(&x).unwrap().is_some()); + } + for i in 32..128_usize { + let x = i.to_be_bytes(); + assert!(db.engine.get_value(&x).unwrap().is_none()); + } + for i in 128..256_usize { + let x = i.to_be_bytes(); + assert!(db.engine.get_value(&x).unwrap().is_some()); + } } #[test] @@ -141,6 +319,54 @@ fn write_batch_delete_range_inexact() { assert!(db.engine.get_value(b"e").unwrap().is_none()); assert!(db.engine.get_value(b"f").unwrap().is_none()); assert!(db.engine.get_value(b"g").unwrap().is_some()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"c", b"").unwrap(); + db.engine.put(b"d", b"").unwrap(); + db.engine.put(b"e", b"").unwrap(); + db.engine.put(b"g", b"").unwrap(); + + let mut wb = db.engine.write_batch_with_cap(1024); + for i in (0..256_usize).step_by(2) { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + + wb.delete_range(b"b", b"f").unwrap(); + wb.delete_range(&0_usize.to_be_bytes(), &252_usize.to_be_bytes()) + .unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_none()); + assert!(db.engine.get_value(b"c").unwrap().is_none()); + assert!(db.engine.get_value(b"d").unwrap().is_none()); + assert!(db.engine.get_value(b"e").unwrap().is_none()); + assert!(db.engine.get_value(b"f").unwrap().is_none()); + assert!(db.engine.get_value(b"g").unwrap().is_some()); + for i in 0..252_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } + assert!( + db.engine + .get_value(&252_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + assert!( + db.engine + .get_value(&253_usize.to_be_bytes()) + .unwrap() + .is_none() + ); + assert!( + db.engine + .get_value(&254_usize.to_be_bytes()) + .unwrap() + .is_some() + ); } #[test] 
@@ -161,6 +387,43 @@ fn write_batch_delete_range_after_put() { assert!(db.engine.get_value(b"c").unwrap().is_none()); assert!(db.engine.get_value(b"d").unwrap().is_none()); assert!(db.engine.get_value(b"e").unwrap().is_some()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"").unwrap(); + wb.put(b"b", b"").unwrap(); + wb.put(b"c", b"").unwrap(); + wb.put(b"d", b"").unwrap(); + wb.put(b"e", b"").unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &255_usize.to_be_bytes()) + .unwrap(); + wb.delete_range(b"b", b"e").unwrap(); + wb.write().unwrap(); + + assert!( + db.engine + .get_value(&0_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + for i in 1..255_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } + assert!( + db.engine + .get_value(&255_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_none()); + assert!(db.engine.get_value(b"c").unwrap().is_none()); + assert!(db.engine.get_value(b"d").unwrap().is_none()); + assert!(db.engine.get_value(b"e").unwrap().is_some()); } #[test] @@ -180,6 +443,37 @@ fn write_batch_delete_range_none() { assert!(db.engine.get_value(b"c").unwrap().is_none()); assert!(db.engine.get_value(b"d").unwrap().is_none()); assert!(db.engine.get_value(b"e").unwrap().is_some()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"e", b"").unwrap(); + for i in 0..256_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + + let mut wb = db.engine.write_batch_with_cap(1024); + + wb.delete_range(b"b", b"e").unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &256_usize.to_be_bytes()) + .unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + 
assert!(db.engine.get_value(b"b").unwrap().is_none()); + assert!(db.engine.get_value(b"c").unwrap().is_none()); + assert!(db.engine.get_value(b"d").unwrap().is_none()); + assert!(db.engine.get_value(b"e").unwrap().is_some()); + assert!( + db.engine + .get_value(&0_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + for i in 1..256_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -203,6 +497,43 @@ fn write_batch_delete_range_twice() { assert!(db.engine.get_value(b"c").unwrap().is_none()); assert!(db.engine.get_value(b"d").unwrap().is_none()); assert!(db.engine.get_value(b"e").unwrap().is_some()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"b", b"").unwrap(); + db.engine.put(b"c", b"").unwrap(); + db.engine.put(b"d", b"").unwrap(); + db.engine.put(b"e", b"").unwrap(); + + let mut wb = db.engine.write_batch_with_cap(1024); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + + wb.delete_range(b"b", b"e").unwrap(); + wb.delete_range(b"b", b"e").unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &256_usize.to_be_bytes()) + .unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &256_usize.to_be_bytes()) + .unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_none()); + assert!(db.engine.get_value(b"c").unwrap().is_none()); + assert!(db.engine.get_value(b"d").unwrap().is_none()); + assert!(db.engine.get_value(b"e").unwrap().is_some()); + assert!( + db.engine + .get_value(&0_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + for i in 1..256_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -226,6 +557,43 @@ fn write_batch_delete_range_twice_1() { assert!(db.engine.get_value(b"c").unwrap().is_none()); assert!(db.engine.get_value(b"d").unwrap().is_none()); assert!(db.engine.get_value(b"e").unwrap().is_some()); + + 
let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"b", b"").unwrap(); + db.engine.put(b"c", b"").unwrap(); + db.engine.put(b"d", b"").unwrap(); + db.engine.put(b"e", b"").unwrap(); + for i in 0..256_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + + let mut wb = db.engine.write_batch_with_cap(1024); + + wb.delete_range(b"b", b"e").unwrap(); + wb.delete_range(b"b", b"e").unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &256_usize.to_be_bytes()) + .unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &256_usize.to_be_bytes()) + .unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_none()); + assert!(db.engine.get_value(b"c").unwrap().is_none()); + assert!(db.engine.get_value(b"d").unwrap().is_none()); + assert!(db.engine.get_value(b"e").unwrap().is_some()); + assert!( + db.engine + .get_value(&0_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + for i in 1..256_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -251,6 +619,49 @@ fn write_batch_delete_range_twice_2() { assert!(db.engine.get_value(b"c").unwrap().is_none()); assert!(db.engine.get_value(b"d").unwrap().is_none()); assert!(db.engine.get_value(b"e").unwrap().is_some()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"b", b"").unwrap(); + db.engine.put(b"c", b"").unwrap(); + db.engine.put(b"d", b"").unwrap(); + db.engine.put(b"e", b"").unwrap(); + for i in 0..256_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + + let mut wb = db.engine.write_batch_with_cap(1024); + + wb.delete_range(b"b", b"e").unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &256_usize.to_be_bytes()) + .unwrap(); + wb.write().unwrap(); + db.engine.put(b"c", b"").unwrap(); + for i in 64..128_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + 
wb.delete_range(b"b", b"e").unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &256_usize.to_be_bytes()) + .unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_none()); + assert!(db.engine.get_value(b"c").unwrap().is_none()); + assert!(db.engine.get_value(b"d").unwrap().is_none()); + assert!(db.engine.get_value(b"e").unwrap().is_some()); + assert!( + db.engine + .get_value(&0_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + for i in 1..256_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -269,6 +680,30 @@ fn write_batch_delete_range_empty_range() { assert!(db.engine.get_value(b"a").unwrap().is_some()); assert!(db.engine.get_value(b"b").unwrap().is_some()); assert!(db.engine.get_value(b"c").unwrap().is_some()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"b", b"").unwrap(); + db.engine.put(b"c", b"").unwrap(); + for i in 0..256_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + + let mut wb = db.engine.write_batch_with_cap(1024); + + wb.delete_range(b"b", b"b").unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &1_usize.to_be_bytes()) + .unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_some()); + assert!(db.engine.get_value(b"c").unwrap().is_some()); + for i in 0..256_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } } #[test] @@ -282,16 +717,43 @@ fn write_batch_delete_range_backward_range() { let mut wb = db.engine.write_batch(); wb.delete_range(b"c", b"a").unwrap(); - assert!( - recover_safe(|| { - wb.write().unwrap(); - }) - .is_err() - ); + recover_safe(|| { + wb.write().unwrap(); + }) + .unwrap_err(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_some()); + 
assert!(db.engine.get_value(b"c").unwrap().is_some()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"b", b"").unwrap(); + db.engine.put(b"c", b"").unwrap(); + + for i in 0..256_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + + let mut wb = db.engine.write_batch_with_cap(1024); + + wb.delete_range(b"c", b"a").unwrap(); + wb.delete_range(&256_usize.to_be_bytes(), &0_usize.to_be_bytes()) + .unwrap(); + + recover_safe(|| { + wb.write().unwrap(); + }) + .unwrap_err(); assert!(db.engine.get_value(b"a").unwrap().is_some()); assert!(db.engine.get_value(b"b").unwrap().is_some()); assert!(db.engine.get_value(b"c").unwrap().is_some()); + for i in 0..256_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } } #[test] @@ -321,12 +783,56 @@ fn write_batch_delete_range_backward_range_partial_commit() { wb.put(b"f", b"").unwrap(); wb.delete(b"a").unwrap(); - assert!( - recover_safe(|| { - wb.write().unwrap(); - }) - .is_err() - ); + recover_safe(|| { + wb.write().unwrap(); + }) + .unwrap_err(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_some()); + assert!(db.engine.get_value(b"c").unwrap().is_some()); + assert!(db.engine.get_value(b"d").unwrap().is_none()); + assert!(db.engine.get_value(b"e").unwrap().is_some()); + assert!(db.engine.get_value(b"f").unwrap().is_none()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"b", b"").unwrap(); + db.engine.put(b"c", b"").unwrap(); + db.engine.put(b"d", b"").unwrap(); + for i in 0..256_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + + let mut wb = db.engine.write_batch_with_cap(1024); + + // Everything in the write batch before the panic + // due to bad range is going to end up committed. 
+ // + // NB: This behavior seems pretty questionable and + // should probably be re-evaluated before other engines + // try to emulate it. + // + // A more reasonable solution might be to have a bogus + // delete_range request immediately panic. + wb.put(b"e", b"").unwrap(); + wb.delete(b"d").unwrap(); + wb.delete_range(b"c", b"a").unwrap(); + wb.put(b"f", b"").unwrap(); + wb.delete(b"a").unwrap(); + wb.delete_range(&128_usize.to_be_bytes(), &64_usize.to_be_bytes()) + .unwrap(); + wb.put(&256_usize.to_be_bytes(), b"").unwrap(); + for i in 0..64_usize { + wb.delete(&i.to_be_bytes()).unwrap(); + } + + recover_safe(|| { + wb.write().unwrap(); + }) + .unwrap_err(); assert!(db.engine.get_value(b"a").unwrap().is_some()); assert!(db.engine.get_value(b"b").unwrap().is_some()); @@ -346,6 +852,18 @@ fn write_batch_is_empty() { assert!(!wb.is_empty()); wb.write().unwrap(); assert!(!wb.is_empty()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + + assert!(wb.is_empty()); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + assert!(!wb.is_empty()); + wb.write().unwrap(); + assert!(!wb.is_empty()); } #[test] @@ -358,6 +876,17 @@ fn write_batch_count() { assert_eq!(wb.count(), 1); wb.write().unwrap(); assert_eq!(wb.count(), 1); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + assert_eq!(wb.count(), 0); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + assert_eq!(wb.count(), 256); + wb.write().unwrap(); + assert_eq!(wb.count(), 256); } #[test] @@ -374,6 +903,23 @@ fn write_batch_count_2() { assert_eq!(wb.count(), 3); wb.write().unwrap(); assert_eq!(wb.count(), 3); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + + assert_eq!(wb.count(), 0); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"").unwrap(); + 
assert_eq!(wb.count(), 257); + wb.delete(b"a").unwrap(); + assert_eq!(wb.count(), 258); + wb.delete_range(b"a", b"b").unwrap(); + assert_eq!(wb.count(), 259); + wb.write().unwrap(); + assert_eq!(wb.count(), 259); } #[test] @@ -388,6 +934,21 @@ fn write_batch_clear() { assert_eq!(wb.count(), 0); wb.write().unwrap(); assert!(db.engine.get_value(b"a").unwrap().is_none()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.clear(); + assert!(wb.is_empty()); + assert_eq!(wb.count(), 0); + wb.write().unwrap(); + for i in 0..256_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -403,6 +964,40 @@ fn cap_zero() { wb.write().unwrap(); assert!(db.engine.get_value(b"a").unwrap().is_some()); assert!(db.engine.get_value(b"f").unwrap().is_some()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(0); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"").unwrap(); + wb.put(b"b", b"").unwrap(); + wb.put(b"c", b"").unwrap(); + wb.put(b"d", b"").unwrap(); + wb.put(b"e", b"").unwrap(); + wb.put(b"f", b"").unwrap(); + wb.write().unwrap(); + assert!( + db.engine + .get_value(&0_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + assert!( + db.engine + .get_value(&123_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + assert!( + db.engine + .get_value(&255_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"f").unwrap().is_some()); } /// Write batch capacity seems to just be a suggestions @@ -419,6 +1014,41 @@ fn cap_two() { wb.write().unwrap(); assert!(db.engine.get_value(b"a").unwrap().is_some()); assert!(db.engine.get_value(b"f").unwrap().is_some()); + + let db = multi_batch_write_engine(); + let mut wb = 
db.engine.write_batch_with_cap(2); + + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"").unwrap(); + wb.put(b"b", b"").unwrap(); + wb.put(b"c", b"").unwrap(); + wb.put(b"d", b"").unwrap(); + wb.put(b"e", b"").unwrap(); + wb.put(b"f", b"").unwrap(); + wb.write().unwrap(); + assert!( + db.engine + .get_value(&0_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + assert!( + db.engine + .get_value(&123_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + assert!( + db.engine + .get_value(&255_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"f").unwrap().is_some()); } // We should write when count is greater than WRITE_BATCH_MAX_KEYS @@ -441,6 +1071,24 @@ fn should_write_to_engine() { break; } } + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = KvTestEngine::WRITE_BATCH_MAX_KEYS; + + let mut key = vec![]; + loop { + key.push(b'a'); + wb.put(&key, b"").unwrap(); + if key.len() <= max_keys { + assert!(!wb.should_write_to_engine()); + } + if key.len() == max_keys + 1 { + assert!(wb.should_write_to_engine()); + wb.write().unwrap(); + break; + } + } } // But there kind of aren't consequences for making huge write batches @@ -475,6 +1123,37 @@ fn should_write_to_engine_but_whatever() { break; } } + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = KvTestEngine::WRITE_BATCH_MAX_KEYS; + + let mut key = vec![]; + + loop { + key.push(b'a'); + wb.put(&key, b"").unwrap(); + if key.len() <= max_keys { + assert!(!wb.should_write_to_engine()); + } + if key.len() > max_keys { + assert!(wb.should_write_to_engine()); + } + if key.len() == max_keys * 2 { + assert!(wb.should_write_to_engine()); + wb.write().unwrap(); + break; + } + } + + let mut key = vec![]; + loop { + key.push(b'a'); + 
assert!(db.engine.get_value(&key).unwrap().is_some()); + if key.len() == max_keys * 2 { + break; + } + } } #[test] @@ -504,6 +1183,43 @@ fn data_size() { wb.clear(); let size8 = wb.data_size(); assert_eq!(size8, size1); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + let size1 = wb.data_size(); + for i in 0..max_keys { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + let size2 = wb.data_size(); + assert!(size1 < size2); + wb.write().unwrap(); + let size3 = wb.data_size(); + assert_eq!(size2, size3); + wb.clear(); + let size4 = wb.data_size(); + assert_eq!(size4, size1); + for i in 0..max_keys { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + let size5 = wb.data_size(); + assert!(size4 < size5); + for i in 0..max_keys { + let x = i.to_be_bytes(); + wb.delete(&x).unwrap(); + } + let size6 = wb.data_size(); + assert!(size5 < size6); + wb.delete_range(&0_usize.to_be_bytes(), &(max_keys * 2).to_be_bytes()) + .unwrap(); + let size7 = wb.data_size(); + assert!(size6 < size7); + wb.clear(); + let size8 = wb.data_size(); + assert_eq!(size8, size1); } #[test] @@ -513,6 +1229,12 @@ fn save_point_rollback_none() { let err = wb.rollback_to_save_point(); assert_engine_error(err); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + + let err = wb.rollback_to_save_point(); + assert_engine_error(err); } #[test] @@ -522,14 +1244,40 @@ fn save_point_pop_none() { let err = wb.rollback_to_save_point(); assert_engine_error(err); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + + let err = wb.rollback_to_save_point(); + assert_engine_error(err); } -#[test] -fn save_point_rollback_one() { - let db = default_engine(); - let mut wb = db.engine.write_batch(); +#[test] +fn save_point_rollback_one() { + let db = default_engine(); + let mut wb = db.engine.write_batch(); + + wb.set_save_point(); + 
wb.put(b"a", b"").unwrap(); + + wb.rollback_to_save_point().unwrap(); + + let err = wb.rollback_to_save_point(); + assert_engine_error(err); + let err = wb.pop_save_point(); + assert_engine_error(err); + wb.write().unwrap(); + let val = db.engine.get_value(b"a").unwrap(); + assert!(val.is_none()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); wb.set_save_point(); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } wb.put(b"a", b"").unwrap(); wb.rollback_to_save_point().unwrap(); @@ -539,6 +1287,9 @@ fn save_point_rollback_one() { let err = wb.pop_save_point(); assert_engine_error(err); wb.write().unwrap(); + for i in 0..256_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } let val = db.engine.get_value(b"a").unwrap(); assert!(val.is_none()); } @@ -565,6 +1316,39 @@ fn save_point_rollback_two() { assert!(a.is_none()); let b = db.engine.get_value(b"b").unwrap(); assert!(b.is_none()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + wb.set_save_point(); + for i in 0..max_keys { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"").unwrap(); + wb.set_save_point(); + for i in max_keys..2 * max_keys { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"b", b"").unwrap(); + + wb.rollback_to_save_point().unwrap(); + wb.rollback_to_save_point().unwrap(); + + let err = wb.rollback_to_save_point(); + assert_engine_error(err); + let err = wb.pop_save_point(); + assert_engine_error(err); + wb.write().unwrap(); + let a = db.engine.get_value(b"a").unwrap(); + assert!(a.is_none()); + let b = db.engine.get_value(b"b").unwrap(); + assert!(b.is_none()); + for i in 0..2 * max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -582,6 +1366,35 @@ fn save_point_rollback_partial() { assert!(a.is_some()); let b = 
db.engine.get_value(b"b").unwrap(); assert!(b.is_none()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + for i in 0..max_keys { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"").unwrap(); + wb.set_save_point(); + wb.put(b"b", b"").unwrap(); + for i in max_keys..2 * max_keys { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + + wb.rollback_to_save_point().unwrap(); + wb.write().unwrap(); + let a = db.engine.get_value(b"a").unwrap(); + assert!(a.is_some()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } + let b = db.engine.get_value(b"b").unwrap(); + assert!(b.is_none()); + for i in max_keys..2 * max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -606,6 +1419,38 @@ fn save_point_pop_rollback() { assert!(val.is_none()); let val = db.engine.get_value(b"b").unwrap(); assert!(val.is_none()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + + wb.set_save_point(); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"").unwrap(); + wb.set_save_point(); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"").unwrap(); + + wb.pop_save_point().unwrap(); + wb.rollback_to_save_point().unwrap(); + + let err = wb.rollback_to_save_point(); + assert_engine_error(err); + let err = wb.pop_save_point(); + assert_engine_error(err); + wb.write().unwrap(); + let val = db.engine.get_value(b"a").unwrap(); + assert!(val.is_none()); + let val = db.engine.get_value(b"b").unwrap(); + assert!(val.is_none()); + for i in 0..512_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -631,6 +1476,41 @@ fn save_point_rollback_after_write() { let val = db.engine.get_value(b"a").unwrap(); assert!(val.is_none()); + 
+ let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + wb.set_save_point(); + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + wb.put(b"a", b"").unwrap(); + + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } + + db.engine.delete(b"a").unwrap(); + for i in 0..max_keys { + db.engine.delete(&i.to_be_bytes()).unwrap(); + } + + assert!(db.engine.get_value(b"a").unwrap().is_none()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } + + wb.rollback_to_save_point().unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_none()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -655,6 +1535,38 @@ fn save_point_same_rollback_one() { assert!(a.is_some()); assert!(b.is_none()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + wb.put(b"a", b"").unwrap(); + + wb.set_save_point(); + wb.set_save_point(); + wb.set_save_point(); + + wb.put(b"b", b"").unwrap(); + for i in max_keys..2 * max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + + wb.rollback_to_save_point().unwrap(); + + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } + + assert!(db.engine.get_value(b"b").unwrap().is_none()); + for i in max_keys..2 * max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -684,6 +1596,43 @@ fn save_point_same_rollback_all() { assert!(a.is_some()); assert!(b.is_none()); + + let db = multi_batch_write_engine(); + let mut wb 
= db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + wb.put(b"a", b"").unwrap(); + + wb.set_save_point(); + wb.set_save_point(); + wb.set_save_point(); + + wb.put(b"b", b"").unwrap(); + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + + wb.rollback_to_save_point().unwrap(); + wb.rollback_to_save_point().unwrap(); + wb.rollback_to_save_point().unwrap(); + + assert_engine_error(wb.pop_save_point()); + assert_engine_error(wb.rollback_to_save_point()); + + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } + + assert!(db.engine.get_value(b"b").unwrap().is_none()); + for i in max_keys..2 * max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -709,6 +1658,41 @@ fn save_point_pop_after_write() { let val = db.engine.get_value(b"a").unwrap(); assert!(val.is_some()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + wb.set_save_point(); + wb.put(b"a", b"").unwrap(); + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } + + db.engine.delete(b"a").unwrap(); + for i in 0..max_keys { + db.engine.delete(&i.to_be_bytes()).unwrap(); + } + + assert!(db.engine.get_value(b"a").unwrap().is_none()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } + + wb.pop_save_point().unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } } #[test] @@ -733,6 +1717,42 @@ fn 
save_point_all_commands() { assert!(a.is_some()); assert!(b.is_none()); assert!(d.is_some()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + for i in 0..max_keys / 2 { + db.engine.put(&i.to_be_bytes(), b"").unwrap(); + } + db.engine.put(b"a", b"").unwrap(); + for i in max_keys / 2..max_keys { + db.engine.put(&i.to_be_bytes(), b"").unwrap(); + } + db.engine.put(b"d", b"").unwrap(); + + wb.set_save_point(); + for i in 0..max_keys / 2 { + wb.delete(&i.to_be_bytes()).unwrap(); + } + wb.delete(b"a").unwrap(); + wb.put(b"b", b"").unwrap(); + wb.delete_range(b"c", b"e").unwrap(); + wb.delete_range(&(max_keys / 3).to_be_bytes(), &(2 * max_keys).to_be_bytes()) + .unwrap(); + + wb.rollback_to_save_point().unwrap(); + wb.write().unwrap(); + + let a = db.engine.get_value(b"a").unwrap(); + let b = db.engine.get_value(b"b").unwrap(); + let d = db.engine.get_value(b"d").unwrap(); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } + assert!(a.is_some()); + assert!(b.is_none()); + assert!(d.is_some()); } // What happens to the count() and is_empty() methods @@ -824,4 +1844,99 @@ fn save_points_and_counts() { assert_eq!(wb.is_empty(), true); assert_eq!(wb.count(), 0); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); + + wb.set_save_point(); + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); + + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + + assert_eq!(wb.is_empty(), false); + assert_eq!(wb.count(), max_keys); + + wb.rollback_to_save_point().unwrap(); + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); + + wb.set_save_point(); + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); + + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + + 
assert_eq!(wb.is_empty(), false); + assert_eq!(wb.count(), max_keys); + + wb.pop_save_point().unwrap(); + + assert_eq!(wb.is_empty(), false); + assert_eq!(wb.count(), max_keys); + + wb.clear(); + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); + + wb.set_save_point(); + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); + + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + + assert_eq!(wb.is_empty(), false); + assert_eq!(wb.count(), max_keys); + + wb.write().unwrap(); + + assert_eq!(wb.is_empty(), false); + assert_eq!(wb.count(), max_keys); + + wb.rollback_to_save_point().unwrap(); + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); + + wb.set_save_point(); + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); + + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + + assert_eq!(wb.is_empty(), false); + assert_eq!(wb.count(), max_keys); + + wb.write().unwrap(); + + assert_eq!(wb.is_empty(), false); + assert_eq!(wb.count(), max_keys); + + wb.pop_save_point().unwrap(); + + assert_eq!(wb.is_empty(), false); + assert_eq!(wb.count(), max_keys); + + wb.clear(); + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); } diff --git a/components/error_code/Cargo.toml b/components/error_code/Cargo.toml index 3b7284faa63..b98fc8dfcb5 100644 --- a/components/error_code/Cargo.toml +++ b/components/error_code/Cargo.toml @@ -13,9 +13,9 @@ name = "error_code_gen" path = "bin.rs" [dependencies] -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +grpcio = { workspace = true } +kvproto = { workspace = true } lazy_static = "1.3" raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } serde = { version = "1.0", features = ["derive"] } -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } diff --git 
a/components/error_code/bin.rs b/components/error_code/bin.rs index ba6a21ac6fa..8f1ad087355 100644 --- a/components/error_code/bin.rs +++ b/components/error_code/bin.rs @@ -18,7 +18,7 @@ fn main() { storage::ALL_ERROR_CODES.iter(), ]; let path = Path::new("./etc/error_code.toml"); - let mut f = fs::File::create(&path).unwrap(); + let mut f = fs::File::create(path).unwrap(); err_codes .into_iter() .flatten() diff --git a/components/error_code/src/backup_stream.rs b/components/error_code/src/backup_stream.rs index fa11ff5b37d..a4b28b0e9ee 100644 --- a/components/error_code/src/backup_stream.rs +++ b/components/error_code/src/backup_stream.rs @@ -3,7 +3,7 @@ define_error_codes! { "KV:LogBackup:", - ETCD => ("ETCD", + ETCD => ("Etcd", "Error during requesting the meta store(etcd)", "Please check the connectivity between TiKV and PD."), PROTO => ("Proto", @@ -23,7 +23,7 @@ define_error_codes! { "Malformed metadata found.", "The metadata format is unexpected, please check the compatibility between TiKV / BR." ), - IO => ("IO", + IO => ("Io", "Error during doing Input / Output operations.", "This is a generic error, please check the error message for further information." ), @@ -35,18 +35,23 @@ define_error_codes! { "Error during scheduling internal task.", "This is an internal error, and may happen if there are too many changes to observe, please ask the community for help." ), - PD => ("PD", + PD => ("Pd", "Error during requesting the Placement Driver.", "Please check the connectivity between TiKV and PD." ), RAFTREQ => ("RaftReq", "Error happened when sending raft command.", - "This is an internal error, please ask the community for help." + "This is an internal error, most of them are happen while initial scanning and can be simply retried." ), RAFTSTORE => ("RaftStore", "Error happened reported from raft store.", "This is an internal error, please ask the community for help." 
), + GRPC => ("gRPC", + "Error happened during executing gRPC", + "This error is often relative to the network, please check the network connection and network config, say, TLS config." + ), + OTHER => ("Unknown", "Some random error happens.", "This is an generic error, please check the error message for further information." diff --git a/components/error_code/src/causal_ts.rs b/components/error_code/src/causal_ts.rs index a5b2884a151..3f7f4e2a17e 100644 --- a/components/error_code/src/causal_ts.rs +++ b/components/error_code/src/causal_ts.rs @@ -4,9 +4,9 @@ define_error_codes!( "KV:CausalTs:", PD => ("PdClient", "", ""), - TSO => ("TSO", "", ""), - TSO_BATCH_USED_UP => ("TSO batch used up", "", ""), - BATCH_RENEW => ("Batch renew", "", ""), + TSO => ("Tso", "", ""), + TSO_BATCH_USED_UP => ("TsoBatchUsedUp", "", ""), + BATCH_RENEW => ("BatchRenew", "", ""), UNKNOWN => ("Unknown", "", "") ); diff --git a/components/error_code/src/cloud.rs b/components/error_code/src/cloud.rs index 63841761e7c..510481679dd 100644 --- a/components/error_code/src/cloud.rs +++ b/components/error_code/src/cloud.rs @@ -3,8 +3,8 @@ define_error_codes!( "KV:Cloud:", - IO => ("IO", "", ""), - SSL => ("SSL", "", ""), + IO => ("Io", "", ""), + SSL => ("Ssl", "", ""), PROTO => ("Proto", "", ""), UNKNOWN => ("Unknown", "", ""), TIMEOUT => ("Timeout", "", ""), diff --git a/components/error_code/src/encryption.rs b/components/error_code/src/encryption.rs index 069e98e3e6c..4204db84864 100644 --- a/components/error_code/src/encryption.rs +++ b/components/error_code/src/encryption.rs @@ -4,7 +4,7 @@ define_error_codes!( "KV:Encryption:", ROCKS => ("Rocks", "", ""), - IO => ("IO", "", ""), + IO => ("Io", "", ""), CRYPTER => ("Crypter", "", ""), PROTO => ("Proto", "", ""), UNKNOWN_ENCRYPTION => ("UnknownEncryption", "", ""), diff --git a/components/error_code/src/engine.rs b/components/error_code/src/engine.rs index d29d658cb69..4bb66f09753 100644 --- a/components/error_code/src/engine.rs +++ 
b/components/error_code/src/engine.rs @@ -6,8 +6,8 @@ define_error_codes!( ENGINE => ("Engine", "", ""), NOT_IN_RANGE => ("NotInRange", "", ""), PROTOBUF => ("Protobuf", "", ""), - IO => ("IO", "", ""), - CF_NAME => ("CFName", "", ""), + IO => ("Io", "", ""), + CF_NAME => ("CfName", "", ""), CODEC => ("Codec", "", ""), DATALOSS => ("DataLoss", "", ""), DATACOMPACTED => ("DataCompacted", "", "") diff --git a/components/error_code/src/lib.rs b/components/error_code/src/lib.rs index 8ad7f3e1f23..0747b3fd2fb 100644 --- a/components/error_code/src/lib.rs +++ b/components/error_code/src/lib.rs @@ -43,7 +43,7 @@ pub mod storage; use std::fmt::{self, Display, Formatter}; -#[derive(PartialEq, Eq, Debug, Clone, Copy)] +#[derive(PartialEq, Debug, Clone, Copy)] pub struct ErrorCode { pub code: &'static str, pub description: &'static str, diff --git a/components/error_code/src/pd.rs b/components/error_code/src/pd.rs index 60952e96922..782c4f3923b 100644 --- a/components/error_code/src/pd.rs +++ b/components/error_code/src/pd.rs @@ -1,15 +1,17 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
define_error_codes!( - "KV:PD:", + "KV:Pd:", - IO => ("IO", "", ""), + IO => ("Io", "", ""), CLUSTER_BOOTSTRAPPED => ("ClusterBootstraped", "", ""), CLUSTER_NOT_BOOTSTRAPPED => ("ClusterNotBootstraped", "", ""), INCOMPATIBLE => ("Imcompatible", "", ""), - GRPC => ("gRPC", "", ""), + GRPC => ("Grpc", "", ""), + STREAM_DISCONNECT => ("StreamDisconnect","",""), REGION_NOT_FOUND => ("RegionNotFound", "", ""), STORE_TOMBSTONE => ("StoreTombstone", "", ""), GLOBAL_CONFIG_NOT_FOUND => ("GlobalConfigNotFound","",""), + DATA_COMPACTED => ("DataCompacted","",""), UNKNOWN => ("Unknown", "", "") ); diff --git a/components/error_code/src/raftstore.rs b/components/error_code/src/raftstore.rs index 4d38de92284..35dfe564ef0 100644 --- a/components/error_code/src/raftstore.rs +++ b/components/error_code/src/raftstore.rs @@ -19,7 +19,7 @@ define_error_codes!( STALE_COMMAND => ("StaleCommand", "", ""), TRANSPORT => ("Transport", "", ""), COPROCESSOR => ("Coprocessor", "", ""), - IO => ("IO", "", ""), + IO => ("Io", "", ""), PROTOBUF => ("Protobuf", "", ""), ADDR_PARSE => ("AddressParse", "", ""), TIMEOUT => ("Timeout", "", ""), @@ -30,6 +30,9 @@ define_error_codes!( DEADLINE_EXCEEDED => ("DeadlineExceeded", "", ""), PENDING_PREPARE_MERGE => ("PendingPrepareMerge", "", ""), RECOVERY_IN_PROGRESS => ("RecoveryInProgress", "", ""), + FLASHBACK_IN_PROGRESS => ("FlashbackInProgress", "", ""), + FLASHBACK_NOT_PREPARED => ("FlashbackNotPrepared", "", ""), + IS_WITNESS => ("IsWitness", "", ""), SNAP_ABORT => ("SnapAbort", "", ""), SNAP_TOO_MANY => ("SnapTooMany", "", ""), @@ -64,6 +67,12 @@ impl ErrorCodeExt for errorpb::Error { DATA_IS_NOT_READY } else if self.has_recovery_in_progress() { RECOVERY_IN_PROGRESS + } else if self.has_flashback_in_progress() { + FLASHBACK_IN_PROGRESS + } else if self.has_flashback_not_prepared() { + FLASHBACK_NOT_PREPARED + } else if self.has_is_witness() { + IS_WITNESS } else { UNKNOWN } diff --git a/components/error_code/src/sst_importer.rs 
b/components/error_code/src/sst_importer.rs index e24209c92a1..001f4f146f6 100644 --- a/components/error_code/src/sst_importer.rs +++ b/components/error_code/src/sst_importer.rs @@ -1,13 +1,13 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. define_error_codes!( - "KV:SSTImporter:", + "KV:SstImporter:", IO => ("Io", "", ""), - GRPC => ("gRPC", "", ""), + GRPC => ("Grpc", "", ""), UUID => ("Uuid", "", ""), FUTURE => ("Future", "", ""), - ROCKSDB => ("RocksDB", "", ""), + ROCKSDB => ("RocksDb", "", ""), PARSE_INT_ERROR => ("ParseIntError", "", ""), FILE_EXISTS => ("FileExists", "", ""), FILE_CORRUPTED => ("FileCorrupted", "", ""), @@ -21,5 +21,6 @@ define_error_codes!( TTL_NOT_ENABLED => ("TtlNotEnabled", "", ""), TTL_LEN_NOT_EQUALS_TO_PAIRS => ("TtlLenNotEqualsToPairs", "", ""), INCOMPATIBLE_API_VERSION => ("IncompatibleApiVersion", "", ""), - INVALID_KEY_MODE => ("InvalidKeyMode", "", "") + INVALID_KEY_MODE => ("InvalidKeyMode", "", ""), + RESOURCE_NOT_ENOUTH => ("ResourceNotEnough", "", "") ); diff --git a/components/error_code/src/storage.rs b/components/error_code/src/storage.rs index 5336ab80bb0..8b41e7a797e 100644 --- a/components/error_code/src/storage.rs +++ b/components/error_code/src/storage.rs @@ -10,17 +10,18 @@ define_error_codes!( SCHED_TOO_BUSY => ("SchedTooBusy", "", ""), GC_WORKER_TOO_BUSY => ("GcWorkerTooBusy", "", ""), KEY_TOO_LARGE => ("KeyTooLarge", "", ""), - INVALID_CF => ("InvalidCF", "", ""), - CF_DEPRECATED => ("CFDeprecated", "", ""), + INVALID_CF => ("InvalidCf", "", ""), + CF_DEPRECATED => ("CfDeprecated", "", ""), TTL_NOT_ENABLED => ("TtlNotEnabled", "", ""), TTL_LEN_NOT_EQUALS_TO_PAIRS => ("TtlLenNotEqualsToPairs", "", ""), PROTOBUF => ("Protobuf", "", ""), - INVALID_TXN_TSO => ("INVALIDTXNTSO", "", ""), + INVALID_TXN_TSO => ("InvalidTxnTso", "", ""), INVALID_REQ_RANGE => ("InvalidReqRange", "", ""), BAD_FORMAT_LOCK => ("BadFormatLock", "", ""), BAD_FORMAT_WRITE => ("BadFormatWrite", "",""), KEY_IS_LOCKED => 
("KeyIsLocked", "", ""), MAX_TIMESTAMP_NOT_SYNCED => ("MaxTimestampNotSynced", "", ""), + FLASHBACK_NOT_PREPARED => ("FlashbackNotPrepared", "", ""), DEADLINE_EXCEEDED => ("DeadlineExceeded", "", ""), API_VERSION_NOT_MATCHED => ("ApiVersionNotMatched", "", ""), INVALID_KEY_MODE => ("InvalidKeyMode", "", ""), @@ -40,6 +41,9 @@ define_error_codes!( COMMIT_TS_TOO_LARGE => ("CommitTsTooLarge", "", ""), ASSERTION_FAILED => ("AssertionFailed", "", ""), + LOCK_IF_EXISTS_FAILED => ("LockIfExistsFailed", "", ""), + + PRIMARY_MISMATCH => ("PrimaryMismatch", "", ""), UNKNOWN => ("Unknown", "", "") ); diff --git a/components/external_storage/Cargo.toml b/components/external_storage/Cargo.toml index 049f8ab2e43..4ff13e564ff 100644 --- a/components/external_storage/Cargo.toml +++ b/components/external_storage/Cargo.toml @@ -16,19 +16,20 @@ cloud-storage-grpc = [ failpoints = ["fail/failpoints"] [dependencies] +async-compression = { version = "0.3.14", features = ["futures-io", "zstd"] } async-trait = "0.1" bytes = "1.0" -encryption = { path = "../encryption" } -engine_traits = { path = "../engine_traits" } +encryption = { workspace = true } +engine_traits = { workspace = true } fail = "0.5" ffi-support = { optional = true, version = "0.4.2" } -file_system = { path = "../file_system" } +file_system = { workspace = true } futures = "0.3" futures-executor = "0.3" futures-io = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } -grpcio = { version = "0.10", optional = true, default-features = false, features = ["openssl-vendored"] } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +grpcio = { workspace = true, optional = true } +kvproto = { workspace = true } lazy_static = "1.3" libloading = { optional = true, version = "0.7.0" } openssl = "0.10" @@ -36,11 +37,11 @@ prometheus = { version = "0.13", default-features = false, features = ["nightly" protobuf = { optional = true, version = "2" } rand = "0.8" rusoto_core = "0.46.0" -slog = { 
version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +slog = { workspace = true } # better to not use slog-global, but pass in the logger -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +slog-global = { workspace = true } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time", "fs", "process"] } tokio-util = { version = "0.7", features = ["compat"] } url = "2.0" diff --git a/components/external_storage/export/Cargo.toml b/components/external_storage/export/Cargo.toml index d67e2b7a15f..61e9bfa58df 100644 --- a/components/external_storage/export/Cargo.toml +++ b/components/external_storage/export/Cargo.toml @@ -40,45 +40,45 @@ cloud-storage-grpc = [ "futures", "futures-executor", "libc", - "signal", + "signal-hook", "slog", "slog-global", "slog-term", "tokio", "tokio-util", - "nix", ] [dependencies] -aws = { optional = true, path = "../../cloud/aws", default-features = false } -azure = { optional = true, path = "../../cloud/azure", default-features = false } -cloud = { path = "../../cloud", default_features = false } -lazy_static = { optional = true, version = "1.3" } -gcp = { optional = true, path = "../../cloud/gcp", default-features = false } -grpcio = { version = "0.10", optional = true, default-features = false, features = ["openssl-vendored"] } -encryption = { path = "../../encryption", default-features = false } -external_storage = { path = "../", default-features = false } -engine_traits = { path = "../../engine_traits", default-features = false } +async-compression = { version = "0.3.14", features = ["futures-io", "zstd"] } +async-trait = "0.1" +aws = { optional = true, workspace = true } +azure = { optional = true, workspace = true } +cloud = { workspace = true } +encryption = { 
workspace = true } +engine_traits = { workspace = true } +external_storage = { workspace = true } ffi-support = { optional = true, version = "0.4.2" } -file_system = { optional = true, path = "../../file_system" } +file_system = { workspace = true, optional = true } futures = { optional = true, version = "0.3" } futures-executor = { optional = true, version = "0.3" } futures-io = { version = "0.3" } futures-util = { version = "0.3", default-features = false, features = ["io"] } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +gcp = { optional = true, workspace = true } +grpcio = { workspace = true, optional = true } +kvproto = { workspace = true } +lazy_static = { optional = true, version = "1.3" } libloading = { optional = true, version = "0.7.0" } once_cell = { optional = true, version = "1.3.1" } protobuf = { optional = true, version = "2" } slog-global = { optional = true, version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../../tikv_util" } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time", "rt", "net"], optional = true } tokio-util = { version = "0.7", features = ["compat"], optional = true } url = "2.0" -async-trait = "0.1" [dev-dependencies] -matches = "0.1.8" futures-util = { version = "0.3", default-features = false, features = ["io"] } +matches = "0.1.8" rust-ini = "0.14.0" structopt = "0.3" tempfile = "3.1" @@ -89,8 +89,8 @@ name = "scli" path = "examples/scli.rs" [target.'cfg(unix)'.dependencies] -nix = { optional = true, version = "0.23" } -signal = { optional = true, version = "0.6" } +nix = { optional = true, version = "0.24" } +signal-hook = { optional = true, version = "0.3" } libc = { optional = true, version = "0.2" } slog = { optional = true, version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-term = { optional = true, version = "2.4" } diff --git 
a/components/external_storage/export/examples/scli.rs b/components/external_storage/export/examples/scli.rs index e98e24ab452..0ab54721b29 100644 --- a/components/external_storage/export/examples/scli.rs +++ b/components/external_storage/export/examples/scli.rs @@ -6,9 +6,15 @@ use std::{ path::Path, }; +#[cfg(feature = "cloud-azure")] +use external_storage_export::make_azblob_backend; +#[cfg(feature = "cloud-gcp")] +use external_storage_export::make_gcs_backend; +#[cfg(feature = "cloud-aws")] +use external_storage_export::make_s3_backend; use external_storage_export::{ - create_storage, make_azblob_backend, make_cloud_backend, make_gcs_backend, make_hdfs_backend, - make_local_backend, make_noop_backend, make_s3_backend, ExternalStorage, UnpinReader, + create_storage, make_cloud_backend, make_hdfs_backend, make_local_backend, make_noop_backend, + ExternalStorage, UnpinReader, }; use futures_util::io::{copy, AllowStdIo}; use ini::ini::Ini; @@ -144,7 +150,10 @@ fn create_s3_storage(opt: &Opt) -> Result { if let Some(prefix) = &opt.prefix { config.prefix = prefix.to_string(); } - Ok(make_s3_backend(config)) + #[cfg(feature = "cloud-aws")] + return Ok(make_s3_backend(config)); + #[cfg(not(feature = "cloud-aws"))] + return Err(Error::new(ErrorKind::Other, "missing feature")); } fn create_gcs_storage(opt: &Opt) -> Result { @@ -164,7 +173,10 @@ fn create_gcs_storage(opt: &Opt) -> Result { if let Some(prefix) = &opt.prefix { config.prefix = prefix.to_string(); } - Ok(make_gcs_backend(config)) + #[cfg(feature = "cloud-gcp")] + return Ok(make_gcs_backend(config)); + #[cfg(not(feature = "cloud-gcp"))] + return Err(Error::new(ErrorKind::Other, "missing feature")); } fn create_azure_storage(opt: &Opt) -> Result { @@ -200,7 +212,10 @@ fn create_azure_storage(opt: &Opt) -> Result { if let Some(prefix) = &opt.prefix { config.prefix = prefix.to_string(); } - Ok(make_azblob_backend(config)) + #[cfg(feature = "cloud-azure")] + return Ok(make_azblob_backend(config)); + 
#[cfg(not(feature = "cloud-azure"))] + return Err(Error::new(ErrorKind::Other, "missing feature")); } fn process() -> Result<()> { diff --git a/components/external_storage/export/src/bin/tikv-cloud-storage.rs b/components/external_storage/export/src/bin/tikv-cloud-storage.rs index 3011a5079d1..07cd8507948 100644 --- a/components/external_storage/export/src/bin/tikv-cloud-storage.rs +++ b/components/external_storage/export/src/bin/tikv-cloud-storage.rs @@ -33,16 +33,19 @@ fn main() { #[cfg(unix)] mod wait { use libc::c_int; - use nix::sys::signal::{SIGHUP, SIGINT, SIGTERM, SIGUSR1, SIGUSR2}; - use signal::trap::Trap; + use signal_hook::{ + consts::{SIGHUP, SIGINT, SIGTERM, SIGUSR1, SIGUSR2}, + iterator::Signals, + Signals, + }; use slog_global::info; pub fn for_signal() { - let trap = Trap::trap(&[SIGTERM, SIGINT, SIGHUP, SIGUSR1, SIGUSR2]); - for sig in trap { - match sig { - SIGUSR1 | SIGTERM | SIGINT | SIGHUP => { - info!("receive signal {}, stopping server...", sig as c_int); + let mut signals = Signals::new(&[SIGTERM, SIGINT, SIGHUP]).unwrap(); + for signal in &mut signals { + match signal { + SIGTERM | SIGINT | SIGHUP => { + info!("receive signal {}, stopping server...", signal); break; } // TODO: handle more signals diff --git a/components/external_storage/export/src/dylib.rs b/components/external_storage/export/src/dylib.rs index a02f5f2fade..308973de95e 100644 --- a/components/external_storage/export/src/dylib.rs +++ b/components/external_storage/export/src/dylib.rs @@ -188,7 +188,7 @@ pub mod staticlib { .map_err(anyhow_to_io_log_error) } - fn read(&self, _name: &str) -> Box { + fn read(&self, _name: &str) -> crate::ExternalData<'_> { unimplemented!("use restore instead of read") } diff --git a/components/external_storage/export/src/export.rs b/components/external_storage/export/src/export.rs index b9d4b098394..ad31dc363ae 100644 --- a/components/external_storage/export/src/export.rs +++ b/components/external_storage/export/src/export.rs @@ -1,13 +1,9 @@ 
// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -//! To use External storage with protobufs as an application, import this module. -//! external_storage contains the actual library code +//! To use External storage with protobufs as an application, import this +//! module. external_storage contains the actual library code //! Cloud provider backends are under components/cloud -use std::{ - io::{self, Write}, - path::Path, - sync::Arc, -}; +use std::{io, path::Path, sync::Arc}; use async_trait::async_trait; #[cfg(feature = "cloud-aws")] @@ -18,28 +14,25 @@ pub use azure::{AzureStorage, Config as AzureConfig}; use cloud::blob::BlobConfig; use cloud::blob::{BlobStorage, PutResource}; use encryption::DataKeyManager; -use engine_traits::FileEncryptionInfo; #[cfg(feature = "cloud-storage-dylib")] use external_storage::dylib_client; #[cfg(feature = "cloud-storage-grpc")] use external_storage::grpc_client; -use external_storage::{encrypt_wrap_reader, record_storage_create, BackendConfig, HdfsStorage}; pub use external_storage::{ - read_external_storage_into_file, ExternalStorage, LocalStorage, NoopStorage, UnpinReader, + compression_reader_dispatcher, encrypt_wrap_reader, read_external_storage_info_buff, + read_external_storage_into_file, record_storage_create, BackendConfig, ExternalData, + ExternalStorage, HdfsStorage, LocalStorage, NoopStorage, RestoreConfig, UnpinReader, + MIN_READ_SPEED, }; -use futures_io::AsyncRead; #[cfg(feature = "cloud-gcp")] -pub use gcp::{Config as GCSConfig, GCSStorage}; +pub use gcp::{Config as GcsConfig, GcsStorage}; pub use kvproto::brpb::StorageBackend_oneof_backend as Backend; #[cfg(any(feature = "cloud-gcp", feature = "cloud-aws", feature = "cloud-azure"))] use kvproto::brpb::{AzureBlobStorage, Gcs, S3}; use kvproto::brpb::{CloudDynamic, Noop, StorageBackend}; +use tikv_util::time::{Instant, Limiter}; #[cfg(feature = "cloud-storage-dylib")] use tikv_util::warn; -use tikv_util::{ - stream::block_on_external_io, - 
time::{Instant, Limiter}, -}; #[cfg(feature = "cloud-storage-dylib")] use crate::dylib; @@ -55,8 +48,9 @@ pub fn create_storage( } } -// when the flag cloud-storage-dylib or cloud-storage-grpc is set create_storage is automatically wrapped with a client -// This function is used by the library/server to avoid any wrapping +// when the flag cloud-storage-dylib or cloud-storage-grpc is set create_storage +// is automatically wrapped with a client This function is used by the +// library/server to avoid any wrapping pub fn create_storage_no_client( storage_backend: &StorageBackend, config: BackendConfig, @@ -138,7 +132,7 @@ fn create_config(backend: &Backend) -> Option>> { } #[cfg(feature = "cloud-gcp")] Backend::Gcs(config) => { - let conf = GCSConfig::from_input(config.clone()); + let conf = GcsConfig::from_input(config.clone()); Some(conf.map(|c| Box::new(c) as Box)) } #[cfg(feature = "cloud-azure")] @@ -154,7 +148,7 @@ fn create_config(backend: &Backend) -> Option>> { } #[cfg(feature = "cloud-gcp")] "gcp" | "gcs" => { - let conf = GCSConfig::from_cloud_dynamic(&dyn_backend); + let conf = GcsConfig::from_cloud_dynamic(&dyn_backend); Some(conf.map(|c| Box::new(c) as Box)) } #[cfg(feature = "cloud-azure")] @@ -182,7 +176,9 @@ fn create_backend_inner( Backend::Hdfs(hdfs) => { Box::new(HdfsStorage::new(&hdfs.remote, backend_config.hdfs_config)?) 
} - Backend::Noop(_) => Box::new(NoopStorage::default()) as Box, + Backend::Noop(_) => { + Box::::default() as Box + } #[cfg(feature = "cloud-aws")] Backend::S3(config) => { let mut s = S3Storage::from_input(config.clone())?; @@ -190,14 +186,14 @@ fn create_backend_inner( blob_store(s) } #[cfg(feature = "cloud-gcp")] - Backend::Gcs(config) => blob_store(GCSStorage::from_input(config.clone())?), + Backend::Gcs(config) => blob_store(GcsStorage::from_input(config.clone())?), #[cfg(feature = "cloud-azure")] Backend::AzureBlobStorage(config) => blob_store(AzureStorage::from_input(config.clone())?), Backend::CloudDynamic(dyn_backend) => match dyn_backend.provider_name.as_str() { #[cfg(feature = "cloud-aws")] "aws" | "s3" => blob_store(S3Storage::from_cloud_dynamic(dyn_backend)?), #[cfg(feature = "cloud-gcp")] - "gcp" | "gcs" => blob_store(GCSStorage::from_cloud_dynamic(dyn_backend)?), + "gcp" | "gcs" => blob_store(GcsStorage::from_cloud_dynamic(dyn_backend)?), #[cfg(feature = "cloud-azure")] "azure" | "azblob" => blob_store(AzureStorage::from_cloud_dynamic(dyn_backend)?), _ => { @@ -304,13 +300,13 @@ impl std::ops::Deref for BlobStore { } } -pub struct EncryptedExternalStorage { +pub struct EncryptedExternalStorage { pub key_manager: Arc, - pub storage: Box, + pub storage: S, } #[async_trait] -impl ExternalStorage for EncryptedExternalStorage { +impl ExternalStorage for EncryptedExternalStorage { fn name(&self) -> &'static str { self.storage.name() } @@ -320,32 +316,49 @@ impl ExternalStorage for EncryptedExternalStorage { async fn write(&self, name: &str, reader: UnpinReader, content_length: u64) -> io::Result<()> { self.storage.write(name, reader, content_length).await } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> ExternalData<'_> { self.storage.read(name) } - fn restore( + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { + self.storage.read_part(name, off, len) + } + async fn restore( &self, storage_name: &str, 
restore_name: std::path::PathBuf, expected_length: u64, - expected_sha256: Option>, speed_limiter: &Limiter, - file_crypter: Option, + restore_config: RestoreConfig, ) -> io::Result<()> { - let reader = self.read(storage_name); - let file_writer: &mut dyn Write = - &mut self.key_manager.create_file_for_write(&restore_name)?; + let RestoreConfig { + range, + compression_type, + expected_sha256, + file_crypter, + } = restore_config; + + let reader = { + let inner = if let Some((off, len)) = range { + self.read_part(storage_name, off, len) + } else { + self.read(storage_name) + }; + + compression_reader_dispatcher(compression_type, inner)? + }; + let file_writer = self.key_manager.create_file_for_write(&restore_name)?; let min_read_speed: usize = 8192; let mut input = encrypt_wrap_reader(file_crypter, reader)?; - block_on_external_io(read_external_storage_into_file( + read_external_storage_into_file( &mut input, file_writer, speed_limiter, expected_length, expected_sha256, min_read_speed, - )) + ) + .await } } @@ -363,7 +376,11 @@ impl ExternalStorage for BlobStore { .await } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> ExternalData<'_> { (**self).get(name) } + + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { + (**self).get_part(name, off, len) + } } diff --git a/components/external_storage/export/src/request.rs b/components/external_storage/export/src/request.rs index eaf618746c0..5623c0732d7 100644 --- a/components/external_storage/export/src/request.rs +++ b/components/external_storage/export/src/request.rs @@ -58,7 +58,8 @@ pub async fn restore_inner( expected_length: u64, ) -> io::Result<()> { let storage = create_storage_no_client(&storage_backend)?; - // TODO: support encryption. The service must be launched with or sent a DataKeyManager + // TODO: support encryption. 
The service must be launched with or sent a + // DataKeyManager let output: &mut dyn io::Write = &mut File::create(file_name)?; // the minimum speed of reading data, in bytes/second. // if reading speed is slower than this rate, we will stop with diff --git a/components/external_storage/src/dylib_client.rs b/components/external_storage/src/dylib_client.rs index 6d6dc35cf8a..9e2748c2011 100644 --- a/components/external_storage/src/dylib_client.rs +++ b/components/external_storage/src/dylib_client.rs @@ -92,7 +92,7 @@ impl ExternalStorage for ExternalStorageClient { .map_err(anyhow_to_io_log_error) } - fn read(&self, _name: &str) -> Box { + fn read(&self, _name: &str) -> crate::ExternalData<'_> { unimplemented!("use restore instead of read") } diff --git a/components/external_storage/src/grpc_client.rs b/components/external_storage/src/grpc_client.rs index 3d715dfcd47..e836d8fb58a 100644 --- a/components/external_storage/src/grpc_client.rs +++ b/components/external_storage/src/grpc_client.rs @@ -95,7 +95,7 @@ impl ExternalStorage for ExternalStorageClient { .map_err(anyhow_to_io_log_error) } - fn read(&self, _name: &str) -> Box { + fn read(&self, _name: &str) -> crate::ExternalData<'_> { unimplemented!("use restore instead of read") } diff --git a/components/external_storage/src/hdfs.rs b/components/external_storage/src/hdfs.rs index 175104d06cb..17556490320 100644 --- a/components/external_storage/src/hdfs.rs +++ b/components/external_storage/src/hdfs.rs @@ -7,7 +7,7 @@ use tokio::{io as async_io, process::Command}; use tokio_util::compat::FuturesAsyncReadCompatExt; use url::Url; -use crate::{ExternalStorage, UnpinReader}; +use crate::{ExternalData, ExternalStorage, UnpinReader}; /// Convert `hdfs:///path` to `/path` fn try_convert_to_path(url: &Url) -> &str { @@ -101,7 +101,7 @@ impl ExternalStorage for HdfsStorage { } cmd_with_args.extend([&cmd_path, "dfs", "-put", "-", path]); info!("calling hdfs"; "cmd" => ?cmd_with_args); - let mut hdfs_cmd = 
Command::new(&cmd_with_args[0]) + let mut hdfs_cmd = Command::new(cmd_with_args[0]) .stdin(Stdio::piped()) .stdout(Stdio::piped()) .stderr(Stdio::piped()) @@ -131,7 +131,11 @@ impl ExternalStorage for HdfsStorage { } } - fn read(&self, _name: &str) -> Box { + fn read(&self, _name: &str) -> ExternalData<'_> { + unimplemented!("currently only HDFS export is implemented") + } + + fn read_part(&self, _name: &str, _off: u64, _len: u64) -> ExternalData<'_> { unimplemented!("currently only HDFS export is implemented") } } diff --git a/components/external_storage/src/lib.rs b/components/external_storage/src/lib.rs index 477b0a39a64..211a1b52ad6 100644 --- a/components/external_storage/src/lib.rs +++ b/components/external_storage/src/lib.rs @@ -9,29 +9,31 @@ extern crate slog_global; extern crate tikv_alloc; use std::{ - fs, io::{self, Write}, marker::Unpin, sync::Arc, time::Duration, }; +use async_compression::futures::bufread::ZstdDecoder; use async_trait::async_trait; -use encryption::{encryption_method_from_db_encryption_method, DecrypterReader, Iv}; +use encryption::{from_engine_encryption_method, DecrypterReader, Iv}; use engine_traits::FileEncryptionInfo; use file_system::File; +use futures::io::BufReader; use futures_io::AsyncRead; use futures_util::AsyncReadExt; +use kvproto::brpb::CompressionType; use openssl::hash::{Hasher, MessageDigest}; use tikv_util::{ - stream::{block_on_external_io, READ_BUF_SIZE}, + stream::READ_BUF_SIZE, time::{Instant, Limiter}, }; use tokio::time::timeout; mod hdfs; pub use hdfs::{HdfsConfig, HdfsStorage}; -mod local; +pub mod local; pub use local::LocalStorage; mod noop; pub use noop::NoopStorage; @@ -51,17 +53,50 @@ pub fn record_storage_create(start: Instant, storage: &dyn ExternalStorage) { } /// UnpinReader is a simple wrapper for AsyncRead + Unpin + Send. -/// This wrapper would remove the lifetime at the argument of the generted async function -/// in order to make rustc happy. (And reduce the length of signture of write.) 
-/// see https://github.com/rust-lang/rust/issues/63033 +/// This wrapper would remove the lifetime at the argument of the generated +/// async function in order to make rustc happy. (And reduce the length of +/// signature of write.) see https://github.com/rust-lang/rust/issues/63033 pub struct UnpinReader(pub Box); +pub type ExternalData<'a> = Box; + #[derive(Debug, Default)] pub struct BackendConfig { pub s3_multi_part_size: usize, pub hdfs_config: HdfsConfig, } +#[derive(Debug, Default)] +pub struct RestoreConfig { + pub range: Option<(u64, u64)>, + pub compression_type: Option, + pub expected_sha256: Option>, + pub file_crypter: Option, +} + +/// a reader dispatcher for different compression type. +pub fn compression_reader_dispatcher( + compression_type: Option, + inner: ExternalData<'_>, +) -> io::Result> { + match compression_type { + Some(c) => match c { + // The log files generated from TiKV v6.2.0 use the default value (0). + // So here regard Unkown(0) as uncompressed type. + CompressionType::Unknown => Ok(inner), + CompressionType::Zstd => Ok(Box::new(ZstdDecoder::new(BufReader::new(inner)))), + _ => Err(io::Error::new( + io::ErrorKind::Other, + format!( + "the compression type is unimplemented, compression type id {:?}", + c + ), + )), + }, + None => Ok(inner), + } +} + /// An abstraction of an external storage. // TODO: these should all be returning a future (i.e. async fn). #[async_trait] @@ -74,45 +109,53 @@ pub trait ExternalStorage: 'static + Send + Sync { async fn write(&self, name: &str, reader: UnpinReader, content_length: u64) -> io::Result<()>; /// Read all contents of the given path. - fn read(&self, name: &str) -> Box; + fn read(&self, name: &str) -> ExternalData<'_>; + + /// Read part of contents of the given path. 
+ fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_>; /// Read from external storage and restore to the given path - fn restore( + async fn restore( &self, storage_name: &str, restore_name: std::path::PathBuf, expected_length: u64, - expected_sha256: Option>, speed_limiter: &Limiter, - file_crypter: Option, + restore_config: RestoreConfig, ) -> io::Result<()> { - let reader = self.read(storage_name); - if let Some(p) = restore_name.parent() { - // try create all parent dirs from the path (optional). - fs::create_dir_all(p).or_else(|e| { - if e.kind() == io::ErrorKind::AlreadyExists { - Ok(()) - } else { - Err(e) - } - })?; - } - let output: &mut dyn Write = &mut File::create(restore_name)?; + let RestoreConfig { + range, + compression_type, + expected_sha256, + file_crypter, + } = restore_config; + + let reader = { + let inner = if let Some((off, len)) = range { + self.read_part(storage_name, off, len) + } else { + self.read(storage_name) + }; + + compression_reader_dispatcher(compression_type, inner)? + }; + let output = File::create(restore_name)?; // the minimum speed of reading data, in bytes/second. // if reading speed is slower than this rate, we will stop with // a "TimedOut" error. // (at 8 KB/s for a 2 MB buffer, this means we timeout after 4m16s.) 
let min_read_speed: usize = 8192; - let mut input = encrypt_wrap_reader(file_crypter, reader)?; + let input = encrypt_wrap_reader(file_crypter, reader)?; - block_on_external_io(read_external_storage_into_file( - &mut input, + read_external_storage_into_file( + input, output, speed_limiter, expected_length, expected_sha256, min_read_speed, - )) + ) + .await } } @@ -130,9 +173,32 @@ impl ExternalStorage for Arc { (**self).write(name, reader, content_length).await } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> ExternalData<'_> { (**self).read(name) } + + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { + (**self).read_part(name, off, len) + } + + async fn restore( + &self, + storage_name: &str, + restore_name: std::path::PathBuf, + expected_length: u64, + speed_limiter: &Limiter, + restore_config: RestoreConfig, + ) -> io::Result<()> { + self.as_ref() + .restore( + storage_name, + restore_name, + expected_length, + speed_limiter, + restore_config, + ) + .await + } } #[async_trait] @@ -149,21 +215,44 @@ impl ExternalStorage for Box { self.as_ref().write(name, reader, content_length).await } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> ExternalData<'_> { self.as_ref().read(name) } + + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { + self.as_ref().read_part(name, off, len) + } + + async fn restore( + &self, + storage_name: &str, + restore_name: std::path::PathBuf, + expected_length: u64, + speed_limiter: &Limiter, + restore_config: RestoreConfig, + ) -> io::Result<()> { + self.as_ref() + .restore( + storage_name, + restore_name, + expected_length, + speed_limiter, + restore_config, + ) + .await + } } /// Wrap the reader with file_crypter. /// Return the reader directly if file_crypter is None. 
-pub fn encrypt_wrap_reader<'a>( +pub fn encrypt_wrap_reader( file_crypter: Option, - reader: Box, -) -> io::Result> { + reader: ExternalData<'_>, +) -> io::Result> { let input = match file_crypter { Some(x) => Box::new(DecrypterReader::new( reader, - encryption_method_from_db_encryption_method(x.method), + from_engine_encryption_method(x.method), &x.key, Iv::from_slice(&x.iv)?, )?), @@ -173,14 +262,18 @@ pub fn encrypt_wrap_reader<'a>( Ok(input) } -pub async fn read_external_storage_into_file( - input: &mut (dyn AsyncRead + Unpin), - output: &mut dyn Write, +pub async fn read_external_storage_into_file( + mut input: In, + mut output: Out, speed_limiter: &Limiter, expected_length: u64, expected_sha256: Option>, min_read_speed: usize, -) -> io::Result<()> { +) -> io::Result<()> +where + In: AsyncRead + Unpin, + Out: Write, +{ let dur = Duration::from_secs((READ_BUF_SIZE / min_read_speed) as u64); // do the I/O copy from external_storage to the local file. @@ -248,3 +341,88 @@ pub async fn read_external_storage_into_file( Ok(()) } + +pub const MIN_READ_SPEED: usize = 8192; + +pub async fn read_external_storage_info_buff( + reader: &mut (dyn AsyncRead + Unpin + Send), + speed_limiter: &Limiter, + expected_length: u64, + expected_sha256: Option>, + min_read_speed: usize, +) -> io::Result> { + // the minimum speed of reading data, in bytes/second. + // if reading speed is slower than this rate, we will stop with + // a "TimedOut" error. + // (at 8 KB/s for a 2 MB buffer, this means we timeout after 4m16s.) + let read_speed = if min_read_speed > 0 { + min_read_speed + } else { + MIN_READ_SPEED + }; + let dur = Duration::from_secs((READ_BUF_SIZE / read_speed) as u64); + let mut output = Vec::new(); + let mut buffer = vec![0u8; READ_BUF_SIZE]; + + loop { + // separate the speed limiting from actual reading so it won't + // affect the timeout calculation. 
+ let bytes_read = timeout(dur, reader.read(&mut buffer)) + .await + .map_err(|_| io::ErrorKind::TimedOut)??; + if bytes_read == 0 { + break; + } + + speed_limiter.consume(bytes_read).await; + output.append(&mut buffer[..bytes_read].to_vec()); + } + + // check length of file + if expected_length > 0 && output.len() != expected_length as usize { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "length not match, downloaded size {}, expected {}", + output.len(), + expected_length + ), + )); + } + // check sha256 of file + if let Some(sha256) = expected_sha256 { + let mut hasher = Hasher::new(MessageDigest::sha256()).map_err(|err| { + io::Error::new( + io::ErrorKind::Other, + format!("openssl hasher failed to init: {}", err), + ) + })?; + hasher.update(&output).map_err(|err| { + io::Error::new( + io::ErrorKind::Other, + format!("openssl hasher udpate failed: {}", err), + ) + })?; + + let cal_sha256 = hasher.finish().map_or_else( + |err| { + Err(io::Error::new( + io::ErrorKind::Other, + format!("openssl hasher finish failed: {}", err), + )) + }, + |bytes| Ok(bytes.to_vec()), + )?; + if !sha256.eq(&cal_sha256) { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "sha256 not match, expect: {:?}, calculate: {:?}", + sha256, cal_sha256, + ), + )); + } + } + + Ok(output) +} diff --git a/components/external_storage/src/local.rs b/components/external_storage/src/local.rs index 5fd899b17f9..0bf6be65107 100644 --- a/components/external_storage/src/local.rs +++ b/components/external_storage/src/local.rs @@ -2,15 +2,13 @@ use std::{ fs::File as StdFile, - io, - marker::Unpin, + io::{self, BufReader, Read, Seek}, path::{Path, PathBuf}, sync::Arc, }; use async_trait::async_trait; use futures::io::AllowStdIo; -use futures_io::AsyncRead; use futures_util::stream::TryStreamExt; use rand::Rng; use tikv_util::stream::error_stream; @@ -54,7 +52,7 @@ fn url_for(base: &Path) -> url::Url { u } -const STORAGE_NAME: &str = "local"; +pub const 
STORAGE_NAME: &str = "local"; #[async_trait] impl ExternalStorage for LocalStorage { @@ -84,8 +82,9 @@ impl ExternalStorage for LocalStorage { )); } // create the parent dir if there isn't one. - // note: we may write to arbitrary directory here if the path contains things like '../' - // but internally the file name should be fully controlled by TiKV, so maybe it is OK? + // note: we may write to arbitrary directory here if the path contains things + // like '../' but internally the file name should be fully controlled by + // TiKV, so maybe it is OK? if let Some(parent) = Path::new(name).parent() { fs::create_dir_all(self.base.join(parent)) .await @@ -100,12 +99,12 @@ impl ExternalStorage for LocalStorage { } })?; } - // Sanitize check, do not save file if it is already exist. + + // Because s3 could support writing(put_object) a existed object. + // For the interface consistent with s3, local storage need also support write a + // existed file. if fs::metadata(self.base.join(name)).await.is_ok() { - return Err(io::Error::new( - io::ErrorKind::AlreadyExists, - format!("[{}] is already exists in {}", name, self.base.display()), - )); + info!("[{}] is already exists in {}", name, self.base.display()); } let tmp_path = self.tmp_path(Path::new(name)); let mut tmp_f = File::create(&tmp_path).await?; @@ -118,16 +117,34 @@ impl ExternalStorage for LocalStorage { self.base_dir.sync_all().await } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> crate::ExternalData<'_> { debug!("read file from local storage"; "name" => %name, "base" => %self.base.display()); - // We used std i/o here for removing the requirement of tokio reactor when restoring. + // We used std i/o here for removing the requirement of tokio reactor when + // restoring. // FIXME: when restore side get ready, use tokio::fs::File for returning. 
match StdFile::open(self.base.join(name)) { Ok(file) => Box::new(AllowStdIo::new(file)) as _, Err(e) => Box::new(error_stream(e).into_async_read()) as _, } } + + fn read_part(&self, name: &str, off: u64, len: u64) -> crate::ExternalData<'_> { + debug!("read part of file from local storage"; + "name" => %name, "off" => %off, "len" => %len, "base" => %self.base.display()); + + let mut file = match StdFile::open(self.base.join(name)) { + Ok(file) => file, + Err(e) => return Box::new(error_stream(e).into_async_read()) as _, + }; + match file.seek(std::io::SeekFrom::Start(off)) { + Ok(_) => (), + Err(e) => return Box::new(error_stream(e).into_async_read()) as _, + }; + let reader = BufReader::new(file); + let take = reader.take(len); + Box::new(AllowStdIo::new(take)) as _ + } } #[cfg(test)] @@ -215,4 +232,26 @@ mod tests { fn test_url_of_backend() { assert_eq!(url_for(Path::new("/tmp/a")).to_string(), "local:///tmp/a"); } + + #[tokio::test] + async fn test_write_existed_file() { + let temp_dir = Builder::new().tempdir().unwrap(); + let path = temp_dir.path(); + let ls = LocalStorage::new(path).unwrap(); + + let filename = "existed.file"; + let buf1: &[u8] = b"pingcap"; + let buf2: &[u8] = b"tikv"; + ls.write(filename, UnpinReader(Box::new(buf1)), buf1.len() as _) + .await + .unwrap(); + ls.write(filename, UnpinReader(Box::new(buf2)), buf2.len() as _) + .await + .unwrap(); + + let mut read_buff: Vec = Vec::new(); + ls.read(filename).read_to_end(&mut read_buff).await.unwrap(); + assert_eq!(read_buff.len(), 4); + assert_eq!(&read_buff, buf2); + } } diff --git a/components/external_storage/src/metrics.rs b/components/external_storage/src/metrics.rs index 1cb0c37cfa8..99dabca158e 100644 --- a/components/external_storage/src/metrics.rs +++ b/components/external_storage/src/metrics.rs @@ -8,7 +8,7 @@ lazy_static! 
{ "tikv_external_storage_create_seconds", "Bucketed histogram of creating external storage duration", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); } diff --git a/components/external_storage/src/noop.rs b/components/external_storage/src/noop.rs index cb590ca6e44..50e9c43c7bc 100644 --- a/components/external_storage/src/noop.rs +++ b/components/external_storage/src/noop.rs @@ -1,14 +1,11 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::marker::Unpin; - use async_trait::async_trait; -use futures_io::AsyncRead; use tokio::io; use tokio_util::compat::{FuturesAsyncReadCompatExt, TokioAsyncReadCompatExt}; use super::ExternalStorage; -use crate::UnpinReader; +use crate::{ExternalData, UnpinReader}; /// A storage saves files into void. /// It is mainly for test use. @@ -44,7 +41,11 @@ impl ExternalStorage for NoopStorage { Ok(()) } - fn read(&self, _name: &str) -> Box { + fn read(&self, _name: &str) -> ExternalData<'_> { + Box::new(io::empty().compat()) + } + + fn read_part(&self, _name: &str, _off: u64, _len: u64) -> ExternalData<'_> { Box::new(io::empty().compat()) } } diff --git a/components/external_storage/src/request.rs b/components/external_storage/src/request.rs index ef4fa54e448..7f1a81d49b7 100644 --- a/components/external_storage/src/request.rs +++ b/components/external_storage/src/request.rs @@ -24,7 +24,8 @@ pub fn write_sender( // currently it is copying into an intermediate buffer // Writing to a file here uses up disk space // But as a positive it gets the backup data out of the DB the fastest - // Currently this waits for the file to be completely written before sending to storage + // Currently this waits for the file to be completely written before sending to + // storage runtime.enter(|| { block_on(async { let msg = |action: &str| format!("{} file {:?}", action, &file_path); diff --git a/components/file_system/Cargo.toml 
b/components/file_system/Cargo.toml index aa1cb56a991..2252ebc3f1b 100644 --- a/components/file_system/Cargo.toml +++ b/components/file_system/Cargo.toml @@ -8,25 +8,24 @@ publish = false bcc-iosnoop = ["bcc"] [dependencies] -collections = { path = "../collections" } +collections = { workspace = true } crc32fast = "1.2" crossbeam-utils = "0.8.0" fs2 = "0.4" lazy_static = "1.3" libc = "0.2" -nix = "0.23" -online_config = { path = "../online_config" } +online_config = { workspace = true } openssl = "0.10" parking_lot = "0.12" prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" rand = "0.8" serde = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } strum = { version = "0.20", features = ["derive"] } -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time"] } [dev-dependencies] diff --git a/components/file_system/src/file.rs b/components/file_system/src/file.rs index 93269d5da10..c072b8f852f 100644 --- a/components/file_system/src/file.rs +++ b/components/file_system/src/file.rs @@ -13,12 +13,13 @@ use std::{ // Extention Traits use fs2::FileExt; -use super::{get_io_rate_limiter, get_io_type, IOOp, IORateLimiter}; +use super::{get_io_rate_limiter, get_io_type, IoOp, IoRateLimiter}; -/// A wrapper around `std::fs::File` with capability to track and regulate IO flow. +/// A wrapper around `std::fs::File` with capability to track and regulate IO +/// flow. 
pub struct File { inner: fs::File, - limiter: Option>, + limiter: Option>, } impl Debug for File { @@ -39,7 +40,7 @@ impl File { #[cfg(test)] pub fn open_with_limiter>( path: P, - limiter: Option>, + limiter: Option>, ) -> io::Result { let inner = fs::File::open(path)?; Ok(File { inner, limiter }) @@ -56,7 +57,7 @@ impl File { #[cfg(test)] pub fn create_with_limiter>( path: P, - limiter: Option>, + limiter: Option>, ) -> io::Result { let inner = fs::File::create(path)?; Ok(File { inner, limiter }) @@ -104,7 +105,7 @@ impl Read for File { let mut remains = buf.len(); let mut pos = 0; while remains > 0 { - let allowed = limiter.request(get_io_type(), IOOp::Read, remains); + let allowed = limiter.request(get_io_type(), IoOp::Read, remains); let read = self.inner.read(&mut buf[pos..pos + allowed])?; pos += read; remains -= read; @@ -131,7 +132,7 @@ impl Write for File { let mut remains = buf.len(); let mut pos = 0; while remains > 0 { - let allowed = limiter.request(get_io_type(), IOOp::Write, remains); + let allowed = limiter.request(get_io_type(), IoOp::Write, remains); let written = self.inner.write(&buf[pos..pos + allowed])?; pos += written; remains -= written; @@ -261,7 +262,7 @@ mod tests { .prefix("test_instrumented_file") .tempdir() .unwrap(); - let limiter = Arc::new(IORateLimiter::new_for_test()); + let limiter = Arc::new(IoRateLimiter::new_for_test()); // make sure read at most one bytes at a time limiter.set_io_rate_limit(20 /* 1s / refill_period */); let stats = limiter.statistics().unwrap(); @@ -269,24 +270,24 @@ mod tests { let tmp_file = tmp_dir.path().join("instrumented.txt"); let content = String::from("drink full and descend"); { - let _guard = WithIOType::new(IOType::ForegroundWrite); + let _guard = WithIoType::new(IoType::ForegroundWrite); let mut f = File::create_with_limiter(&tmp_file, Some(limiter.clone())).unwrap(); f.write_all(content.as_bytes()).unwrap(); f.sync_all().unwrap(); assert_eq!( - stats.fetch(IOType::ForegroundWrite, IOOp::Write), 
+ stats.fetch(IoType::ForegroundWrite, IoOp::Write), content.len() ); } { - let _guard = WithIOType::new(IOType::Export); + let _guard = WithIoType::new(IoType::Export); let mut buffer = String::new(); let mut f = File::open_with_limiter(&tmp_file, Some(limiter)).unwrap(); assert_eq!(f.read_to_string(&mut buffer).unwrap(), content.len()); assert_eq!(buffer, content); // read_to_string only exit when file.read() returns zero, which means // it requires two EOF reads to finish the call. - assert_eq!(stats.fetch(IOType::Export, IOOp::Read), content.len() + 2); + assert_eq!(stats.fetch(IoType::Export, IoOp::Read), content.len() + 2); } } diff --git a/components/file_system/src/io_stats/biosnoop.rs b/components/file_system/src/io_stats/biosnoop.rs index cbe622f78f8..6b804bfed87 100644 --- a/components/file_system/src/io_stats/biosnoop.rs +++ b/components/file_system/src/io_stats/biosnoop.rs @@ -14,7 +14,7 @@ use crossbeam_utils::CachePadded; use strum::{EnumCount, IntoEnumIterator}; use tikv_util::sys::thread; -use crate::{metrics::*, IOBytes, IOType}; +use crate::{metrics::*, IoBytes, IoType}; /// Biosnoop leverages BCC to make use of eBPF to get disk IO of TiKV requests. /// The BCC code is in `biosnoop.c` which is compiled and attached kernel on @@ -29,17 +29,17 @@ use crate::{metrics::*, IOBytes, IOType}; /// by address, then all the IO requests for that thread will be recorded in /// corresponding type's map in BCC. /// -/// With that information, every time calling `IOContext` it get the stored stats -/// from corresponding type's map in BCC. Thus it enables TiKV to get the latency and -/// bytes of read/write request per IO-type. +/// With that information, every time calling `IoContext` it get the stored +/// stats from corresponding type's map in BCC. Thus it enables TiKV to get the +/// latency and bytes of read/write request per IO-type. const MAX_THREAD_IDX: usize = 192; // Hold the BPF to keep it not dropped. 
// The two tables are `stats_by_type` and `type_by_pid` respectively. -static mut BPF_CONTEXT: Option = None; +static mut BPF_CONTEXT: Option = None; -struct BPFContext { +struct BpfContext { bpf: BPF, stats_table: Table, type_table: Table, @@ -56,9 +56,9 @@ struct BPFContext { // and kernel. Thus no need to make the elements atomic. Also use padding to // avoid false sharing. // Leave the last element as reserved, when there is no available index, all -// other threads will be allocated to that index with IOType::Other always. -static mut IO_TYPE_ARRAY: [CachePadded; MAX_THREAD_IDX + 1] = - [CachePadded::new(IOType::Other); MAX_THREAD_IDX + 1]; +// other threads will be allocated to that index with IoType::Other always. +static mut IO_TYPE_ARRAY: [CachePadded; MAX_THREAD_IDX + 1] = + [CachePadded::new(IoType::Other); MAX_THREAD_IDX + 1]; // The index of the element of IO_TYPE_ARRAY for this thread to access. thread_local! { @@ -71,7 +71,7 @@ thread_local! { &mut tid.to_ne_bytes(), std::slice::from_raw_parts_mut( ptr as *mut u8, - std::mem::size_of::<*const IOType>(), + std::mem::size_of::<*const IoType>(), ), ).unwrap(); } @@ -83,7 +83,7 @@ struct IdxWrapper(usize); impl Drop for IdxWrapper { fn drop(&mut self) { - unsafe { *IO_TYPE_ARRAY[self.0] = IOType::Other }; + unsafe { *IO_TYPE_ARRAY[self.0] = IoType::Other }; IDX_ALLOCATOR.free(self.0); // drop() of static variables won't be called when program exits. 
@@ -134,10 +134,10 @@ impl IdxAllocator { } } -pub fn set_io_type(new_io_type: IOType) { +pub fn set_io_type(new_io_type: IoType) { unsafe { IDX.with(|idx| { - // if MAX_THREAD_IDX, keep IOType::Other always + // if MAX_THREAD_IDX, keep IoType::Other always if idx.0 != MAX_THREAD_IDX { *IO_TYPE_ARRAY[idx.0] = new_io_type; } @@ -145,22 +145,22 @@ pub fn set_io_type(new_io_type: IOType) { }; } -pub fn get_io_type() -> IOType { +pub fn get_io_type() -> IoType { unsafe { *IDX.with(|idx| IO_TYPE_ARRAY[idx.0]) } } -pub fn fetch_io_bytes() -> [IOBytes; IOType::COUNT] { +pub fn fetch_io_bytes() -> [IoBytes; IoType::COUNT] { let mut bytes = Default::default(); unsafe { if let Some(ctx) = BPF_CONTEXT.as_mut() { - for io_type in IOType::iter() { - let io_type_buf_ptr = &mut io_type as *mut IOType as *mut u8; + for io_type in IoType::iter() { + let io_type_buf_ptr = &mut io_type as *mut IoType as *mut u8; let mut io_type_buf = - std::slice::from_raw_parts_mut(io_type_buf_ptr, std::mem::size_of::()); + std::slice::from_raw_parts_mut(io_type_buf_ptr, std::mem::size_of::()); if let Ok(e) = ctx.stats_table.get(&mut io_type_buf) { - assert!(e.len() == std::mem::size_of::()); + assert!(e.len() == std::mem::size_of::()); bytes[io_type as usize] = - std::ptr::read_unaligned(e.as_ptr() as *const IOBytes); + std::ptr::read_unaligned(e.as_ptr() as *const IoBytes); } } } @@ -210,7 +210,7 @@ pub fn init() -> Result<(), String> { let stats_table = bpf.table("stats_by_type").map_err(|e| e.to_string())?; let type_table = bpf.table("type_by_pid").map_err(|e| e.to_string())?; unsafe { - BPF_CONTEXT = Some(BPFContext { + BPF_CONTEXT = Some(BpfContext { bpf, stats_table, type_table, @@ -286,13 +286,13 @@ mod tests { fetch_io_bytes, flush_io_latency_metrics, get_io_type, init, set_io_type, BPF_CONTEXT, MAX_THREAD_IDX, }; - use crate::{metrics::*, IOType, OpenOptions}; + use crate::{metrics::*, IoType, OpenOptions}; #[test] fn test_biosnoop() { init().unwrap(); - // Test cases are running in 
parallel, while they depend on the same global variables. - // To make them not affect each other, run them in sequence. + // Test cases are running in parallel, while they depend on the same global + // variables. To make them not affect each other, run them in sequence. test_thread_idx_allocation(); test_io_context(); unsafe { @@ -301,8 +301,8 @@ mod tests { } fn test_io_context() { - set_io_type(IOType::Compaction); - assert_eq!(get_io_type(), IOType::Compaction); + set_io_type(IoType::Compaction); + assert_eq!(get_io_type(), IoType::Compaction); let tmp = TempDir::new().unwrap(); let file_path = tmp.path().join("test_io_context"); let mut f = OpenOptions::new() @@ -313,18 +313,18 @@ mod tests { .unwrap(); let mut w = vec![A512::default(); 2]; w.as_bytes_mut()[512] = 42; - let mut compaction_bytes_before = fetch_io_bytes()[IOType::Compaction as usize]; + let mut compaction_bytes_before = fetch_io_bytes()[IoType::Compaction as usize]; f.write(w.as_bytes()).unwrap(); f.sync_all().unwrap(); - let compaction_bytes = fetch_io_bytes()[IOType::Compaction as usize]; + let compaction_bytes = fetch_io_bytes()[IoType::Compaction as usize]; assert_ne!((compaction_bytes - compaction_bytes_before).write, 0); assert_eq!((compaction_bytes - compaction_bytes_before).read, 0); compaction_bytes_before = compaction_bytes; drop(f); - let other_bytes_before = fetch_io_bytes()[IOType::Other as usize]; + let other_bytes_before = fetch_io_bytes()[IoType::Other as usize]; std::thread::spawn(move || { - set_io_type(IOType::Other); + set_io_type(IoType::Other); let mut f = OpenOptions::new() .read(true) .custom_flags(O_DIRECT) @@ -337,8 +337,8 @@ mod tests { .join() .unwrap(); - let compaction_bytes = fetch_io_bytes()[IOType::Compaction as usize]; - let other_bytes = fetch_io_bytes()[IOType::Other as usize]; + let compaction_bytes = fetch_io_bytes()[IoType::Compaction as usize]; + let other_bytes = fetch_io_bytes()[IoType::Other as usize]; assert_eq!((compaction_bytes - 
compaction_bytes_before).write, 0); assert_eq!((compaction_bytes - compaction_bytes_before).read, 0); assert_eq!((other_bytes - other_bytes_before).write, 0); @@ -353,7 +353,7 @@ mod tests { // the thread indexes should be recycled. for _ in 1..=MAX_THREAD_IDX * 2 { std::thread::spawn(|| { - set_io_type(IOType::Other); + set_io_type(IoType::Other); }) .join() .unwrap(); @@ -365,7 +365,7 @@ mod tests { for _ in 1..=MAX_THREAD_IDX { let pair1 = pair.clone(); let h = std::thread::spawn(move || { - set_io_type(IOType::Compaction); + set_io_type(IoType::Compaction); let (lock, cvar) = &*pair1; let mut stop = lock.lock().unwrap(); while !*stop { @@ -375,11 +375,11 @@ mod tests { handles.push(h); } - // the reserved index is used, io type should be IOType::Other + // the reserved index is used, io type should be IoType::Other for _ in 1..=MAX_THREAD_IDX { std::thread::spawn(|| { - set_io_type(IOType::Compaction); - assert_eq!(get_io_type(), IOType::Other); + set_io_type(IoType::Compaction); + assert_eq!(get_io_type(), IoType::Other); }) .join() .unwrap(); @@ -399,8 +399,8 @@ mod tests { // the thread indexes should be available again. 
for _ in 1..=MAX_THREAD_IDX { std::thread::spawn(|| { - set_io_type(IOType::Compaction); - assert_eq!(get_io_type(), IOType::Compaction); + set_io_type(IoType::Compaction); + assert_eq!(get_io_type(), IoType::Compaction); }) .join() .unwrap(); @@ -439,7 +439,7 @@ mod tests { #[ignore] fn bench_flush_io_latency_metrics(b: &mut Bencher) { init().unwrap(); - set_io_type(IOType::ForegroundWrite); + set_io_type(IoType::ForegroundWrite); let tmp = TempDir::new().unwrap(); let file_path = tmp.path().join("bench_flush_io_latency_metrics"); @@ -476,7 +476,7 @@ mod tests { w.as_bytes_mut()[64] = 42; b.iter(|| { - set_io_type(IOType::ForegroundWrite); + set_io_type(IoType::ForegroundWrite); f.write(w.as_bytes()).unwrap(); f.sync_all().unwrap(); }); @@ -509,7 +509,7 @@ mod tests { .unwrap(); let mut r = vec![A512::default(); 2]; b.iter(|| { - set_io_type(IOType::ForegroundRead); + set_io_type(IoType::ForegroundRead); f.seek(SeekFrom::Start(rng.gen_range(0..100) * 512)) .unwrap(); assert_ne!(f.read(&mut r.as_bytes_mut()).unwrap(), 0); diff --git a/components/file_system/src/io_stats/mod.rs b/components/file_system/src/io_stats/mod.rs index f0e644ad4a4..e4c0017451f 100644 --- a/components/file_system/src/io_stats/mod.rs +++ b/components/file_system/src/io_stats/mod.rs @@ -6,27 +6,27 @@ mod stub { use strum::EnumCount; - use crate::{IOBytes, IOType}; + use crate::{IoBytes, IoType}; pub fn init() -> Result<(), String> { Err("No I/O tracing tool available".to_owned()) } thread_local! 
{ - static IO_TYPE: Cell = Cell::new(IOType::Other); + static IO_TYPE: Cell = Cell::new(IoType::Other); } - pub fn set_io_type(new_io_type: IOType) { + pub fn set_io_type(new_io_type: IoType) { IO_TYPE.with(|io_type| { io_type.set(new_io_type); }); } - pub fn get_io_type() -> IOType { + pub fn get_io_type() -> IoType { IO_TYPE.with(|io_type| io_type.get()) } - pub fn fetch_io_bytes() -> [IOBytes; IOType::COUNT] { + pub fn fetch_io_bytes() -> [IoBytes; IoType::COUNT] { Default::default() } } @@ -45,8 +45,10 @@ pub use proc::*; #[cfg(test)] mod tests { + use tikv_util::sys::thread::StdThreadBuildWrapper; + use super::*; - use crate::IOType; + use crate::IoType; #[bench] fn bench_fetch_io_bytes(b: &mut test::Bencher) { @@ -54,8 +56,8 @@ mod tests { let _ths = (0..8) .map(|_| { let tx_clone = tx.clone(); - std::thread::Builder::new().spawn(move || { - set_io_type(IOType::ForegroundWrite); + std::thread::Builder::new().spawn_wrapper(move || { + set_io_type(IoType::ForegroundWrite); tx_clone.send(()).unwrap(); }) }) @@ -72,15 +74,15 @@ mod tests { let _ths = (0..8) .map(|_| { let tx_clone = tx.clone(); - std::thread::Builder::new().spawn(move || { - set_io_type(IOType::ForegroundWrite); + std::thread::Builder::new().spawn_wrapper(move || { + set_io_type(IoType::ForegroundWrite); tx_clone.send(()).unwrap(); }) }) .collect::>(); b.iter(|| match get_io_type() { - IOType::ForegroundWrite => set_io_type(IOType::ForegroundRead), - _ => set_io_type(IOType::ForegroundWrite), + IoType::ForegroundWrite => set_io_type(IoType::ForegroundRead), + _ => set_io_type(IoType::ForegroundWrite), }); for _ in 0..8 { rx.recv().unwrap(); diff --git a/components/file_system/src/io_stats/proc.rs b/components/file_system/src/io_stats/proc.rs index 836b5f5fdf0..fca0f6a64b1 100644 --- a/components/file_system/src/io_stats/proc.rs +++ b/components/file_system/src/io_stats/proc.rs @@ -13,125 +13,112 @@ use crossbeam_utils::CachePadded; use parking_lot::Mutex; use strum::EnumCount; use 
thread_local::ThreadLocal; -use tikv_util::{ - sys::thread::{self, Pid}, - warn, -}; +use tikv_util::sys::thread::{self, Pid}; -use crate::{IOBytes, IOType}; +use crate::{IoBytes, IoType}; lazy_static! { /// Total I/O bytes read/written by each I/O type. - static ref GLOBAL_IO_STATS: [AtomicIOBytes; IOType::COUNT] = Default::default(); + static ref GLOBAL_IO_STATS: [AtomicIoBytes; IoType::COUNT] = Default::default(); /// Incremental I/O bytes read/written by the thread's own I/O type. - static ref LOCAL_IO_STATS: ThreadLocal>> = ThreadLocal::new(); + static ref LOCAL_IO_STATS: ThreadLocal>> = ThreadLocal::new(); } thread_local! { /// A private copy of I/O type. Optimized for local access. - static IO_TYPE: Cell = Cell::new(IOType::Other); + static IO_TYPE: Cell = Cell::new(IoType::Other); } #[derive(Debug)] -struct ThreadID { +struct ThreadId { pid: Pid, tid: Pid, proc_reader: Option>, } -impl ThreadID { - fn current() -> ThreadID { +impl ThreadId { + fn current() -> ThreadId { let pid = thread::process_id(); let tid = thread::thread_id(); - ThreadID { + ThreadId { pid, tid, proc_reader: None, } } - fn fetch_io_bytes(&mut self) -> Option { + fn fetch_io_bytes(&mut self) -> Result { if self.proc_reader.is_none() { let path = PathBuf::from("/proc") .join(format!("{}", self.pid)) .join("task") .join(format!("{}", self.tid)) .join("io"); - match File::open(path) { - Ok(file) => { - self.proc_reader = Some(BufReader::new(file)); - } - Err(e) => { - warn!("failed to open proc file: {}", e); - } - } + self.proc_reader = Some(BufReader::new( + File::open(path).map_err(|e| format!("open: {}", e))?, + )); } - if let Some(ref mut reader) = self.proc_reader { - reader - .seek(std::io::SeekFrom::Start(0)) - .map_err(|e| { - warn!("failed to seek proc file: {}", e); - }) - .ok()?; - let mut io_bytes = IOBytes::default(); - for line in reader.lines() { - let line = line - .map_err(|e| { - // ESRCH 3 No such process - if e.raw_os_error() != Some(3) { - warn!("failed to read proc 
file: {}", e); - } - }) - .ok()?; - if line.len() > 11 { - let mut s = line.split_whitespace(); - if let (Some(field), Some(value)) = (s.next(), s.next()) { - if field.starts_with("read_bytes") { - io_bytes.read = u64::from_str(value).ok()?; - } else if field.starts_with("write_bytes") { - io_bytes.write = u64::from_str(value).ok()?; + let reader = self.proc_reader.as_mut().unwrap(); + reader + .seek(std::io::SeekFrom::Start(0)) + .map_err(|e| format!("seek: {}", e))?; + let mut io_bytes = IoBytes::default(); + for line in reader.lines() { + match line { + Ok(line) => { + if line.len() > 11 { + let mut s = line.split_whitespace(); + if let (Some(field), Some(value)) = (s.next(), s.next()) { + if field.starts_with("read_bytes") { + io_bytes.read = u64::from_str(value) + .map_err(|e| format!("parse read_bytes: {}", e))?; + } else if field.starts_with("write_bytes") { + io_bytes.write = u64::from_str(value) + .map_err(|e| format!("parse write_bytes: {}", e))?; + } } } } + // ESRCH 3 No such process + Err(e) if e.raw_os_error() == Some(3) => break, + Err(e) => return Err(format!("read: {}", e)), } - Some(io_bytes) - } else { - None } + Ok(io_bytes) } } -struct LocalIOStats { - id: ThreadID, - io_type: IOType, - last_flushed: IOBytes, +struct LocalIoStats { + id: ThreadId, + io_type: IoType, + last_flushed: IoBytes, } -impl LocalIOStats { +impl LocalIoStats { fn current() -> Self { - LocalIOStats { - id: ThreadID::current(), - io_type: IOType::Other, - last_flushed: IOBytes::default(), + LocalIoStats { + id: ThreadId::current(), + io_type: IoType::Other, + last_flushed: IoBytes::default(), } } } #[derive(Default)] -struct AtomicIOBytes { +struct AtomicIoBytes { read: AtomicU64, write: AtomicU64, } -impl AtomicIOBytes { - fn load(&self, order: Ordering) -> IOBytes { - IOBytes { +impl AtomicIoBytes { + fn load(&self, order: Ordering) -> IoBytes { + IoBytes { read: self.read.load(order), write: self.write.load(order), } } - fn fetch_add(&self, other: IOBytes, order: 
Ordering) { + fn fetch_add(&self, other: IoBytes, order: Ordering) { self.read.fetch_add(other.read, order); self.write.fetch_add(other.write, order); } @@ -139,8 +126,8 @@ impl AtomicIOBytes { /// Flushes the local I/O stats to global I/O stats. #[inline] -fn flush_thread_io(sentinel: &mut LocalIOStats) { - if let Some(io_bytes) = sentinel.id.fetch_io_bytes() { +fn flush_thread_io(sentinel: &mut LocalIoStats) { + if let Ok(io_bytes) = sentinel.id.fetch_io_bytes() { GLOBAL_IO_STATS[sentinel.io_type as usize] .fetch_add(io_bytes - sentinel.last_flushed, Ordering::Relaxed); sentinel.last_flushed = io_bytes; @@ -148,14 +135,28 @@ fn flush_thread_io(sentinel: &mut LocalIOStats) { } pub fn init() -> Result<(), String> { + ThreadId::current() + .fetch_io_bytes() + .map_err(|e| format!("failed to fetch I/O bytes from proc: {}", e))?; + // Manually initialize the sentinel so that `fetch_io_bytes` doesn't miss any + // thread. + LOCAL_IO_STATS.get_or(|| CachePadded::new(Mutex::new(LocalIoStats::current()))); + tikv_util::sys::thread::hook_thread_start(Box::new(|| { + LOCAL_IO_STATS.get_or(|| CachePadded::new(Mutex::new(LocalIoStats::current()))); + })); Ok(()) } -pub fn set_io_type(new_io_type: IOType) { +/// Bind I/O type for the current thread. +/// Following calls to the [`file_system`](crate) APIs would be throttled and +/// recorded via this information. +/// Generally, when you are creating new threads playing with the local disks, +/// you should call this before doing so. 
+pub fn set_io_type(new_io_type: IoType) { IO_TYPE.with(|io_type| { if io_type.get() != new_io_type { let mut sentinel = LOCAL_IO_STATS - .get_or(|| CachePadded::new(Mutex::new(LocalIOStats::current()))) + .get_or(|| CachePadded::new(Mutex::new(LocalIoStats::current()))) .lock(); flush_thread_io(&mut sentinel); sentinel.io_type = new_io_type; @@ -164,16 +165,16 @@ pub fn set_io_type(new_io_type: IOType) { }); } -pub fn get_io_type() -> IOType { +pub fn get_io_type() -> IoType { IO_TYPE.with(|io_type| io_type.get()) } -pub fn fetch_io_bytes() -> [IOBytes; IOType::COUNT] { - let mut bytes: [IOBytes; IOType::COUNT] = Default::default(); +pub fn fetch_io_bytes() -> [IoBytes; IoType::COUNT] { + let mut bytes: [IoBytes; IoType::COUNT] = Default::default(); LOCAL_IO_STATS.iter().for_each(|sentinel| { flush_thread_io(&mut sentinel.lock()); }); - for i in 0..IOType::COUNT { + for i in 0..IoType::COUNT { bytes[i] = GLOBAL_IO_STATS[i].load(Ordering::Relaxed); } bytes @@ -184,21 +185,23 @@ mod tests { use std::{ io::{Read, Write}, os::unix::fs::OpenOptionsExt, + sync::mpsc, }; use libc::O_DIRECT; use maligned::{AsBytes, AsBytesMut, A512}; use tempfile::{tempdir, tempdir_in}; + use tikv_util::sys::thread::StdThreadBuildWrapper; use super::*; - use crate::{OpenOptions, WithIOType}; + use crate::{OpenOptions, WithIoType}; #[test] fn test_read_bytes() { let tmp = tempdir_in("/var/tmp").unwrap_or_else(|_| tempdir().unwrap()); let file_path = tmp.path().join("test_read_bytes.txt"); - let mut id = ThreadID::current(); - let _type = WithIOType::new(IOType::Compaction); + let mut id = ThreadId::current(); + let _type = WithIoType::new(IoType::Compaction); { let mut f = OpenOptions::new() .write(true) @@ -229,13 +232,13 @@ mod tests { fn test_write_bytes() { let tmp = tempdir_in("/var/tmp").unwrap_or_else(|_| tempdir().unwrap()); let file_path = tmp.path().join("test_write_bytes.txt"); - let mut id = ThreadID::current(); - let _type = WithIOType::new(IOType::Compaction); + let mut id = 
ThreadId::current(); + let _type = WithIoType::new(IoType::Compaction); let mut f = OpenOptions::new() .write(true) .create(true) .custom_flags(O_DIRECT) - .open(&file_path) + .open(file_path) .unwrap(); let w = vec![A512::default(); 8]; let base_local_bytes = id.fetch_io_bytes().unwrap(); @@ -248,9 +251,64 @@ mod tests { } } + #[test] + fn test_fetch_all_io_bytes() { + let tmp = tempdir_in("/var/tmp").unwrap_or_else(|_| tempdir().unwrap()); + + init().unwrap(); + + let file_path = tmp.path().join("test_fetch_all_io_bytes_1.txt"); + let (tx1, rx1) = mpsc::sync_channel(0); + let t1 = std::thread::Builder::new() + .spawn_wrapper(move || { + set_io_type(IoType::ForegroundWrite); + let mut f = OpenOptions::new() + .write(true) + .create(true) + .custom_flags(O_DIRECT) + .open(file_path) + .unwrap(); + let w = vec![A512::default(); 8]; + f.write_all(w.as_bytes()).unwrap(); + f.sync_all().unwrap(); + tx1.send(()).unwrap(); + tx1.send(()).unwrap(); + }) + .unwrap(); + + let file_path = tmp.path().join("test_fetch_all_io_bytes_2.txt"); + let (tx2, rx2) = mpsc::sync_channel(0); + let t2 = std::thread::Builder::new() + .spawn_wrapper(move || { + let mut f = OpenOptions::new() + .write(true) + .create(true) + .custom_flags(O_DIRECT) + .open(file_path) + .unwrap(); + let w = vec![A512::default(); 8]; + f.write_all(w.as_bytes()).unwrap(); + f.sync_all().unwrap(); + tx2.send(()).unwrap(); + tx2.send(()).unwrap(); + }) + .unwrap(); + + rx1.recv().unwrap(); + rx2.recv().unwrap(); + let bytes = fetch_io_bytes(); + assert_eq!(bytes[IoType::ForegroundWrite as usize].write, 4096); + assert_eq!(bytes[IoType::Other as usize].write, 4096); + + rx1.recv().unwrap(); + rx2.recv().unwrap(); + t1.join().unwrap(); + t2.join().unwrap(); + } + #[bench] fn bench_fetch_thread_io_bytes(b: &mut test::Bencher) { - let mut id = ThreadID::current(); + let mut id = ThreadId::current(); b.iter(|| id.fetch_io_bytes().unwrap()); } } diff --git a/components/file_system/src/lib.rs 
b/components/file_system/src/lib.rs index dd99b810e28..91e0a35da80 100644 --- a/components/file_system/src/lib.rs +++ b/components/file_system/src/lib.rs @@ -18,10 +18,13 @@ mod metrics; mod metrics_manager; mod rate_limiter; -pub use std::fs::{ - canonicalize, create_dir, create_dir_all, hard_link, metadata, read_dir, read_link, remove_dir, - remove_dir_all, remove_file, rename, set_permissions, symlink_metadata, DirBuilder, DirEntry, - FileType, Metadata, Permissions, ReadDir, +pub use std::{ + convert::TryFrom, + fs::{ + canonicalize, create_dir, create_dir_all, hard_link, metadata, read_dir, read_link, + remove_dir, remove_dir_all, remove_file, rename, set_permissions, symlink_metadata, + DirBuilder, DirEntry, FileType, Metadata, Permissions, ReadDir, + }, }; use std::{ io::{self, ErrorKind, Read, Write}, @@ -39,21 +42,21 @@ use openssl::{ hash::{self, Hasher, MessageDigest}, }; pub use rate_limiter::{ - get_io_rate_limiter, set_io_rate_limiter, IOBudgetAdjustor, IORateLimitMode, IORateLimiter, - IORateLimiterStatistics, + get_io_rate_limiter, set_io_rate_limiter, IoBudgetAdjustor, IoRateLimitMode, IoRateLimiter, + IoRateLimiterStatistics, }; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use strum::{EnumCount, EnumIter}; -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum IOOp { +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum IoOp { Read, Write, } #[repr(C)] -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, EnumCount, EnumIter)] -pub enum IOType { +#[derive(Clone, Copy, Debug, PartialEq, Hash, EnumCount, EnumIter)] +pub enum IoType { Other = 0, // Including coprocessor and storage read. 
ForegroundRead = 1, @@ -69,39 +72,41 @@ pub enum IOType { Gc = 8, Import = 9, Export = 10, + RewriteLog = 11, } -impl IOType { +impl IoType { pub fn as_str(&self) -> &str { match *self { - IOType::Other => "other", - IOType::ForegroundRead => "foreground_read", - IOType::ForegroundWrite => "foreground_write", - IOType::Flush => "flush", - IOType::LevelZeroCompaction => "level_zero_compaction", - IOType::Compaction => "compaction", - IOType::Replication => "replication", - IOType::LoadBalance => "load_balance", - IOType::Gc => "gc", - IOType::Import => "import", - IOType::Export => "export", + IoType::Other => "other", + IoType::ForegroundRead => "foreground_read", + IoType::ForegroundWrite => "foreground_write", + IoType::Flush => "flush", + IoType::LevelZeroCompaction => "level_zero_compaction", + IoType::Compaction => "compaction", + IoType::Replication => "replication", + IoType::LoadBalance => "load_balance", + IoType::Gc => "gc", + IoType::Import => "import", + IoType::Export => "export", + IoType::RewriteLog => "log_rewrite", } } } -pub struct WithIOType { - previous_io_type: IOType, +pub struct WithIoType { + previous_io_type: IoType, } -impl WithIOType { - pub fn new(new_io_type: IOType) -> WithIOType { +impl WithIoType { + pub fn new(new_io_type: IoType) -> WithIoType { let previous_io_type = get_io_type(); set_io_type(new_io_type); - WithIOType { previous_io_type } + WithIoType { previous_io_type } } } -impl Drop for WithIOType { +impl Drop for WithIoType { fn drop(&mut self) { set_io_type(self.previous_io_type); } @@ -109,12 +114,12 @@ impl Drop for WithIOType { #[repr(C)] #[derive(Debug, Copy, Clone, Default)] -pub struct IOBytes { +pub struct IoBytes { read: u64, write: u64, } -impl std::ops::Sub for IOBytes { +impl std::ops::Sub for IoBytes { type Output = Self; fn sub(self, other: Self) -> Self::Output { @@ -126,40 +131,45 @@ impl std::ops::Sub for IOBytes { } #[repr(u32)] -#[derive(Debug, Clone, PartialEq, Eq, Copy, EnumCount)] -pub enum IOPriority 
{ +#[derive(Debug, Clone, PartialEq, Copy, EnumCount)] +pub enum IoPriority { Low = 0, Medium = 1, High = 2, } -impl IOPriority { +impl IoPriority { pub fn as_str(&self) -> &str { match *self { - IOPriority::Low => "low", - IOPriority::Medium => "medium", - IOPriority::High => "high", + IoPriority::Low => "low", + IoPriority::Medium => "medium", + IoPriority::High => "high", } } - fn unsafe_from_u32(i: u32) -> Self { - unsafe { std::mem::transmute(i) } + fn from_u32(i: u32) -> Self { + match i { + 0 => IoPriority::Low, + 1 => IoPriority::Medium, + 2 => IoPriority::High, + _ => panic!("unknown io priority {}", i), + } } } -impl std::str::FromStr for IOPriority { +impl std::str::FromStr for IoPriority { type Err = String; - fn from_str(s: &str) -> Result { + fn from_str(s: &str) -> Result { match s { - "low" => Ok(IOPriority::Low), - "medium" => Ok(IOPriority::Medium), - "high" => Ok(IOPriority::High), + "low" => Ok(IoPriority::Low), + "medium" => Ok(IoPriority::Medium), + "high" => Ok(IoPriority::High), s => Err(format!("expect: low, medium or high, got: {:?}", s)), } } } -impl Serialize for IOPriority { +impl Serialize for IoPriority { fn serialize(&self, serializer: S) -> Result where S: Serializer, @@ -168,7 +178,7 @@ impl Serialize for IOPriority { } } -impl<'de> Deserialize<'de> for IOPriority { +impl<'de> Deserialize<'de> for IoPriority { fn deserialize(deserializer: D) -> Result where D: Deserializer<'de>, @@ -176,17 +186,17 @@ impl<'de> Deserialize<'de> for IOPriority { use serde::de::{Error, Unexpected, Visitor}; struct StrVistor; impl<'de> Visitor<'de> for StrVistor { - type Value = IOPriority; + type Value = IoPriority; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(formatter, "a IO priority") } - fn visit_str(self, value: &str) -> Result + fn visit_str(self, value: &str) -> Result where E: Error, { - let p = match IOPriority::from_str(&*value.trim().to_lowercase()) { + let p = match 
IoPriority::from_str(&value.trim().to_lowercase()) { Ok(p) => p, _ => { return Err(E::invalid_value( @@ -203,21 +213,19 @@ impl<'de> Deserialize<'de> for IOPriority { } } -impl From for ConfigValue { - fn from(mode: IOPriority) -> ConfigValue { - ConfigValue::IOPriority(mode.as_str().to_owned()) +impl From for ConfigValue { + fn from(mode: IoPriority) -> ConfigValue { + ConfigValue::String(mode.as_str().to_owned()) } } -impl From for IOPriority { - fn from(c: ConfigValue) -> IOPriority { - if let ConfigValue::IOPriority(s) = c { - match IOPriority::from_str(s.as_str()) { - Ok(p) => p, - _ => panic!("expect: low, medium, high, got: {:?}", s), - } +impl TryFrom for IoPriority { + type Error = String; + fn try_from(c: ConfigValue) -> Result { + if let ConfigValue::String(s) = c { + Self::from_str(s.as_str()) } else { - panic!("expect: ConfigValue::IOPriority, got: {:?}", c); + panic!("expect: ConfigValue::String, got: {:?}", c); } } } @@ -280,7 +288,8 @@ pub fn copy, Q: AsRef>(from: P, to: Q) -> io::Result { copy_imp(from.as_ref(), to.as_ref(), false /* sync */) } -/// Copies the contents and permission bits of one file to another, then synchronizes. +/// Copies the contents and permission bits of one file to another, then +/// synchronizes. pub fn copy_and_sync, Q: AsRef>(from: P, to: Q) -> io::Result { copy_imp(from.as_ref(), to.as_ref(), true /* sync */) } @@ -295,8 +304,8 @@ pub fn file_exists>(file: P) -> bool { path.exists() && path.is_file() } -/// Deletes given path from file system. Returns `true` on success, `false` if the file doesn't exist. -/// Otherwise the raw error will be returned. +/// Deletes given path from file system. Returns `true` on success, `false` if +/// the file doesn't exist. Otherwise the raw error will be returned. pub fn delete_file_if_exist>(file: P) -> io::Result { match remove_file(&file) { Ok(_) => Ok(true), @@ -305,8 +314,8 @@ pub fn delete_file_if_exist>(file: P) -> io::Result { } } -/// Deletes given path from file system. 
Returns `true` on success, `false` if the directory doesn't -/// exist. Otherwise the raw error will be returned. +/// Deletes given path from file system. Returns `true` on success, `false` if +/// the directory doesn't exist. Otherwise the raw error will be returned. pub fn delete_dir_if_exist>(dir: P) -> io::Result { match remove_dir_all(&dir) { Ok(_) => Ok(true), @@ -315,8 +324,9 @@ pub fn delete_dir_if_exist>(dir: P) -> io::Result { } } -/// Creates a new, empty directory at the provided path. Returns `true` on success, -/// `false` if the directory already exists. Otherwise the raw error will be returned. +/// Creates a new, empty directory at the provided path. Returns `true` on +/// success, `false` if the directory already exists. Otherwise the raw error +/// will be returned. pub fn create_dir_if_not_exist>(dir: P) -> io::Result { match create_dir(&dir) { Ok(_) => Ok(true), @@ -423,7 +433,7 @@ pub fn reserve_space_for_recover>(data_dir: P, file_size: u64) -> delete_file_if_exist(&path)?; } fn do_reserve(dir: &Path, path: &Path, file_size: u64) -> io::Result<()> { - let f = File::create(&path)?; + let f = File::create(path)?; f.allocate(file_size)?; f.sync_all()?; sync_dir(dir) @@ -480,7 +490,7 @@ mod tests { // Ensure it works for non-existent file. 
let non_existent_file = dir_path.join("non_existent_file"); - assert!(get_file_size(&non_existent_file).is_err()); + get_file_size(non_existent_file).unwrap_err(); } #[test] @@ -501,7 +511,7 @@ mod tests { assert_eq!(file_exists(&existent_file), true); let non_existent_file = dir_path.join("non_existent_file"); - assert_eq!(file_exists(&non_existent_file), false); + assert_eq!(file_exists(non_existent_file), false); } #[test] @@ -522,7 +532,7 @@ mod tests { assert_eq!(file_exists(&existent_file), false); let non_existent_file = dir_path.join("non_existent_file"); - delete_file_if_exist(&non_existent_file).unwrap(); + delete_file_if_exist(non_existent_file).unwrap(); } fn gen_rand_file>(path: P, size: usize) -> u32 { diff --git a/components/file_system/src/metrics.rs b/components/file_system/src/metrics.rs index e968eaaece6..8aecc6b21c7 100644 --- a/components/file_system/src/metrics.rs +++ b/components/file_system/src/metrics.rs @@ -6,7 +6,7 @@ use prometheus::{local::*, *}; use prometheus_static_metric::*; make_static_metric! { - pub label_enum IOType { + pub label_enum IoType { other, foreground_read, foreground_write, @@ -20,29 +20,29 @@ make_static_metric! { export, } - pub label_enum IOOp { + pub label_enum IoOp { read, write, } - pub label_enum IOPriority { + pub label_enum IoPriority { low, medium, high, } - pub struct IOLatencyVec : Histogram { - "type" => IOType, - "op" => IOOp, + pub struct IoLatencyVec : Histogram { + "type" => IoType, + "op" => IoOp, } - pub struct IOBytesVec : IntCounter { - "type" => IOType, - "op" => IOOp, + pub struct IoBytesVec : IntCounter { + "type" => IoType, + "op" => IoOp, } - pub struct IOPriorityIntGaugeVec : IntGauge { - "type" => IOPriority, + pub struct IoPriorityIntGaugeVec : IntGauge { + "type" => IoPriority, } } @@ -53,9 +53,9 @@ lazy_static! 
{ &["type", "op"] ).unwrap(); - pub static ref IO_LATENCY_MICROS_VEC: IOLatencyVec = + pub static ref IO_LATENCY_MICROS_VEC: IoLatencyVec = register_static_histogram_vec!( - IOLatencyVec, + IoLatencyVec, "tikv_io_latency_micros", "Duration of disk tikv io.", &["type", "op"], @@ -70,8 +70,8 @@ lazy_static! { ) .unwrap(); - pub static ref RATE_LIMITER_MAX_BYTES_PER_SEC: IOPriorityIntGaugeVec = register_static_int_gauge_vec!( - IOPriorityIntGaugeVec, + pub static ref RATE_LIMITER_MAX_BYTES_PER_SEC: IoPriorityIntGaugeVec = register_static_int_gauge_vec!( + IoPriorityIntGaugeVec, "tikv_rate_limiter_max_bytes_per_sec", "Maximum IO bytes per second", &["type"] diff --git a/components/file_system/src/metrics_manager.rs b/components/file_system/src/metrics_manager.rs index ddc48eb8f86..89e822b24e7 100644 --- a/components/file_system/src/metrics_manager.rs +++ b/components/file_system/src/metrics_manager.rs @@ -8,35 +8,36 @@ use tikv_util::time::Instant; use crate::{ io_stats::fetch_io_bytes, metrics::{tls_flush, IO_BYTES_VEC}, - IOBytes, IOOp, IORateLimiterStatistics, IOType, + IoBytes, IoOp, IoRateLimiterStatistics, IoType, }; pub enum BytesFetcher { - /// Fetch IO statistics from IO rate limiter, which records passed-through IOs in atomic counters. - FromRateLimiter(Arc), + /// Fetch IO statistics from IO rate limiter, which records passed-through + /// IOs in atomic counters. + FromRateLimiter(Arc), /// Fetch IO statistics from OS I/O stats collector. 
- FromIOStatsCollector(), + FromIoStatsCollector(), } impl BytesFetcher { - fn fetch(&self) -> [IOBytes; IOType::COUNT] { + fn fetch(&self) -> [IoBytes; IoType::COUNT] { match *self { BytesFetcher::FromRateLimiter(ref stats) => { - let mut bytes: [IOBytes; IOType::COUNT] = Default::default(); - for t in IOType::iter() { - bytes[t as usize].read = stats.fetch(t, IOOp::Read) as u64; - bytes[t as usize].write = stats.fetch(t, IOOp::Write) as u64; + let mut bytes: [IoBytes; IoType::COUNT] = Default::default(); + for t in IoType::iter() { + bytes[t as usize].read = stats.fetch(t, IoOp::Read) as u64; + bytes[t as usize].write = stats.fetch(t, IoOp::Write) as u64; } bytes } - BytesFetcher::FromIOStatsCollector() => fetch_io_bytes(), + BytesFetcher::FromIoStatsCollector() => fetch_io_bytes(), } } } pub struct MetricsManager { fetcher: BytesFetcher, - last_fetch: [IOBytes; IOType::COUNT], + last_fetch: [IoBytes; IoType::COUNT], } impl MetricsManager { @@ -50,7 +51,7 @@ impl MetricsManager { pub fn flush(&mut self, _now: Instant) { tls_flush(); let latest = self.fetcher.fetch(); - for t in IOType::iter() { + for t in IoType::iter() { let delta_bytes = latest[t as usize] - self.last_fetch[t as usize]; IO_BYTES_VEC .with_label_values(&[t.as_str(), "read"]) diff --git a/components/file_system/src/rate_limiter.rs b/components/file_system/src/rate_limiter.rs index b6aa0730ac7..79c7094b186 100644 --- a/components/file_system/src/rate_limiter.rs +++ b/components/file_system/src/rate_limiter.rs @@ -17,46 +17,46 @@ use tikv_util::time::Instant; use super::{ metrics::{tls_collect_rate_limiter_request_wait, RATE_LIMITER_MAX_BYTES_PER_SEC}, - IOOp, IOPriority, IOType, + IoOp, IoPriority, IoType, }; const DEFAULT_REFILL_PERIOD: Duration = Duration::from_millis(50); const DEFAULT_REFILLS_PER_SEC: usize = (1.0 / DEFAULT_REFILL_PERIOD.as_secs_f32()) as usize; const MAX_WAIT_DURATION_PER_REQUEST: Duration = Duration::from_millis(500); -#[derive(Debug, Clone, PartialEq, Eq, Copy)] -pub enum 
IORateLimitMode { +#[derive(Debug, Clone, PartialEq, Copy)] +pub enum IoRateLimitMode { WriteOnly, ReadOnly, AllIo, } -impl IORateLimitMode { +impl IoRateLimitMode { pub fn as_str(&self) -> &str { match *self { - IORateLimitMode::WriteOnly => "write-only", - IORateLimitMode::ReadOnly => "read-only", - IORateLimitMode::AllIo => "all-io", + IoRateLimitMode::WriteOnly => "write-only", + IoRateLimitMode::ReadOnly => "read-only", + IoRateLimitMode::AllIo => "all-io", } } #[inline] - pub fn contains(&self, op: IOOp) -> bool { + pub fn contains(&self, op: IoOp) -> bool { match *self { - IORateLimitMode::WriteOnly => op == IOOp::Write, - IORateLimitMode::ReadOnly => op == IOOp::Read, + IoRateLimitMode::WriteOnly => op == IoOp::Write, + IoRateLimitMode::ReadOnly => op == IoOp::Read, _ => true, } } } -impl FromStr for IORateLimitMode { +impl FromStr for IoRateLimitMode { type Err = String; - fn from_str(s: &str) -> Result { + fn from_str(s: &str) -> Result { match s { - "write-only" => Ok(IORateLimitMode::WriteOnly), - "read-only" => Ok(IORateLimitMode::ReadOnly), - "all-io" => Ok(IORateLimitMode::AllIo), + "write-only" => Ok(IoRateLimitMode::WriteOnly), + "read-only" => Ok(IoRateLimitMode::ReadOnly), + "all-io" => Ok(IoRateLimitMode::AllIo), s => Err(format!( "expect: write-only, read-only or all-io, got: {:?}", s @@ -65,7 +65,7 @@ impl FromStr for IORateLimitMode { } } -impl Serialize for IORateLimitMode { +impl Serialize for IoRateLimitMode { fn serialize(&self, serializer: S) -> Result where S: Serializer, @@ -74,7 +74,7 @@ impl Serialize for IORateLimitMode { } } -impl<'de> Deserialize<'de> for IORateLimitMode { +impl<'de> Deserialize<'de> for IoRateLimitMode { fn deserialize(deserializer: D) -> Result where D: Deserializer<'de>, @@ -82,17 +82,17 @@ impl<'de> Deserialize<'de> for IORateLimitMode { use serde::de::{Error, Unexpected, Visitor}; struct StrVistor; impl<'de> Visitor<'de> for StrVistor { - type Value = IORateLimitMode; + type Value = IoRateLimitMode; fn 
expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(formatter, "a IO rate limit mode") } - fn visit_str(self, value: &str) -> Result + fn visit_str(self, value: &str) -> Result where E: Error, { - let p = match IORateLimitMode::from_str(&*value.trim().to_lowercase()) { + let p = match IoRateLimitMode::from_str(&value.trim().to_lowercase()) { Ok(p) => p, _ => { return Err(E::invalid_value( @@ -112,98 +112,98 @@ impl<'de> Deserialize<'de> for IORateLimitMode { /// Record accumulated bytes through of different types. /// Used for testing and metrics. #[derive(Debug)] -pub struct IORateLimiterStatistics { - read_bytes: [CachePadded; IOType::COUNT], - write_bytes: [CachePadded; IOType::COUNT], +pub struct IoRateLimiterStatistics { + read_bytes: [CachePadded; IoType::COUNT], + write_bytes: [CachePadded; IoType::COUNT], } -impl IORateLimiterStatistics { +impl IoRateLimiterStatistics { pub fn new() -> Self { - IORateLimiterStatistics { + IoRateLimiterStatistics { read_bytes: Default::default(), write_bytes: Default::default(), } } - pub fn fetch(&self, io_type: IOType, io_op: IOOp) -> usize { + pub fn fetch(&self, io_type: IoType, io_op: IoOp) -> usize { let io_type_idx = io_type as usize; match io_op { - IOOp::Read => self.read_bytes[io_type_idx].load(Ordering::Relaxed), - IOOp::Write => self.write_bytes[io_type_idx].load(Ordering::Relaxed), + IoOp::Read => self.read_bytes[io_type_idx].load(Ordering::Relaxed), + IoOp::Write => self.write_bytes[io_type_idx].load(Ordering::Relaxed), } } - pub fn record(&self, io_type: IOType, io_op: IOOp, bytes: usize) { + pub fn record(&self, io_type: IoType, io_op: IoOp, bytes: usize) { let io_type_idx = io_type as usize; match io_op { - IOOp::Read => { + IoOp::Read => { self.read_bytes[io_type_idx].fetch_add(bytes, Ordering::Relaxed); } - IOOp::Write => { + IoOp::Write => { self.write_bytes[io_type_idx].fetch_add(bytes, Ordering::Relaxed); } } } pub fn reset(&self) { - for i in 0..IOType::COUNT { + for 
i in 0..IoType::COUNT { self.read_bytes[i].store(0, Ordering::Relaxed); self.write_bytes[i].store(0, Ordering::Relaxed); } } } -impl Default for IORateLimiterStatistics { +impl Default for IoRateLimiterStatistics { fn default() -> Self { Self::new() } } -/// Used to dynamically adjust the proportion of total budgets allocated for rate limited -/// IO. This is needed when global IOs are only partially rate limited, e.g. when mode is -/// IORateLimitMode::WriteOnly. -pub trait IOBudgetAdjustor: Send + Sync { +/// Used to dynamically adjust the proportion of total budgets allocated for +/// rate limited IO. This is needed when global IOs are only partially rate +/// limited, e.g. when mode is IoRateLimitMode::WriteOnly. +pub trait IoBudgetAdjustor: Send + Sync { fn adjust(&self, threshold: usize) -> usize; } -/// Limit total IO flow below provided threshold by throttling lower-priority IOs. -/// Rate limit is disabled when total IO threshold is set to zero. -struct PriorityBasedIORateLimiter { +/// Limit total IO flow below provided threshold by throttling lower-priority +/// IOs. Rate limit is disabled when total IO threshold is set to zero. 
+struct PriorityBasedIoRateLimiter { // High-priority IOs are only limited when strict is true strict: bool, // Total bytes passed through during current epoch - bytes_through: [CachePadded; IOPriority::COUNT], + bytes_through: [CachePadded; IoPriority::COUNT], // Maximum bytes permitted during current epoch - bytes_per_epoch: [CachePadded; IOPriority::COUNT], - protected: Mutex, + bytes_per_epoch: [CachePadded; IoPriority::COUNT], + protected: Mutex, } -struct PriorityBasedIORateLimiterProtected { +struct PriorityBasedIoRateLimiterProtected { next_refill_time: Instant, // Bytes that can't be fulfilled in current epoch - pending_bytes: [usize; IOPriority::COUNT], + pending_bytes: [usize; IoPriority::COUNT], // Adjust low priority IO flow based on system backlog - adjustor: Option>, + adjustor: Option>, } -impl PriorityBasedIORateLimiterProtected { +impl PriorityBasedIoRateLimiterProtected { fn new() -> Self { - PriorityBasedIORateLimiterProtected { + PriorityBasedIoRateLimiterProtected { next_refill_time: Instant::now_coarse() + DEFAULT_REFILL_PERIOD, - pending_bytes: [0; IOPriority::COUNT], + pending_bytes: [0; IoPriority::COUNT], adjustor: None, } } } macro_rules! do_sleep { - ($duration:expr, sync) => { + ($duration:expr,sync) => { std::thread::sleep($duration); }; - ($duration:expr, async) => { + ($duration:expr,async) => { tokio::time::sleep($duration).await; }; - ($duration:expr, skewed_sync) => { + ($duration:expr,skewed_sync) => { use rand::Rng; let mut rng = rand::thread_rng(); let subtraction: bool = rng.gen(); @@ -216,10 +216,11 @@ macro_rules! do_sleep { }; } -/// Actual implementation for requesting IOs from PriorityBasedIORateLimiter. -/// An attempt will first be recorded. If the attempted amount exceeds the available quotas of -/// current epoch, the requester will be queued (logically) and sleep until served. -/// Macro is necessary to de-dup codes used both in async/sync functions. 
+/// Actual implementation for requesting IOs from PriorityBasedIoRateLimiter. +/// An attempt will first be recorded. If the attempted amount exceeds the +/// available quotas of current epoch, the requester will be queued (logically) +/// and sleep until served. Macro is necessary to de-dup codes used both in +/// async/sync functions. macro_rules! request_imp { ($limiter:ident, $priority:ident, $amount:ident, $mode:tt) => {{ debug_assert!($amount > 0); @@ -234,7 +235,7 @@ macro_rules! request_imp { $limiter.bytes_through[priority_idx].fetch_add(amount, Ordering::Relaxed) + amount; // We prefer not to partially return only a portion of requested bytes. if bytes_through <= cached_bytes_per_epoch - || !$limiter.strict && $priority == IOPriority::High + || !$limiter.strict && $priority == IoPriority::High { return amount; } @@ -244,7 +245,8 @@ macro_rules! request_imp { // The request is already partially fulfilled in current epoch when consumption // overflow bytes are smaller than requested amount. let remains = std::cmp::min(bytes_through - cached_bytes_per_epoch, amount); - // When there is a recent refill, double check if bytes consumption has been reset. + // When there is a recent refill, double check if bytes consumption has been + // reset. if now + DEFAULT_REFILL_PERIOD < locked.next_refill_time + Duration::from_millis(1) && $limiter.bytes_through[priority_idx].fetch_add(remains, Ordering::Relaxed) + remains @@ -252,8 +254,8 @@ macro_rules! request_imp { { return amount; } - // Enqueue itself by adding to pending_bytes, whose current value denotes a position - // of logical queue to wait in. + // Enqueue itself by adding to pending_bytes, whose current value denotes a + // position of logical queue to wait in. locked.pending_bytes[priority_idx] += remains; // Calculate wait duration by queue_len / served_per_epoch. let wait = if locked.next_refill_time <= now { @@ -294,63 +296,65 @@ macro_rules! 
request_imp { }}; } -impl PriorityBasedIORateLimiter { +impl PriorityBasedIoRateLimiter { fn new(strict: bool) -> Self { - PriorityBasedIORateLimiter { + PriorityBasedIoRateLimiter { strict, bytes_through: Default::default(), bytes_per_epoch: Default::default(), - protected: Mutex::new(PriorityBasedIORateLimiterProtected::new()), + protected: Mutex::new(PriorityBasedIoRateLimiterProtected::new()), } } /// Dynamically changes the total IO flow threshold. fn set_bytes_per_sec(&self, bytes_per_sec: usize) { let now = (bytes_per_sec as f64 * DEFAULT_REFILL_PERIOD.as_secs_f64()) as usize; - let before = self.bytes_per_epoch[IOPriority::High as usize].swap(now, Ordering::Relaxed); + let before = self.bytes_per_epoch[IoPriority::High as usize].swap(now, Ordering::Relaxed); RATE_LIMITER_MAX_BYTES_PER_SEC .high .set(bytes_per_sec as i64); if now == 0 || before == 0 { // Toggle on or off rate limit. let _locked = self.protected.lock(); - self.bytes_per_epoch[IOPriority::Medium as usize].store(now, Ordering::Relaxed); + self.bytes_per_epoch[IoPriority::Medium as usize].store(now, Ordering::Relaxed); RATE_LIMITER_MAX_BYTES_PER_SEC .medium .set(bytes_per_sec as i64); - self.bytes_per_epoch[IOPriority::Low as usize].store(now, Ordering::Relaxed); + self.bytes_per_epoch[IoPriority::Low as usize].store(now, Ordering::Relaxed); RATE_LIMITER_MAX_BYTES_PER_SEC.low.set(bytes_per_sec as i64); } } - fn set_low_priority_io_adjustor(&self, adjustor: Option>) { + fn set_low_priority_io_adjustor(&self, adjustor: Option>) { let mut locked = self.protected.lock(); locked.adjustor = adjustor; } - fn request(&self, priority: IOPriority, amount: usize) -> usize { + fn request(&self, priority: IoPriority, amount: usize) -> usize { request_imp!(self, priority, amount, sync) } - async fn async_request(&self, priority: IOPriority, amount: usize) -> usize { + async fn async_request(&self, priority: IoPriority, amount: usize) -> usize { request_imp!(self, priority, amount, async) } #[cfg(test)] - fn 
request_with_skewed_clock(&self, priority: IOPriority, amount: usize) -> usize { + fn request_with_skewed_clock(&self, priority: IoPriority, amount: usize) -> usize { request_imp!(self, priority, amount, skewed_sync) } /// Updates and refills IO budgets for next epoch based on IO priority. /// Here we provide best-effort priority control: - /// 1) Limited IO budget is assigned to lower priority to ensure higher priority can at least - /// consume the same IO amount as the last few epochs without breaching global threshold. - /// 2) Higher priority may temporarily use lower priority's IO budgets. When this happens, - /// total IO flow could exceed global threshold. - /// 3) Highest priority IO alone must not exceed global threshold (in strict mode). - fn refill(&self, locked: &mut PriorityBasedIORateLimiterProtected, now: Instant) { + /// - Limited IO budget is assigned to lower priority to ensure higher + /// priority can at least consume the same IO amount as the last few + /// epochs without breaching global threshold. + /// - Higher priority may temporarily use lower priority's IO budgets. When + /// this happens, total IO flow could exceed global threshold. + /// - Highest priority IO alone must not exceed global threshold (in strict + /// mode). + fn refill(&self, locked: &mut PriorityBasedIoRateLimiterProtected, now: Instant) { let mut total_budgets = - self.bytes_per_epoch[IOPriority::High as usize].load(Ordering::Relaxed); + self.bytes_per_epoch[IoPriority::High as usize].load(Ordering::Relaxed); if total_budgets == 0 { // It's possible that rate limit is toggled off in the meantime. 
return; @@ -361,15 +365,15 @@ impl PriorityBasedIORateLimiter { locked.next_refill_time = now + DEFAULT_REFILL_PERIOD; debug_assert!( - IOPriority::High as usize == IOPriority::Medium as usize + 1 - && IOPriority::Medium as usize == IOPriority::Low as usize + 1 + IoPriority::High as usize == IoPriority::Medium as usize + 1 + && IoPriority::Medium as usize == IoPriority::Low as usize + 1 ); let mut remaining_budgets = total_budgets; let mut used_budgets = 0; - for pri in &[IOPriority::High, IOPriority::Medium] { + for pri in &[IoPriority::High, IoPriority::Medium] { let p = *pri as usize; - // Skipped epochs can only serve pending requests rather that in-coming ones, catch up - // by subtracting them from pending_bytes. + // Skipped epochs can only serve pending requests rather that in-coming ones, + // catch up by subtracting them from pending_bytes. let served_by_skipped_epochs = std::cmp::min( (remaining_budgets as f32 * skipped_epochs) as usize, locked.pending_bytes[p], @@ -386,7 +390,7 @@ impl PriorityBasedIORateLimiter { used_budgets += ((served_by_first_epoch + served_by_skipped_epochs) as f32 / (skipped_epochs + 1.0)) as usize; // Only apply rate limit adjustments on low-priority IOs. - if *pri == IOPriority::Medium { + if *pri == IoPriority::Medium { if let Some(adjustor) = &locked.adjustor { total_budgets = adjustor.adjust(total_budgets); } @@ -396,7 +400,7 @@ impl PriorityBasedIORateLimiter { } else { 1 // A small positive value so not to disable flow control. 
}; - if *pri == IOPriority::High { + if *pri == IoPriority::High { RATE_LIMITER_MAX_BYTES_PER_SEC .medium .set((remaining_budgets * DEFAULT_REFILLS_PER_SEC) as i64); @@ -407,7 +411,7 @@ impl PriorityBasedIORateLimiter { } self.bytes_per_epoch[p - 1].store(remaining_budgets, Ordering::Relaxed); } - let p = IOPriority::Low as usize; + let p = IoPriority::Low as usize; let to_serve_pending_bytes = std::cmp::min(locked.pending_bytes[p], remaining_budgets); locked.pending_bytes[p] -= to_serve_pending_bytes; self.bytes_through[p].store(to_serve_pending_bytes, Ordering::Relaxed); @@ -423,7 +427,7 @@ impl PriorityBasedIORateLimiter { #[cfg(test)] fn reset(&self) { let mut locked = self.protected.lock(); - for p in &[IOPriority::High, IOPriority::Medium] { + for p in &[IoPriority::High, IoPriority::Medium] { let p = *p as usize; locked.pending_bytes[p] = 0; } @@ -431,26 +435,26 @@ impl PriorityBasedIORateLimiter { } /// A high-performance IO rate limiter used for prioritized flow control. -/// An instance of `IORateLimiter` can be safely shared between threads. -pub struct IORateLimiter { - mode: IORateLimitMode, - priority_map: [CachePadded; IOType::COUNT], - throughput_limiter: Arc, - stats: Option>, +/// An instance of `IoRateLimiter` can be safely shared between threads. 
+pub struct IoRateLimiter { + mode: IoRateLimitMode, + priority_map: [CachePadded; IoType::COUNT], + throughput_limiter: Arc, + stats: Option>, } -impl IORateLimiter { - pub fn new(mode: IORateLimitMode, strict: bool, enable_statistics: bool) -> Self { - let priority_map: [CachePadded; IOType::COUNT] = Default::default(); +impl IoRateLimiter { + pub fn new(mode: IoRateLimitMode, strict: bool, enable_statistics: bool) -> Self { + let priority_map: [CachePadded; IoType::COUNT] = Default::default(); for p in priority_map.iter() { - p.store(IOPriority::High as u32, Ordering::Relaxed); + p.store(IoPriority::High as u32, Ordering::Relaxed); } - IORateLimiter { + IoRateLimiter { mode, priority_map, - throughput_limiter: Arc::new(PriorityBasedIORateLimiter::new(strict)), + throughput_limiter: Arc::new(PriorityBasedIoRateLimiter::new(strict)), stats: if enable_statistics { - Some(Arc::new(IORateLimiterStatistics::new())) + Some(Arc::new(IoRateLimiterStatistics::new())) } else { None }, @@ -458,14 +462,14 @@ impl IORateLimiter { } pub fn new_for_test() -> Self { - IORateLimiter::new( - IORateLimitMode::AllIo, - true, /*strict*/ - true, /*enable_statistics*/ + IoRateLimiter::new( + IoRateLimitMode::AllIo, + true, // strict + true, // enable_statistics ) } - pub fn statistics(&self) -> Option> { + pub fn statistics(&self) -> Option> { self.stats.clone() } @@ -473,15 +477,15 @@ impl IORateLimiter { self.throughput_limiter.set_bytes_per_sec(rate); } - pub fn set_io_priority(&self, io_type: IOType, io_priority: IOPriority) { + pub fn set_io_priority(&self, io_type: IoType, io_priority: IoPriority) { self.priority_map[io_type as usize].store(io_priority as u32, Ordering::Relaxed); } pub fn set_low_priority_io_adjustor_if_needed( &self, - adjustor: Option>, + adjustor: Option>, ) { - if self.mode != IORateLimitMode::AllIo { + if self.mode != IoRateLimitMode::AllIo { self.throughput_limiter .set_low_priority_io_adjustor(adjustor); } @@ -490,12 +494,10 @@ impl IORateLimiter { /// 
Requests for token for bytes and potentially update statistics. If this /// request can not be satisfied, the call is blocked. Granted token can be /// less than the requested bytes, but must be greater than zero. - pub fn request(&self, io_type: IOType, io_op: IOOp, mut bytes: usize) -> usize { + pub fn request(&self, io_type: IoType, io_op: IoOp, mut bytes: usize) -> usize { if self.mode.contains(io_op) { bytes = self.throughput_limiter.request( - IOPriority::unsafe_from_u32( - self.priority_map[io_type as usize].load(Ordering::Relaxed), - ), + IoPriority::from_u32(self.priority_map[io_type as usize].load(Ordering::Relaxed)), bytes, ); } @@ -509,12 +511,12 @@ impl IORateLimiter { /// statistics. If this request can not be satisfied, the call is blocked. /// Granted token can be less than the requested bytes, but must be greater /// than zero. - pub async fn async_request(&self, io_type: IOType, io_op: IOOp, mut bytes: usize) -> usize { + pub async fn async_request(&self, io_type: IoType, io_op: IoOp, mut bytes: usize) -> usize { if self.mode.contains(io_op) { bytes = self .throughput_limiter .async_request( - IOPriority::unsafe_from_u32( + IoPriority::from_u32( self.priority_map[io_type as usize].load(Ordering::Relaxed), ), bytes, @@ -528,12 +530,10 @@ impl IORateLimiter { } #[cfg(test)] - fn request_with_skewed_clock(&self, io_type: IOType, io_op: IOOp, mut bytes: usize) -> usize { + fn request_with_skewed_clock(&self, io_type: IoType, io_op: IoOp, mut bytes: usize) -> usize { if self.mode.contains(io_op) { bytes = self.throughput_limiter.request_with_skewed_clock( - IOPriority::unsafe_from_u32( - self.priority_map[io_type as usize].load(Ordering::Relaxed), - ), + IoPriority::from_u32(self.priority_map[io_type as usize].load(Ordering::Relaxed)), bytes, ); } @@ -545,15 +545,15 @@ impl IORateLimiter { } lazy_static! 
{ - static ref IO_RATE_LIMITER: Mutex>> = Mutex::new(None); + static ref IO_RATE_LIMITER: Mutex>> = Mutex::new(None); } // Do NOT use this method in test environment. -pub fn set_io_rate_limiter(limiter: Option>) { +pub fn set_io_rate_limiter(limiter: Option>) { *IO_RATE_LIMITER.lock() = limiter; } -pub fn get_io_rate_limiter() -> Option> { +pub fn get_io_rate_limiter() -> Option> { (*IO_RATE_LIMITER.lock()).clone() } @@ -565,8 +565,8 @@ mod tests { macro_rules! approximate_eq { ($left:expr, $right:expr) => { - assert!(($left) >= ($right) * 0.85); - assert!(($right) >= ($left) * 0.85); + assert!(($left) >= ($right) * 0.75); + assert!(($right) >= ($left) * 0.75); }; } @@ -587,10 +587,10 @@ mod tests { } #[derive(Debug, Clone, Copy)] - struct Request(IOType, IOOp, usize); + struct Request(IoType, IoOp, usize); fn start_background_jobs( - limiter: &Arc, + limiter: &Arc, job_count: usize, request: Request, interval: Option, @@ -620,8 +620,8 @@ mod tests { #[test] fn test_rate_limit_toggle() { let bytes_per_sec = 2000; - let limiter = IORateLimiter::new_for_test(); - limiter.set_io_priority(IOType::Compaction, IOPriority::Low); + let limiter = IoRateLimiter::new_for_test(); + limiter.set_io_priority(IoType::Compaction, IoPriority::Low); let limiter = Arc::new(limiter); let stats = limiter.statistics().unwrap(); // enable rate limit @@ -629,20 +629,20 @@ mod tests { let t0 = Instant::now(); let _write_context = start_background_jobs( &limiter, - 1, /*job_count*/ - Request(IOType::ForegroundWrite, IOOp::Write, 10), - None, /*interval*/ + 1, // job_count + Request(IoType::ForegroundWrite, IoOp::Write, 10), + None, // interval ); let _compaction_context = start_background_jobs( &limiter, - 1, /*job_count*/ - Request(IOType::Compaction, IOOp::Write, 10), - None, /*interval*/ + 1, // job_count + Request(IoType::Compaction, IoOp::Write, 10), + None, // interval ); std::thread::sleep(Duration::from_secs(1)); let t1 = Instant::now(); approximate_eq!( - 
stats.fetch(IOType::ForegroundWrite, IOOp::Write) as f64, + stats.fetch(IoType::ForegroundWrite, IoOp::Write) as f64, bytes_per_sec as f64 * (t1 - t0).as_secs_f64() ); // disable rate limit @@ -651,11 +651,11 @@ mod tests { std::thread::sleep(Duration::from_secs(1)); let t2 = Instant::now(); assert!( - stats.fetch(IOType::ForegroundWrite, IOOp::Write) as f64 + stats.fetch(IoType::ForegroundWrite, IoOp::Write) as f64 > bytes_per_sec as f64 * (t2 - t1).as_secs_f64() * 4.0 ); assert!( - stats.fetch(IOType::Compaction, IOOp::Write) as f64 + stats.fetch(IoType::Compaction, IoOp::Write) as f64 > bytes_per_sec as f64 * (t2 - t1).as_secs_f64() * 4.0 ); // enable rate limit @@ -664,12 +664,12 @@ mod tests { std::thread::sleep(Duration::from_secs(1)); let t3 = Instant::now(); approximate_eq!( - stats.fetch(IOType::ForegroundWrite, IOOp::Write) as f64, + stats.fetch(IoType::ForegroundWrite, IoOp::Write) as f64, bytes_per_sec as f64 * (t3 - t2).as_secs_f64() ); } - fn verify_rate_limit(limiter: &Arc, bytes_per_sec: usize, duration: Duration) { + fn verify_rate_limit(limiter: &Arc, bytes_per_sec: usize, duration: Duration) { let stats = limiter.statistics().unwrap(); limiter.set_io_rate_limit(bytes_per_sec); stats.reset(); @@ -679,9 +679,9 @@ mod tests { { let _context = start_background_jobs( limiter, - 2, /*job_count*/ - Request(IOType::ForegroundWrite, IOOp::Write, 10), - None, /*interval*/ + 2, // job_count + Request(IoType::ForegroundWrite, IoOp::Write, 10), + None, // interval ); std::thread::sleep(duration); } @@ -689,7 +689,7 @@ mod tests { end.duration_since(begin) }; approximate_eq!( - stats.fetch(IOType::ForegroundWrite, IOOp::Write) as f64, + stats.fetch(IoType::ForegroundWrite, IoOp::Write) as f64, bytes_per_sec as f64 * actual_duration.as_secs_f64() ); } @@ -697,14 +697,14 @@ mod tests { #[test] fn test_rate_limit_dynamic_priority() { let bytes_per_sec = 2000; - let limiter = Arc::new(IORateLimiter::new( - IORateLimitMode::AllIo, - false, /*strict*/ - true, 
/*enable_statistics*/ + let limiter = Arc::new(IoRateLimiter::new( + IoRateLimitMode::AllIo, + false, // strict + true, // enable_statistics )); - limiter.set_io_priority(IOType::ForegroundWrite, IOPriority::Medium); + limiter.set_io_priority(IoType::ForegroundWrite, IoPriority::Medium); verify_rate_limit(&limiter, bytes_per_sec, Duration::from_secs(2)); - limiter.set_io_priority(IOType::ForegroundWrite, IOPriority::High); + limiter.set_io_priority(IoType::ForegroundWrite, IoPriority::High); let stats = limiter.statistics().unwrap(); stats.reset(); let duration = { @@ -712,9 +712,9 @@ mod tests { { let _context = start_background_jobs( &limiter, - 2, /*job_count*/ - Request(IOType::ForegroundWrite, IOOp::Write, 10), - None, /*interval*/ + 2, // job_count + Request(IoType::ForegroundWrite, IoOp::Write, 10), + None, // interval ); std::thread::sleep(Duration::from_secs(2)); } @@ -722,7 +722,7 @@ mod tests { end.duration_since(begin) }; assert!( - stats.fetch(IOType::ForegroundWrite, IOOp::Write) as f64 + stats.fetch(IoType::ForegroundWrite, IoOp::Write) as f64 > bytes_per_sec as f64 * duration.as_secs_f64() * 1.5 ); } @@ -731,7 +731,7 @@ mod tests { fn test_rate_limited_heavy_flow() { let low_bytes_per_sec = 2000; let high_bytes_per_sec = 10000; - let limiter = Arc::new(IORateLimiter::new_for_test()); + let limiter = Arc::new(IoRateLimiter::new_for_test()); verify_rate_limit(&limiter, low_bytes_per_sec, Duration::from_secs(2)); verify_rate_limit(&limiter, high_bytes_per_sec, Duration::from_secs(2)); verify_rate_limit(&limiter, low_bytes_per_sec, Duration::from_secs(2)); @@ -741,7 +741,7 @@ mod tests { fn test_rate_limited_light_flow() { let kbytes_per_sec = 3; let actual_kbytes_per_sec = 2; - let limiter = Arc::new(IORateLimiter::new_for_test()); + let limiter = Arc::new(IoRateLimiter::new_for_test()); limiter.set_io_rate_limit(kbytes_per_sec * 1000); let stats = limiter.statistics().unwrap(); let duration = { @@ -750,8 +750,8 @@ mod tests { // each thread request at 
most 1000 bytes per second let _context = start_background_jobs( &limiter, - actual_kbytes_per_sec, /*job_count*/ - Request(IOType::Compaction, IOOp::Write, 1), + actual_kbytes_per_sec, // job_count + Request(IoType::Compaction, IoOp::Write, 1), Some(Duration::from_millis(1)), ); std::thread::sleep(Duration::from_secs(2)); @@ -760,7 +760,7 @@ mod tests { end.duration_since(begin) }; approximate_eq!( - stats.fetch(IOType::Compaction, IOOp::Write) as f64, + stats.fetch(IoType::Compaction, IoOp::Write) as f64, actual_kbytes_per_sec as f64 * duration.as_secs_f64() * 1000.0 ); } @@ -771,40 +771,40 @@ mod tests { let write_work = 50; let compaction_work = 80; let import_work = 50; - let limiter = IORateLimiter::new_for_test(); + let limiter = IoRateLimiter::new_for_test(); limiter.set_io_rate_limit(bytes_per_sec); - limiter.set_io_priority(IOType::Compaction, IOPriority::Medium); - limiter.set_io_priority(IOType::Import, IOPriority::Low); + limiter.set_io_priority(IoType::Compaction, IoPriority::Medium); + limiter.set_io_priority(IoType::Import, IoPriority::Low); let stats = limiter.statistics().unwrap(); let limiter = Arc::new(limiter); let begin = Instant::now(); { let _write = start_background_jobs( &limiter, - 1, /*job_count*/ + 1, // job_count Request( - IOType::ForegroundWrite, - IOOp::Write, + IoType::ForegroundWrite, + IoOp::Write, write_work * bytes_per_sec / 100 / 1000, ), Some(Duration::from_millis(1)), ); let _compaction = start_background_jobs( &limiter, - 1, /*job_count*/ + 1, // job_count Request( - IOType::Compaction, - IOOp::Write, + IoType::Compaction, + IoOp::Write, compaction_work * bytes_per_sec / 100 / 1000, ), Some(Duration::from_millis(1)), ); let _import = start_background_jobs( &limiter, - 1, /*job_count*/ + 1, // job_count Request( - IOType::Import, - IOOp::Write, + IoType::Import, + IoOp::Write, import_work * bytes_per_sec / 100 / 1000, ), Some(Duration::from_millis(1)), @@ -813,20 +813,20 @@ mod tests { } let end = Instant::now(); let 
duration = end.duration_since(begin); - let write_bytes = stats.fetch(IOType::ForegroundWrite, IOOp::Write); + let write_bytes = stats.fetch(IoType::ForegroundWrite, IoOp::Write); approximate_eq!( write_bytes as f64, (write_work * bytes_per_sec / 100) as f64 * duration.as_secs_f64() ); - let compaction_bytes = stats.fetch(IOType::Compaction, IOOp::Write); - let import_bytes = stats.fetch(IOType::Import, IOOp::Write); + let compaction_bytes = stats.fetch(IoType::Compaction, IoOp::Write); + let import_bytes = stats.fetch(IoType::Import, IoOp::Write); let total_bytes = write_bytes + import_bytes + compaction_bytes; approximate_eq!((compaction_bytes + write_bytes) as f64, total_bytes as f64); } #[bench] fn bench_critical_section(b: &mut test::Bencher) { - let inner_limiter = PriorityBasedIORateLimiter::new(true /*strict*/); + let inner_limiter = PriorityBasedIoRateLimiter::new(true /* strict */); inner_limiter.set_bytes_per_sec(1024); let now = Instant::now_coarse(); b.iter(|| { diff --git a/components/into_other/Cargo.toml b/components/into_other/Cargo.toml index be278cdc764..d31f04f4e12 100644 --- a/components/into_other/Cargo.toml +++ b/components/into_other/Cargo.toml @@ -5,6 +5,6 @@ edition = "2018" publish = false [dependencies] -engine_traits = { path = "../engine_traits", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +engine_traits = { workspace = true } +kvproto = { workspace = true } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } diff --git a/components/keys/Cargo.toml b/components/keys/Cargo.toml index de1a7089ce4..b5a6412d00a 100644 --- a/components/keys/Cargo.toml +++ b/components/keys/Cargo.toml @@ -6,11 +6,11 @@ publish = false [dependencies] byteorder = "1.2" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } -log_wrappers = { path = "../log_wrappers" } +kvproto = { workspace = true } +log_wrappers = { workspace = true } thiserror = "1.0" -tikv_alloc = { path = 
"../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } [dev-dependencies] -panic_hook = { path = "../panic_hook" } +panic_hook = { workspace = true } diff --git a/components/keys/src/lib.rs b/components/keys/src/lib.rs index a403b939727..304e13f1e66 100644 --- a/components/keys/src/lib.rs +++ b/components/keys/src/lib.rs @@ -33,6 +33,7 @@ pub const DATA_MAX_KEY: &[u8] = &[DATA_PREFIX + 1]; // Following keys are all local keys, so the first byte must be 0x01. pub const STORE_IDENT_KEY: &[u8] = &[LOCAL_PREFIX, 0x01]; pub const PREPARE_BOOTSTRAP_KEY: &[u8] = &[LOCAL_PREFIX, 0x02]; +pub const RECOVER_STATE_KEY: &[u8] = &[LOCAL_PREFIX, 0x03]; // We save two types region data in DB, for raft and other meta data. // When the store starts, we should iterate all region meta data to // construct peer, no need to travel large raft data, so we separate them @@ -226,26 +227,26 @@ pub fn origin_key(key: &[u8]) -> &[u8] { /// Get the `start_key` of current region in encoded form. pub fn enc_start_key(region: &Region) -> Vec { - // only initialized region's start_key can be encoded, otherwise there must be bugs - // somewhere. + // only initialized region's start_key can be encoded, otherwise there must be + // bugs somewhere. assert!(!region.get_peers().is_empty()); data_key(region.get_start_key()) } /// Get the `end_key` of current region in encoded form. pub fn enc_end_key(region: &Region) -> Vec { - // only initialized region's end_key can be encoded, otherwise there must be bugs - // somewhere. + // only initialized region's end_key can be encoded, otherwise there must be + // bugs somewhere. 
assert!(!region.get_peers().is_empty()); data_end_key(region.get_end_key()) } #[inline] -pub fn data_end_key(region_end_key: &[u8]) -> Vec { - if region_end_key.is_empty() { +pub fn data_end_key(key: &[u8]) -> Vec { + if key.is_empty() { DATA_MAX_KEY.to_vec() } else { - data_key(region_end_key) + data_key(key) } } @@ -415,17 +416,17 @@ mod tests { let state_key = raft_state_key(1); // invalid length - assert!(decode_raft_log_key(&state_key).is_err()); + decode_raft_log_key(&state_key).unwrap_err(); let mut state_key = state_key.to_vec(); state_key.write_u64::(2).unwrap(); // invalid suffix - assert!(decode_raft_log_key(&state_key).is_err()); + decode_raft_log_key(&state_key).unwrap_err(); let mut region_state_key = region_state_key(1).to_vec(); region_state_key.write_u64::(2).unwrap(); // invalid prefix - assert!(decode_raft_log_key(®ion_state_key).is_err()); + decode_raft_log_key(®ion_state_key).unwrap_err(); } #[test] @@ -439,9 +440,10 @@ mod tests { assert_eq!(buffer, data_key(b"cde")); let mut region = Region::default(); - // uninitialised region should not be passed in `enc_start_key` and `enc_end_key`. - assert!(::panic_hook::recover_safe(|| enc_start_key(®ion)).is_err()); - assert!(::panic_hook::recover_safe(|| enc_end_key(®ion)).is_err()); + // uninitialised region should not be passed in `enc_start_key` and + // `enc_end_key`. + ::panic_hook::recover_safe(|| enc_start_key(®ion)).unwrap_err(); + ::panic_hook::recover_safe(|| enc_end_key(®ion)).unwrap_err(); region.mut_peers().push(Peer::default()); assert_eq!(enc_start_key(®ion), vec![DATA_PREFIX]); diff --git a/components/keys/src/rewrite.rs b/components/keys/src/rewrite.rs index 03b6ea27c4f..68541bb50e0 100644 --- a/components/keys/src/rewrite.rs +++ b/components/keys/src/rewrite.rs @@ -6,11 +6,21 @@ use std::ops::Bound::{self, *}; +use tikv_util::codec::bytes::encode_bytes; + /// An error indicating the key cannot be rewritten because it does not start /// with the given prefix. 
-#[derive(PartialEq, Eq, Debug, Clone)] +#[derive(PartialEq, Debug, Clone)] pub struct WrongPrefix; +pub fn encode_bound(bound: Bound>) -> Bound> { + match bound { + Included(k) => Included(encode_bytes(&k)), + Excluded(k) => Excluded(encode_bytes(&k)), + Unbounded => Unbounded, + } +} + /// Rewrites the prefix of a byte array. pub fn rewrite_prefix( old_prefix: &[u8], diff --git a/components/log_wrappers/Cargo.toml b/components/log_wrappers/Cargo.toml index e8e9a3cc52f..4c9e62b6876 100644 --- a/components/log_wrappers/Cargo.toml +++ b/components/log_wrappers/Cargo.toml @@ -9,4 +9,4 @@ hex = "0.4" protobuf = { version = "2.8", features = ["bytes"] } slog = "2.3" slog-term = "2.4" -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } diff --git a/components/log_wrappers/src/lib.rs b/components/log_wrappers/src/lib.rs index 986c1710137..5361eaeee18 100644 --- a/components/log_wrappers/src/lib.rs +++ b/components/log_wrappers/src/lib.rs @@ -1,6 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -//! Provides wrappers for types that comes from 3rd-party and does not implement slog::Value. +//! Provides wrappers for types that comes from 3rd-party and does not implement +//! slog::Value. #[macro_use] extern crate slog; @@ -21,10 +22,11 @@ pub mod test_util; /// Wraps any `Display` type, use `Display` as `slog::Value`. /// -/// Usually this wrapper is useful in containers, e.g. `Option>`. +/// Usually this wrapper is useful in containers, e.g. +/// `Option>`. /// -/// If your type `val: T` is directly used as a field value, you may use `"key" => %value` syntax -/// instead. +/// If your type `val: T` is directly used as a field value, you may use `"key" +/// => %value` syntax instead. pub struct DisplayValue(pub T); impl slog::Value for DisplayValue { @@ -43,8 +45,8 @@ impl slog::Value for DisplayValue { /// /// Usually this wrapper is useful in containers, e.g. `Option>`. 
/// -/// If your type `val: T` is directly used as a field value, you may use `"key" => ?value` syntax -/// instead. +/// If your type `val: T` is directly used as a field value, you may use `"key" +/// => ?value` syntax instead. pub struct DebugValue(pub T); impl slog::Value for DebugValue { diff --git a/components/log_wrappers/src/test_util.rs b/components/log_wrappers/src/test_util.rs index a527ac379eb..d455e52c620 100644 --- a/components/log_wrappers/src/test_util.rs +++ b/components/log_wrappers/src/test_util.rs @@ -4,7 +4,8 @@ use std::{io, sync}; -/// A buffer which can be served as a logging destination while being able to access its content. +/// A buffer which can be served as a logging destination while being able to +/// access its content. #[derive(Clone, Default)] pub struct SyncLoggerBuffer(sync::Arc>>); @@ -14,8 +15,8 @@ impl SyncLoggerBuffer { Self::default() } - /// Builds a `slog::Logger` over this buffer which uses compact format and always output `TIME` - /// in the time field. + /// Builds a `slog::Logger` over this buffer which uses compact format and + /// always output `TIME` in the time field. pub fn build_logger(&self) -> slog::Logger { use slog::Drain; diff --git a/components/match_template/src/lib.rs b/components/match_template/src/lib.rs deleted file mode 100644 index eb50d333379..00000000000 --- a/components/match_template/src/lib.rs +++ /dev/null @@ -1,261 +0,0 @@ -// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. - -#[macro_use] -extern crate quote; - -use proc_macro2::{Group, TokenStream, TokenTree}; -use quote::ToTokens; -use syn::{ - parse::{Parse, ParseStream, Result}, - punctuated::Punctuated, - *, -}; - -/// This crate provides a macro that can be used to append a match expression with multiple -/// arms, where the tokens in the first arm, as a template, can be subsitituted and the template -/// arm will be expanded into multiple arms. 
-/// -/// For example, the following code -/// -/// ```ignore -/// match_template! { -/// T = [Int, Real, Double], -/// match Foo { -/// EvalType::T => { panic!("{}", EvalType::T); }, -/// EvalType::Other => unreachable!(), -/// } -/// } -/// ``` -/// -/// generates -/// -/// ```ignore -/// match Foo { -/// EvalType::Int => { panic!("{}", EvalType::Int); }, -/// EvalType::Real => { panic!("{}", EvalType::Real); }, -/// EvalType::Double => { panic!("{}", EvalType::Double); }, -/// EvalType::Other => unreachable!(), -/// } -/// ``` -/// -/// In addition, substitution can vary on two sides of the arms. -/// -/// For example, -/// -/// ```ignore -/// match_template! { -/// T = [Foo, Bar => Baz], -/// match Foo { -/// EvalType::T => { panic!("{}", EvalType::T); }, -/// } -/// } -/// ``` -/// -/// generates -/// -/// ```ignore -/// match Foo { -/// EvalType::Foo => { panic!("{}", EvalType::Foo); }, -/// EvalType::Bar => { panic!("{}", EvalType::Baz); }, -/// } -/// ``` -/// -/// Wildcard match arm is also supported (but there will be no substitution). 
-#[proc_macro] -pub fn match_template(input: proc_macro::TokenStream) -> proc_macro::TokenStream { - let mt = parse_macro_input!(input as MatchTemplate); - mt.expand().into() -} -struct MatchTemplate { - template_ident: Ident, - substitutes: Punctuated, - match_exp: Box, - template_arm: Arm, - remaining_arms: Vec, -} - -impl Parse for MatchTemplate { - fn parse(input: ParseStream<'_>) -> Result { - let template_ident = input.parse()?; - input.parse::()?; - let substitutes_tokens; - bracketed!(substitutes_tokens in input); - let substitutes = - Punctuated::::parse_terminated(&substitutes_tokens)?; - input.parse::()?; - let m: ExprMatch = input.parse()?; - let mut arms = m.arms; - arms.iter_mut().for_each(|arm| arm.comma = None); - assert!(!arms.is_empty(), "Expect at least 1 match arm"); - let template_arm = arms.remove(0); - assert!(template_arm.guard.is_none(), "Expect no match arm guard"); - - Ok(Self { - template_ident, - substitutes, - match_exp: m.expr, - template_arm, - remaining_arms: arms, - }) - } -} - -impl MatchTemplate { - fn expand(self) -> TokenStream { - let Self { - template_ident, - substitutes, - match_exp, - template_arm, - remaining_arms, - } = self; - let match_arms = substitutes.into_iter().map(|substitute| { - let mut arm = template_arm.clone(); - let (left_tokens, right_tokens) = match substitute { - Substitution::Identical(ident) => { - (ident.clone().into_token_stream(), ident.into_token_stream()) - } - Substitution::Map(left_ident, right_tokens) => { - (left_ident.into_token_stream(), right_tokens) - } - }; - arm.pat = replace_in_token_stream(arm.pat, &template_ident, &left_tokens); - arm.body = replace_in_token_stream(arm.body, &template_ident, &right_tokens); - arm - }); - quote! 
{ - match #match_exp { - #(#match_arms,)* - #(#remaining_arms,)* - } - } - } -} - -#[derive(Debug)] -enum Substitution { - Identical(Ident), - Map(Ident, TokenStream), -} - -impl Parse for Substitution { - fn parse(input: ParseStream<'_>) -> Result { - let left_ident = input.parse()?; - let fat_arrow: Option]> = input.parse()?; - if fat_arrow.is_some() { - let mut right_tokens: Vec = vec![]; - while !input.peek(Token![,]) && !input.is_empty() { - right_tokens.push(input.parse()?); - } - Ok(Substitution::Map( - left_ident, - right_tokens.into_iter().collect(), - )) - } else { - Ok(Substitution::Identical(left_ident)) - } - } -} - -fn replace_in_token_stream( - input: T, - from_ident: &Ident, - to_tokens: &TokenStream, -) -> T { - let mut tokens = TokenStream::new(); - input.to_tokens(&mut tokens); - - let tokens: TokenStream = tokens - .into_iter() - .flat_map(|token| match token { - TokenTree::Ident(ident) if ident == *from_ident => to_tokens.clone(), - TokenTree::Group(group) => Group::new( - group.delimiter(), - replace_in_token_stream(group.stream(), from_ident, to_tokens), - ) - .into_token_stream(), - other => other.into(), - }) - .collect(); - - syn::parse2(tokens).unwrap() -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_basic() { - let input = r#" - T = [Int, Real, Double], - match foo() { - EvalType::T => { panic!("{}", EvalType::T); }, - EvalType::Other => unreachable!(), - } - "#; - - let expect_output = r#" - match foo() { - EvalType::Int => { panic!("{}", EvalType::Int); }, - EvalType::Real => { panic!("{}", EvalType::Real); }, - EvalType::Double => { panic!("{}", EvalType::Double); }, - EvalType::Other => unreachable!(), - } - "#; - let expect_output_stream: TokenStream = expect_output.parse().unwrap(); - - let mt: MatchTemplate = syn::parse_str(input).unwrap(); - let output = mt.expand(); - assert_eq!(output.to_string(), expect_output_stream.to_string()); - } - - #[test] - fn test_wildcard() { - let input = r#" - TT = [Foo, Bar], 
- match v { - VectorValue::TT => EvalType::TT, - _ => unreachable!(), - } - "#; - - let expect_output = r#" - match v { - VectorValue::Foo => EvalType::Foo, - VectorValue::Bar => EvalType::Bar, - _ => unreachable!(), - } - "#; - let expect_output_stream: TokenStream = expect_output.parse().unwrap(); - - let mt: MatchTemplate = syn::parse_str(input).unwrap(); - let output = mt.expand(); - assert_eq!(output.to_string(), expect_output_stream.to_string()); - } - - #[test] - fn test_map() { - let input = r#" - TT = [Foo, Bar => Baz, Bark => <&'static Whooh>()], - match v { - VectorValue::TT => EvalType::TT, - EvalType::Other => unreachable!(), - } - "#; - - let expect_output = r#" - match v { - VectorValue::Foo => EvalType::Foo, - VectorValue::Bar => EvalType::Baz, - VectorValue::Bark => EvalType:: < & 'static Whooh>(), - EvalType::Other => unreachable!(), - } - "#; - let expect_output_stream: TokenStream = expect_output.parse().unwrap(); - - let mt: MatchTemplate = syn::parse_str(input).unwrap(); - let output = mt.expand(); - assert_eq!(output.to_string(), expect_output_stream.to_string()); - } -} diff --git a/components/online_config/online_config_derive/src/lib.rs b/components/online_config/online_config_derive/src/lib.rs index 0981668d817..5518aa0e5e6 100644 --- a/components/online_config/online_config_derive/src/lib.rs +++ b/components/online_config/online_config_derive/src/lib.rs @@ -123,11 +123,7 @@ fn encoder( } }; // Only reserve attributes that related to `serde` - field.attrs = field - .attrs - .into_iter() - .filter(|f| is_attr("serde", f)) - .collect(); + field.attrs.retain(|f| is_attr("serde", f)); serialize_fields.push(field); } // Only reserve attributes that related to `serde` @@ -172,7 +168,7 @@ fn update(fields: &Punctuated, crate_name: &Ident) -> Result, crate_name: &Ident) -> Result std::result::Result<(), Box> { #(#update_fields)* + Ok(()) } }) } diff --git a/components/online_config/src/lib.rs b/components/online_config/src/lib.rs index 
51f1580cafd..18d9cc0fd71 100644 --- a/components/online_config/src/lib.rs +++ b/components/online_config/src/lib.rs @@ -20,8 +20,6 @@ pub enum ConfigValue { Usize(usize), Bool(bool), String(String), - BlobRunMode(String), - IOPriority(String), Module(ConfigChange), Skip, None, @@ -39,8 +37,6 @@ impl Display for ConfigValue { ConfigValue::Usize(v) => write!(f, "{}", v), ConfigValue::Bool(v) => write!(f, "{}", v), ConfigValue::String(v) => write!(f, "{}", v), - ConfigValue::BlobRunMode(v) => write!(f, "{}", v), - ConfigValue::IOPriority(v) => write!(f, "{}", v), ConfigValue::Module(v) => write!(f, "{:?}", v), ConfigValue::Skip => write!(f, "ConfigValue::Skip"), ConfigValue::None => write!(f, ""), @@ -55,7 +51,7 @@ impl Debug for ConfigValue { } macro_rules! impl_from { - ($from: ty, $to: tt) => { + ($from:ty, $to:tt) => { impl From<$from> for ConfigValue { fn from(r: $from) -> ConfigValue { ConfigValue::$to(r) @@ -73,7 +69,7 @@ impl_from!(String, String); impl_from!(ConfigChange, Module); macro_rules! impl_into { - ($into: ty, $from: tt) => { + ($into:ty, $from:tt) => { impl From for $into { fn from(c: ConfigValue) -> $into { if let ConfigValue::$from(v) = c { @@ -115,13 +111,13 @@ impl_into!(ConfigChange, Module); /// 3. `#[online_config(submodule)]` field, these fields represent the /// submodule, and should also derive `OnlineConfig` /// 4. 
normal fields, the type of these fields should be implment -/// `Into` and `From` for `ConfigValue` +/// `Into` and `From`/`TryFrom` for `ConfigValue` pub trait OnlineConfig<'a> { type Encoder: serde::Serialize; /// Compare to other config, return the difference fn diff(&self, _: &Self) -> ConfigChange; /// Update config with difference returned by `diff` - fn update(&mut self, _: ConfigChange); + fn update(&mut self, _: ConfigChange) -> Result<()>; /// Get encoder that can be serialize with `serde::Serializer` /// with the disappear of `#[online_config(hidden)]` field fn get_encoder(&'a self) -> Self::Encoder; @@ -137,6 +133,10 @@ pub trait ConfigManager: Send + Sync { #[cfg(test)] mod tests { + use std::convert::TryFrom; + + use serde::Serialize; + use super::*; use crate as online_config; @@ -194,7 +194,7 @@ mod tests { assert_eq!(sub_diff.remove("field1").map(Into::into), Some(1000u64)); assert_eq!(sub_diff.remove("field2").map(Into::into), Some(true)); } - cfg.update(diff); + cfg.update(diff).unwrap(); assert_eq!(cfg, updated_cfg, "cfg should be updated"); } @@ -204,7 +204,7 @@ mod tests { let diff = cfg.diff(&cfg.clone()); assert!(diff.is_empty(), "diff should be empty"); - cfg.update(diff); + cfg.update(diff).unwrap(); assert_eq!(cfg, TestConfig::default(), "cfg should not be updated"); } @@ -218,7 +218,7 @@ mod tests { let mut diff = HashMap::new(); diff.insert("skip_field".to_owned(), ConfigValue::U64(123)); - cfg.update(diff); + cfg.update(diff).unwrap(); assert_eq!(cfg, TestConfig::default(), "cfg should not be updated"); } @@ -241,7 +241,7 @@ mod tests { assert_eq!(sub_diff.remove("field2").map(Into::into), Some(true)); } - cfg.update(diff); + cfg.update(diff).unwrap(); assert_eq!( cfg.submodule_field, updated_cfg.submodule_field, "submodule should be updated" @@ -295,4 +295,75 @@ mod tests { "skip-field = \"\"\n\n[submodule-field]\nrename_field = false\n" ); } + + #[derive(Clone, Copy, Debug, PartialEq, Serialize)] + pub enum TestEnum { + First, + 
Second, + } + + impl std::fmt::Display for TestEnum { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::First => f.write_str("first"), + Self::Second => f.write_str("second"), + } + } + } + + impl From for ConfigValue { + fn from(v: TestEnum) -> ConfigValue { + ConfigValue::String(format!("{}", v)) + } + } + + impl TryFrom for TestEnum { + type Error = String; + fn try_from(v: ConfigValue) -> std::result::Result { + if let ConfigValue::String(s) = v { + match s.as_str() { + "first" => Ok(Self::First), + "second" => Ok(Self::Second), + s => Err(format!("invalid config value: {}", s)), + } + } else { + panic!("expect ConfigValue::String, got: {:?}", v); + } + } + } + + #[derive(Clone, OnlineConfig, Debug, PartialEq)] + pub struct TestEnumConfig { + f1: u64, + e: TestEnum, + } + + impl Default for TestEnumConfig { + fn default() -> Self { + Self { + f1: 0, + e: TestEnum::First, + } + } + } + + #[test] + fn test_update_enum_config() { + let mut config = TestEnumConfig::default(); + + let mut diff = HashMap::new(); + diff.insert("f1".to_owned(), ConfigValue::U64(1)); + diff.insert("e".to_owned(), ConfigValue::String("second".into())); + config.update(diff).unwrap(); + + let updated = TestEnumConfig { + f1: 1, + e: TestEnum::Second, + }; + assert_eq!(config, updated); + + let mut diff = HashMap::new(); + diff.insert("e".to_owned(), ConfigValue::String("invalid".into())); + config.update(diff).unwrap_err(); + } } diff --git a/components/panic_hook/src/lib.rs b/components/panic_hook/src/lib.rs index 12db221dbb5..7e95ea4071a 100644 --- a/components/panic_hook/src/lib.rs +++ b/components/panic_hook/src/lib.rs @@ -55,7 +55,8 @@ fn track_hook(p: &PanicInfo<'_>) { /// Recover from closure which may panic. /// -/// This function assumes the closure is able to be forced to implement `UnwindSafe`. +/// This function assumes the closure is able to be forced to implement +/// `UnwindSafe`. 
/// /// Also see [`AssertUnwindSafe`](https://doc.rust-lang.org/std/panic/struct.AssertUnwindSafe.html). pub fn recover_safe(f: F) -> std::thread::Result diff --git a/components/pd_client/Cargo.toml b/components/pd_client/Cargo.toml index 44f09485705..976ad90432a 100644 --- a/components/pd_client/Cargo.toml +++ b/components/pd_client/Cargo.toml @@ -6,28 +6,30 @@ publish = false [features] failpoints = ["fail/failpoints"] +testexport = [] [dependencies] -collections = { path = "../collections" } -error_code = { path = "../error_code", default-features = false } +collections = { workspace = true } +error_code = { workspace = true } fail = "0.5" futures = "0.3" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +grpcio = { workspace = true } +kvproto = { workspace = true } lazy_static = "1.3" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } -security = { path = "../security", default-features = false } +prometheus-static-metric = "0.5" +security = { workspace = true } semver = "0.10" serde = "1.0" serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1", features = ["sync"] } -tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } -txn_types = { path = "../txn_types", default-features 
= false } -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +tokio-timer = { workspace = true } +txn_types = { workspace = true } +yatp = { workspace = true } diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index facf2e24b76..36f7aaa983b 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -1,7 +1,6 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. use std::{ - collections::HashMap, fmt, sync::{ atomic::{AtomicU64, Ordering}, @@ -15,36 +14,40 @@ use futures::{ channel::mpsc, compat::{Compat, Future01CompatExt}, executor::block_on, - future::{self, BoxFuture, FutureExt, TryFutureExt}, + future::{self, BoxFuture, FutureExt, TryFlattenStream, TryFutureExt}, sink::SinkExt, - stream::StreamExt, + stream::{ErrInto, StreamExt}, + TryStreamExt, }; -use grpcio::{CallOption, EnvBuilder, Environment, WriteFlags}; +use grpcio::{EnvBuilder, Environment, WriteFlags}; use kvproto::{ + meta_storagepb::{ + self as mpb, GetRequest, GetResponse, PutRequest, WatchRequest, WatchResponse, + }, metapb, pdpb::{self, Member}, replication_modepb::{RegionReplicationStatus, ReplicationStatus, StoreDrAutoSyncStatus}, }; use security::SecurityManager; use tikv_util::{ - box_err, debug, error, info, thd_name, - time::{duration_to_sec, Instant}, - timer::GLOBAL_TIMER_HANDLE, - warn, Either, HandyRwLock, + box_err, debug, error, info, thd_name, time::Instant, timer::GLOBAL_TIMER_HANDLE, warn, Either, + HandyRwLock, }; use txn_types::TimeStamp; use yatp::{task::future::TaskCell, ThreadPool}; use super::{ + meta_storage::{Get, MetaStorageClient, Put, Watch}, metrics::*, - util::{check_resp_header, sync_request, Client, PdConnector}, + util::{call_option_inner, check_resp_header, sync_request, Client, PdConnector}, BucketStat, Config, Error, FeatureGate, PdClient, PdFuture, RegionInfo, RegionStat, Result, UnixSecs, REQUEST_TIMEOUT, }; -const CQ_COUNT: usize = 1; -const CLIENT_PREFIX: 
&str = "pd"; +pub const CQ_COUNT: usize = 1; +pub const CLIENT_PREFIX: &str = "pd"; +#[derive(Clone)] pub struct RpcClient { cluster_id: u64, pd_client: Arc, @@ -86,7 +89,7 @@ impl RpcClient { ); let pd_connector = PdConnector::new(env.clone(), security_mgr.clone()); for i in 0..retries { - match pd_connector.validate_endpoints(cfg).await { + match pd_connector.validate_endpoints(cfg, true).await { Ok((client, target, members, tso)) => { let cluster_id = members.get_header().get_cluster_id(); let rpc_client = RpcClient { @@ -97,7 +100,7 @@ impl RpcClient { client, members, target, - tso, + tso.unwrap(), cfg.enable_forwarding, )), monitor: monitor.clone(), @@ -189,39 +192,27 @@ impl RpcClient { block_on(self.pd_client.reconnect(true)) } - /// Creates a new call option with default request timeout. - #[inline] - pub fn call_option(client: &Client) -> CallOption { - client - .inner - .rl() - .target_info() - .call_option() - .timeout(Duration::from_secs(REQUEST_TIMEOUT)) - } - /// Gets given key's Region and Region's leader from PD. 
fn get_region_and_leader( &self, key: &[u8], ) -> PdFuture<(metapb::Region, Option)> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_region.start_coarse_timer(); let mut req = pdpb::GetRegionRequest::default(); req.set_header(self.header()); req.set_region_key(key.to_vec()); let executor = move |client: &Client, req: pdpb::GetRegionRequest| { - let handler = client - .inner - .rl() - .client_stub - .get_region_async_opt(&req, Self::call_option(client)) - .unwrap_or_else(|e| { - panic!("fail to request PD {} err {:?}", "get_region_async_opt", e) - }); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .get_region_async_opt(&req, call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_region_async_opt", e) + }) + }; Box::pin(async move { let mut resp = handler.await?; @@ -253,18 +244,21 @@ impl RpcClient { req.set_store_id(store_id); let executor = move |client: &Client, req: pdpb::GetStoreRequest| { - let handler = client - .inner - .rl() - .client_stub - .get_store_async_opt(&req, Self::call_option(client)) - .unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "get_store_async", e)); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .get_store_async_opt(&req, call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_store_async", e) + }) + }; Box::pin(async move { let mut resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_store_async"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_store_async + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; let store = resp.take_store(); if store.get_state() != metapb::StoreState::Tombstone { @@ -281,6 +275,41 @@ impl RpcClient { } } +fn get_region_resp_by_id( + pd_client: Arc, + header: pdpb::RequestHeader, + 
region_id: u64, +) -> PdFuture { + let timer = Instant::now(); + let mut req = pdpb::GetRegionByIdRequest::default(); + req.set_header(header); + req.set_region_id(region_id); + + let executor = move |client: &Client, req: pdpb::GetRegionByIdRequest| { + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .get_region_by_id_async_opt(&req, call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_region_by_id", e); + }) + }; + Box::pin(async move { + let resp = handler.await?; + PD_REQUEST_HISTOGRAM_VEC + .get_region_by_id + .observe(timer.saturating_elapsed_secs()); + check_resp_header(resp.get_header())?; + Ok(resp) + }) as PdFuture<_> + }; + + pd_client + .request(req, executor, LEADER_CHANGE_RETRY) + .execute() +} + impl fmt::Debug for RpcClient { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { fmt.debug_struct("RpcClient") @@ -293,10 +322,47 @@ impl fmt::Debug for RpcClient { const LEADER_CHANGE_RETRY: usize = 10; impl PdClient for RpcClient { - fn load_global_config(&self, list: Vec) -> PdFuture> { - use kvproto::pdpb::LoadGlobalConfigRequest; - let mut req = LoadGlobalConfigRequest::new(); - req.set_names(list.into()); + fn store_global_config( + &self, + config_path: String, + items: Vec, + ) -> PdFuture<()> { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .store_global_config + .start_coarse_timer(); + + let mut req = pdpb::StoreGlobalConfigRequest::new(); + req.set_config_path(config_path); + req.set_changes(items.into()); + let executor = move |client: &Client, req| match client + .inner + .rl() + .client_stub + .store_global_config_async(&req) + { + Ok(grpc_response) => Box::pin(async move { + if let Err(err) = grpc_response.await { + return Err(box_err!("{:?}", err)); + } + Ok(()) + }) as PdFuture<_>, + Err(err) => Box::pin(async move { Err(box_err!("{:?}", err)) }) as PdFuture<_>, + }; + self.pd_client + .request(req, executor, LEADER_CHANGE_RETRY) + .execute() + } + + fn 
load_global_config( + &self, + config_path: String, + ) -> PdFuture<(Vec, i64)> { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .load_global_config + .start_coarse_timer(); + + let mut req = pdpb::LoadGlobalConfigRequest::new(); + req.set_config_path(config_path); let executor = |client: &Client, req| match client .inner .rl() @@ -306,21 +372,20 @@ impl PdClient for RpcClient { { Ok(grpc_response) => Box::pin(async move { match grpc_response.await { - Ok(grpc_response) => { - let mut res = HashMap::with_capacity(grpc_response.get_items().len()); - for c in grpc_response.get_items() { - if c.has_error() { - error!("failed to load global config with key {:?}", c.get_error()); - } else { - res.insert(c.get_name().to_owned(), c.get_value().to_owned()); - } - } - Ok(res) - } + Ok(grpc_response) => Ok(( + Vec::from(grpc_response.get_items()), + grpc_response.get_revision(), + )), Err(err) => Err(box_err!("{:?}", err)), } }) as PdFuture<_>, - Err(err) => Box::pin(async move { Err(box_err!("{:?}", err)) }) as PdFuture<_>, + Err(err) => Box::pin(async move { + Err(box_err!( + "load global config failed, path: '{}', err: {:?}", + req.get_config_path(), + err + )) + }) as PdFuture<_>, }; self.pd_client .request(req, executor, LEADER_CHANGE_RETRY) @@ -329,10 +394,18 @@ impl PdClient for RpcClient { fn watch_global_config( &self, + config_path: String, + revision: i64, ) -> Result> { - use kvproto::pdpb::WatchGlobalConfigRequest; - let req = WatchGlobalConfigRequest::default(); - sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .watch_global_config + .start_coarse_timer(); + + let mut req = pdpb::WatchGlobalConfigRequest::default(); + info!("[global_config] start watch global config"; "path" => &config_path, "revision" => revision); + req.set_config_path(config_path); + req.set_revision(revision); + sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, _| { client.watch_global_config(&req) }) } @@ -347,7 +420,7 @@ impl 
PdClient for RpcClient { region: metapb::Region, ) -> Result> { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["bootstrap_cluster"]) + .bootstrap_cluster .start_coarse_timer(); let mut req = pdpb::BootstrapRequest::default(); @@ -355,8 +428,8 @@ impl PdClient for RpcClient { req.set_store(stores); req.set_region(region); - let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.bootstrap_opt(&req, Self::call_option(&self.pd_client)) + let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.bootstrap_opt(&req, option) })?; check_resp_header(resp.get_header())?; Ok(resp.replication_status.take()) @@ -364,14 +437,14 @@ impl PdClient for RpcClient { fn is_cluster_bootstrapped(&self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["is_cluster_bootstrapped"]) + .is_cluster_bootstrapped .start_coarse_timer(); let mut req = pdpb::IsBootstrappedRequest::default(); req.set_header(self.header()); - let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.is_bootstrapped_opt(&req, Self::call_option(&self.pd_client)) + let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.is_bootstrapped_opt(&req, option) })?; check_resp_header(resp.get_header())?; @@ -379,32 +452,48 @@ impl PdClient for RpcClient { } fn alloc_id(&self) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["alloc_id"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.alloc_id.start_coarse_timer(); let mut req = pdpb::AllocIdRequest::default(); req.set_header(self.header()); - let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.alloc_id_opt(&req, Self::call_option(&self.pd_client)) + let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.alloc_id_opt(&req, option) })?; check_resp_header(resp.get_header())?; - Ok(resp.get_id()) + let id = 
resp.get_id(); + if id == 0 { + return Err(box_err!("pd alloc weird id 0")); + } + Ok(id) } - fn put_store(&self, store: metapb::Store) -> Result> { + fn is_recovering_marked(&self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["put_store"]) + .is_recovering_marked .start_coarse_timer(); + let mut req = pdpb::IsSnapshotRecoveringRequest::default(); + req.set_header(self.header()); + + let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.is_snapshot_recovering_opt(&req, option) + })?; + check_resp_header(resp.get_header())?; + + Ok(resp.get_marked()) + } + + fn put_store(&self, store: metapb::Store) -> Result> { + let _timer = PD_REQUEST_HISTOGRAM_VEC.put_store.start_coarse_timer(); + let mut req = pdpb::PutStoreRequest::default(); req.set_header(self.header()); req.set_store(store); - let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.put_store_opt(&req, Self::call_option(&self.pd_client)) + let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.put_store_opt(&req, option) })?; check_resp_header(resp.get_header())?; @@ -412,16 +501,14 @@ impl PdClient for RpcClient { } fn get_store(&self, store_id: u64) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_store"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_store.start_coarse_timer(); let mut req = pdpb::GetStoreRequest::default(); req.set_header(self.header()); req.set_store_id(store_id); - let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.get_store_opt(&req, Self::call_option(&self.pd_client)) + let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.get_store_opt(&req, option) })?; check_resp_header(resp.get_header())?; @@ -438,16 +525,14 @@ impl PdClient for RpcClient { } fn get_all_stores(&self, exclude_tombstone: bool) -> Result> { - let _timer = 
PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_all_stores"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_all_stores.start_coarse_timer(); let mut req = pdpb::GetAllStoresRequest::default(); req.set_header(self.header()); req.set_exclude_tombstone_stores(exclude_tombstone); - let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.get_all_stores_opt(&req, Self::call_option(&self.pd_client)) + let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.get_all_stores_opt(&req, option) })?; check_resp_header(resp.get_header())?; @@ -456,14 +541,14 @@ impl PdClient for RpcClient { fn get_cluster_config(&self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_cluster_config"]) + .get_cluster_config .start_coarse_timer(); let mut req = pdpb::GetClusterConfigRequest::default(); req.set_header(self.header()); - let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.get_cluster_config_opt(&req, Self::call_option(&self.pd_client)) + let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.get_cluster_config_opt(&req, option) })?; check_resp_header(resp.get_header())?; @@ -488,77 +573,46 @@ impl PdClient for RpcClient { .boxed() } - fn get_region_by_id(&self, region_id: u64) -> PdFuture> { - let timer = Instant::now(); - - let mut req = pdpb::GetRegionByIdRequest::default(); - req.set_header(self.header()); - req.set_region_id(region_id); - - let executor = move |client: &Client, req: pdpb::GetRegionByIdRequest| { - let handler = client - .inner - .rl() - .client_stub - .get_region_by_id_async_opt(&req, Self::call_option(client)) - .unwrap_or_else(|e| { - panic!("fail to request PD {} err {:?}", "get_region_by_id", e) - }); - Box::pin(async move { - let mut resp = handler.await?; - PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region_by_id"]) - 
.observe(duration_to_sec(timer.saturating_elapsed())); - check_resp_header(resp.get_header())?; - if resp.has_region() { - Ok(Some(resp.take_region())) - } else { - Ok(None) - } - }) as PdFuture<_> - }; + fn get_buckets_by_id(&self, region_id: u64) -> PdFuture> { + let header = self.header(); + let pd_client = self.pd_client.clone(); + Box::pin(async move { + let mut resp = get_region_resp_by_id(pd_client, header, region_id).await?; + if resp.has_buckets() { + Ok(Some(resp.take_buckets())) + } else { + Ok(None) + } + }) as PdFuture> + } - self.pd_client - .request(req, executor, LEADER_CHANGE_RETRY) - .execute() + fn get_region_by_id(&self, region_id: u64) -> PdFuture> { + let header = self.header(); + let pd_client = self.pd_client.clone(); + Box::pin(async move { + let mut resp = get_region_resp_by_id(pd_client, header, region_id).await?; + if resp.has_region() { + Ok(Some(resp.take_region())) + } else { + Ok(None) + } + }) } fn get_region_leader_by_id( &self, region_id: u64, ) -> PdFuture> { - let timer = Instant::now(); - - let mut req = pdpb::GetRegionByIdRequest::default(); - req.set_header(self.header()); - req.set_region_id(region_id); - - let executor = move |client: &Client, req: pdpb::GetRegionByIdRequest| { - let handler = client - .inner - .rl() - .client_stub - .get_region_by_id_async_opt(&req, Self::call_option(client)) - .unwrap_or_else(|e| { - panic!("fail to request PD {} err {:?}", "get_region_by_id", e) - }); - Box::pin(async move { - let mut resp = handler.await?; - PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region_by_id"]) - .observe(duration_to_sec(timer.saturating_elapsed())); - check_resp_header(resp.get_header())?; - if resp.has_region() && resp.has_leader() { - Ok(Some((resp.take_region(), resp.take_leader()))) - } else { - Ok(None) - } - }) as PdFuture<_> - }; - - self.pd_client - .request(req, executor, LEADER_CHANGE_RETRY) - .execute() + let header = self.header(); + let pd_client = self.pd_client.clone(); + Box::pin(async 
move { + let mut resp = get_region_resp_by_id(pd_client, header, region_id).await?; + if resp.has_region() && resp.has_leader() { + Ok(Some((resp.take_region(), resp.take_leader()))) + } else { + Ok(None) + } + }) } fn region_heartbeat( @@ -619,6 +673,9 @@ impl PdClient for RpcClient { if last > last_report { last_report = last - 1; } + fail::fail_point!("region_heartbeat_send_failed", |_| { + Err(Error::Grpc(grpcio::Error::RemoteStopped)) + }); Ok((r, WriteFlags::default())) })) .await; @@ -643,7 +700,8 @@ impl PdClient for RpcClient { .expect("expect region heartbeat sender"); let ret = sender .unbounded_send(req) - .map_err(|e| Error::Other(Box::new(e))); + .map_err(|e| Error::StreamDisconnect(e.into_send_error())); + Box::pin(future::ready(ret)) as PdFuture<_> }; @@ -667,18 +725,19 @@ impl PdClient for RpcClient { req.set_region(region); let executor = move |client: &Client, req: pdpb::AskSplitRequest| { - let handler = client - .inner - .rl() - .client_stub - .ask_split_async_opt(&req, Self::call_option(client)) - .unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "ask_split", e)); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .ask_split_async_opt(&req, call_option_inner(&inner)) + .unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "ask_split", e)) + }; Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["ask_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .ask_split + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(resp) }) as PdFuture<_> @@ -702,18 +761,21 @@ impl PdClient for RpcClient { req.set_split_count(count as u32); let executor = move |client: &Client, req: pdpb::AskBatchSplitRequest| { - let handler = client - .inner - .rl() - .client_stub - .ask_batch_split_async_opt(&req, Self::call_option(client)) - .unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "ask_batch_split", e)); + let handler 
= { + let inner = client.inner.rl(); + inner + .client_stub + .ask_batch_split_async_opt(&req, call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "ask_batch_split", e) + }) + }; Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["ask_batch_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .ask_batch_split + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(resp) }) as PdFuture<_> @@ -746,17 +808,20 @@ impl PdClient for RpcClient { } let executor = move |client: &Client, req: pdpb::StoreHeartbeatRequest| { let feature_gate = client.feature_gate.clone(); - let handler = client - .inner - .rl() - .client_stub - .store_heartbeat_async_opt(&req, Self::call_option(client)) - .unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "store_heartbeat", e)); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .store_heartbeat_async_opt(&req, call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "store_heartbeat", e) + }) + }; Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["store_heartbeat"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .store_heartbeat + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; match feature_gate.set_version(resp.get_cluster_version()) { Err(_) => warn!("invalid cluster version: {}", resp.get_cluster_version()), @@ -780,19 +845,20 @@ impl PdClient for RpcClient { req.set_regions(regions.into()); let executor = move |client: &Client, req: pdpb::ReportBatchSplitRequest| { - let handler = client - .inner - .rl() - .client_stub - .report_batch_split_async_opt(&req, Self::call_option(client)) - .unwrap_or_else(|e| { - panic!("fail to request PD {} err {:?}", "report_batch_split", e) - }); + let handler = { + let inner = client.inner.rl(); + inner + 
.client_stub + .report_batch_split_async_opt(&req, call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "report_batch_split", e) + }) + }; Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["report_batch_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .report_batch_split + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(()) }) as PdFuture<_> @@ -804,9 +870,7 @@ impl PdClient for RpcClient { } fn scatter_region(&self, mut region: RegionInfo) -> Result<()> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["scatter_region"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.scatter_region.start_coarse_timer(); let mut req = pdpb::ScatterRegionRequest::default(); req.set_header(self.header()); @@ -816,8 +880,8 @@ impl PdClient for RpcClient { } req.set_region(region.region); - let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.scatter_region_opt(&req, Self::call_option(&self.pd_client)) + let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.scatter_region_opt(&req, option) })?; check_resp_header(resp.get_header()) } @@ -833,20 +897,20 @@ impl PdClient for RpcClient { req.set_header(self.header()); let executor = move |client: &Client, req: pdpb::GetGcSafePointRequest| { - let option = Self::call_option(client); - let handler = client - .inner - .rl() - .client_stub - .get_gc_safe_point_async_opt(&req, option) - .unwrap_or_else(|e| { - panic!("fail to request PD {} err {:?}", "get_gc_saft_point", e) - }); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .get_gc_safe_point_async_opt(&req, call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_gc_saft_point", e) + }) + }; Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - 
.with_label_values(&["get_gc_safe_point"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_gc_safe_point + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(resp.get_safe_point()) }) as PdFuture<_> @@ -862,16 +926,14 @@ impl PdClient for RpcClient { } fn get_operator(&self, region_id: u64) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_operator"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_operator.start_coarse_timer(); let mut req = pdpb::GetOperatorRequest::default(); req.set_header(self.header()); req.set_region_id(region_id); - let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.get_operator_opt(&req, Self::call_option(&self.pd_client)) + let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.get_operator_opt(&req, option) })?; check_resp_header(resp.get_header())?; @@ -879,7 +941,7 @@ impl PdClient for RpcClient { } fn batch_get_tso(&self, count: u32) -> PdFuture { - let begin = Instant::now(); + let timer = Instant::now(); let executor = move |client: &Client, _| { // Remove Box::pin and Compat when GLOBAL_TIMER_HANDLE supports futures 0.3 let ts_fut = Compat::new(Box::pin(client.inner.rl().tso.get_timestamp(count))); @@ -898,8 +960,8 @@ impl PdClient for RpcClient { } })?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["tso"]) - .observe(duration_to_sec(begin.saturating_elapsed())); + .tso + .observe(timer.saturating_elapsed_secs()); Ok(ts) }) as PdFuture<_> }; @@ -914,29 +976,30 @@ impl PdClient for RpcClient { safe_point: TimeStamp, ttl: Duration, ) -> PdFuture<()> { - let begin = Instant::now(); + let timer = Instant::now(); let mut req = pdpb::UpdateServiceGcSafePointRequest::default(); req.set_header(self.header()); req.set_service_id(name.into()); req.set_ttl(ttl.as_secs() as _); req.set_safe_point(safe_point.into_inner()); let executor = move |client: &Client, r: 
pdpb::UpdateServiceGcSafePointRequest| { - let handler = client - .inner - .rl() - .client_stub - .update_service_gc_safe_point_async_opt(&r, Self::call_option(client)) - .unwrap_or_else(|e| { - panic!( - "fail to request PD {} err {:?}", - "update_service_safe_point", e - ) - }); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .update_service_gc_safe_point_async_opt(&r, call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!( + "fail to request PD {} err {:?}", + "update_service_safe_point", e + ) + }) + }; Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["update_service_safe_point"]) - .observe(duration_to_sec(begin.saturating_elapsed())); + .update_service_safe_point + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(()) }) as PdFuture<_> @@ -959,17 +1022,20 @@ impl PdClient for RpcClient { req.set_min_resolved_ts(min_resolved_ts); let executor = move |client: &Client, req: pdpb::ReportMinResolvedTsRequest| { - let handler = client - .inner - .rl() - .client_stub - .report_min_resolved_ts_async_opt(&req, Self::call_option(client)) - .unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "min_resolved_ts", e)); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .report_min_resolved_ts_async_opt(&req, call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "min_resolved_ts", e) + }) + }; Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["min_resolved_ts"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .min_resolved_ts + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(()) }) as PdFuture<_> @@ -1048,7 +1114,7 @@ impl PdClient for RpcClient { .expect("expect region buckets sender"); let ret = sender .unbounded_send(req) - .map_err(|e| Error::Other(Box::new(e))); + .map_err(|e| 
Error::StreamDisconnect(e.into_send_error())); Box::pin(future::ready(ret)) as PdFuture<_> }; @@ -1058,26 +1124,90 @@ impl PdClient for RpcClient { } } -pub struct DummyPdClient { - pub next_ts: TimeStamp, +impl RpcClient { + fn fill_cluster_id_for(&self, header: &mut mpb::RequestHeader) { + header.cluster_id = self.cluster_id; + } } -impl DummyPdClient { - pub fn new() -> DummyPdClient { - DummyPdClient { - next_ts: TimeStamp::zero(), - } +impl MetaStorageClient for RpcClient { + fn get(&self, mut req: Get) -> PdFuture { + let timer = Instant::now(); + self.fill_cluster_id_for(req.inner.mut_header()); + let executor = move |client: &Client, req: GetRequest| { + let handler = { + let inner = client.inner.rl(); + let r = inner + .meta_storage + .get_async_opt(&req, call_option_inner(&inner)); + futures::future::ready(r).err_into().try_flatten() + }; + Box::pin(async move { + fail::fail_point!("meta_storage_get", req.key.ends_with(b"rejectme"), |_| { + Err(super::Error::Grpc(grpcio::Error::RemoteStopped)) + }); + let resp = handler.await?; + PD_REQUEST_HISTOGRAM_VEC + .meta_storage_get + .observe(timer.saturating_elapsed_secs()); + Ok(resp) + }) as _ + }; + + self.pd_client + .request(req.into(), executor, LEADER_CHANGE_RETRY) + .execute() } -} -impl Default for DummyPdClient { - fn default() -> Self { - Self::new() + fn put(&self, mut req: Put) -> PdFuture { + let timer = Instant::now(); + self.fill_cluster_id_for(req.inner.mut_header()); + let executor = move |client: &Client, req: PutRequest| { + let handler = { + let inner = client.inner.rl(); + let r = inner + .meta_storage + .put_async_opt(&req, call_option_inner(&inner)); + futures::future::ready(r).err_into().try_flatten() + }; + Box::pin(async move { + let resp = handler.await?; + PD_REQUEST_HISTOGRAM_VEC + .meta_storage_put + .observe(timer.saturating_elapsed_secs()); + Ok(resp) + }) as _ + }; + + self.pd_client + .request(req.into(), executor, LEADER_CHANGE_RETRY) + .execute() } -} -impl PdClient for 
DummyPdClient { - fn batch_get_tso(&self, _count: u32) -> PdFuture { - Box::pin(future::ok(self.next_ts)) + fn watch(&self, mut req: Watch) -> Self::WatchStream { + let timer = Instant::now(); + self.fill_cluster_id_for(req.inner.mut_header()); + let executor = move |client: &Client, req: WatchRequest| { + let handler = { + let inner = client.inner.rl(); + inner.meta_storage.watch(&req) + }; + Box::pin(async move { + let resp = handler?; + PD_REQUEST_HISTOGRAM_VEC + .meta_storage_watch + .observe(timer.saturating_elapsed_secs()); + Ok(resp.err_into()) + }) as _ + }; + + self.pd_client + .request(req.into(), executor, LEADER_CHANGE_RETRY) + .execute() + .try_flatten_stream() } + + type WatchStream = TryFlattenStream< + PdFuture, crate::Error>>, + >; } diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs new file mode 100644 index 00000000000..11224ad894e --- /dev/null +++ b/components/pd_client/src/client_v2.rs @@ -0,0 +1,1385 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! PD Client V2 +//! +//! In V1, the connection to PD and related states are all shared under a +//! `RwLock`. The maintenance of these states is implemented in a +//! decentralized way: each request will try to rebuild the connection on its +//! own if it encounters a network error. +//! +//! In V2, the responsibility to maintain the connection is moved into one +//! single long-running coroutine, namely [`reconnect_loop`]. Users of the +//! connection subscribe to changes instead of altering it themselves. 
+ +use std::{ + collections::HashMap, + fmt::Debug, + pin::Pin, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, + time::{Duration, Instant as StdInstant}, + u64, +}; + +use fail::fail_point; +use futures::{ + compat::{Compat, Future01CompatExt}, + executor::block_on, + future::FutureExt, + select, + sink::SinkExt, + stream::{Stream, StreamExt}, + task::{Context, Poll}, +}; +use grpcio::{ + CallOption, Channel, ClientDuplexReceiver, ConnectivityState, EnvBuilder, Environment, + Error as GrpcError, Result as GrpcResult, WriteFlags, +}; +use kvproto::{ + metapb, + pdpb::{ + self, GetMembersResponse, PdClient as PdClientStub, RegionHeartbeatRequest, + RegionHeartbeatResponse, ReportBucketsRequest, TsoRequest, TsoResponse, + }, + replication_modepb::{ReplicationStatus, StoreDrAutoSyncStatus}, +}; +use security::SecurityManager; +use tikv_util::{ + box_err, error, info, mpsc::future as mpsc, slow_log, thd_name, time::Instant, + timer::GLOBAL_TIMER_HANDLE, warn, +}; +use tokio::sync::{broadcast, mpsc as tokio_mpsc}; +use txn_types::TimeStamp; + +use super::{ + client::{CLIENT_PREFIX, CQ_COUNT}, + metrics::*, + util::{check_resp_header, PdConnector, TargetInfo}, + Config, Error, FeatureGate, RegionInfo, Result, UnixSecs, + REQUEST_TIMEOUT as REQUEST_TIMEOUT_SEC, +}; +use crate::PdFuture; + +fn request_timeout() -> Duration { + fail_point!("pd_client_v2_request_timeout", |s| { + use std::str::FromStr; + + use tikv_util::config::ReadableDuration; + ReadableDuration::from_str(&s.unwrap()).unwrap().0 + }); + Duration::from_secs(REQUEST_TIMEOUT_SEC) +} + +/// Immutable context for making new connections. +struct ConnectContext { + cfg: Config, + connector: PdConnector, +} + +#[derive(Clone)] +struct RawClient { + stub: PdClientStub, + target_info: TargetInfo, + members: GetMembersResponse, +} + +impl RawClient { + async fn connect(ctx: &ConnectContext) -> Result { + // -1 means the max. 
+ let retries = match ctx.cfg.retry_max_count { + -1 => std::isize::MAX, + v => v.saturating_add(1), + }; + for i in 0..retries { + match ctx.connector.validate_endpoints(&ctx.cfg, false).await { + Ok((stub, target_info, members, _)) => { + return Ok(RawClient { + stub, + target_info, + members, + }); + } + Err(e) => { + if i as usize % ctx.cfg.retry_log_every == 0 { + warn!("validate PD endpoints failed"; "err" => ?e); + } + let _ = GLOBAL_TIMER_HANDLE + .delay(StdInstant::now() + ctx.cfg.retry_interval.0) + .compat() + .await; + } + } + } + Err(box_err!("PD endpoints are invalid")) + } + + /// Returns Ok(true) when a new connection is established. + async fn maybe_reconnect(&mut self, ctx: &ConnectContext, force: bool) -> Result { + PD_RECONNECT_COUNTER_VEC.with_label_values(&["try"]).inc(); + let start = Instant::now(); + + let members = self.members.clone(); + let direct_connected = self.target_info.direct_connected(); + slow_log!(start.saturating_elapsed(), "try reconnect pd"); + let (stub, target_info, members, _) = match ctx + .connector + .reconnect_pd( + members, + direct_connected, + force, + ctx.cfg.enable_forwarding, + false, + ) + .await + { + Err(e) => { + PD_RECONNECT_COUNTER_VEC + .with_label_values(&["failure"]) + .inc(); + return Err(e); + } + Ok(None) => { + PD_RECONNECT_COUNTER_VEC + .with_label_values(&["no-need"]) + .inc(); + return Ok(false); + } + Ok(Some(tuple)) => { + PD_RECONNECT_COUNTER_VEC + .with_label_values(&["success"]) + .inc(); + tuple + } + }; + + fail_point!("pd_client_v2_reconnect", |_| Ok(true)); + + self.stub = stub; + self.target_info = target_info; + self.members = members; + + info!("trying to update PD client done"; "spend" => ?start.saturating_elapsed()); + Ok(true) + } +} + +struct CachedRawClientCore { + context: ConnectContext, + + latest: Mutex, + version: AtomicU64, + on_reconnect_tx: broadcast::Sender<()>, +} + +/// A shared [`RawClient`] with a local copy of cache. 
+pub struct CachedRawClient { + core: Arc, + should_reconnect_tx: broadcast::Sender, + on_reconnect_rx: broadcast::Receiver<()>, + + cache: RawClient, + cache_version: u64, +} + +impl Clone for CachedRawClient { + fn clone(&self) -> Self { + Self { + core: self.core.clone(), + should_reconnect_tx: self.should_reconnect_tx.clone(), + on_reconnect_rx: self.core.on_reconnect_tx.subscribe(), + cache: self.cache.clone(), + cache_version: self.cache_version, + } + } +} + +impl CachedRawClient { + fn new( + cfg: Config, + env: Arc, + security_mgr: Arc, + should_reconnect_tx: broadcast::Sender, + ) -> Self { + let lame_stub = PdClientStub::new(Channel::lame(env.clone(), "0.0.0.0:0")); + let client = RawClient { + stub: lame_stub, + target_info: TargetInfo::new("0.0.0.0:0".to_string(), ""), + members: GetMembersResponse::new(), + }; + let context = ConnectContext { + cfg, + connector: PdConnector::new(env, security_mgr), + }; + let (tx, rx) = broadcast::channel(1); + let core = CachedRawClientCore { + context, + latest: Mutex::new(client.clone()), + version: AtomicU64::new(0), + on_reconnect_tx: tx, + }; + Self { + core: Arc::new(core), + should_reconnect_tx, + on_reconnect_rx: rx, + cache: client, + cache_version: 0, + } + } + + #[inline] + fn refresh_cache(&mut self) -> bool { + if self.cache_version < self.core.version.load(Ordering::Acquire) { + let latest = self.core.latest.lock().unwrap(); + self.cache = (*latest).clone(); + self.cache_version = self.core.version.load(Ordering::Relaxed); + true + } else { + false + } + } + + #[inline] + fn publish_cache(&mut self) { + let latest_version = { + let mut latest = self.core.latest.lock().unwrap(); + *latest = self.cache.clone(); + let v = self.core.version.fetch_add(1, Ordering::Relaxed) + 1; + let _ = self.core.on_reconnect_tx.send(()); + v + }; + debug_assert!(self.cache_version < latest_version); + self.cache_version = latest_version; + } + + #[inline] + async fn wait_for_a_new_client( + rx: &mut 
broadcast::Receiver<()>, + current_version: u64, + latest_version: &AtomicU64, + ) -> bool { + let deadline = StdInstant::now() + request_timeout(); + loop { + if GLOBAL_TIMER_HANDLE + .timeout(Compat::new(Box::pin(rx.recv())), deadline) + .compat() + .await + .is_ok() + { + if current_version < latest_version.load(Ordering::Acquire) { + return true; + } + } else { + return false; + } + } + } + + /// Refreshes the local cache with latest client, then waits for the + /// connection to be ready. + /// The connection must be available if this function returns `Ok(())`. + async fn wait_for_ready(&mut self) -> Result<()> { + self.refresh_cache(); + if self.channel().check_connectivity_state(false) == ConnectivityState::GRPC_CHANNEL_READY { + return Ok(()); + } + select! { + r = self + .cache + .stub + .client + .channel() + .wait_for_connected(request_timeout()) + .fuse() => + { + if r { + return Ok(()); + } + } + r = Self::wait_for_a_new_client( + &mut self.on_reconnect_rx, + self.cache_version, + &self.core.version, + ).fuse() => { + if r { + assert!(self.refresh_cache()); + return Ok(()); + } + } + } + let _ = self.should_reconnect_tx.send(self.cache_version); + Err(box_err!( + "Connection unavailable {:?}", + self.channel().check_connectivity_state(false) + )) + } + + /// Makes the first connection. + async fn connect(&mut self) -> Result<()> { + self.cache = RawClient::connect(&self.core.context).await?; + self.publish_cache(); + Ok(()) + } + + /// Increases global version only when a new connection is established. + /// Might panic if `wait_for_ready` isn't called up-front. + async fn reconnect(&mut self) -> Result { + let force = (|| { + fail_point!("pd_client_force_reconnect", |_| true); + self.channel().check_connectivity_state(true) + == ConnectivityState::GRPC_CHANNEL_SHUTDOWN + })(); + if self + .cache + .maybe_reconnect(&self.core.context, force) + .await? 
+ { + self.publish_cache(); + return Ok(true); + } + Ok(false) + } + + #[inline] + fn check_resp(&mut self, resp: GrpcResult) -> GrpcResult { + if matches!( + resp, + Err(GrpcError::RpcFailure(_) | GrpcError::RemoteStopped | GrpcError::RpcFinished(_)) + ) { + let _ = self.should_reconnect_tx.send(self.cache_version); + } + resp + } + + /// Might panic if `wait_for_ready` isn't called up-front. + #[inline] + fn stub(&self) -> &PdClientStub { + &self.cache.stub + } + + /// Might panic if `wait_for_ready` isn't called up-front. + #[inline] + fn channel(&self) -> &Channel { + self.cache.stub.client.channel() + } + + /// Might panic if `wait_for_ready` isn't called up-front. + #[inline] + fn call_option(&self) -> CallOption { + self.cache.target_info.call_option() + } + + /// Might panic if `wait_for_ready` isn't called up-front. + #[inline] + fn cluster_id(&self) -> u64 { + self.cache.members.get_header().get_cluster_id() + } + + /// Might panic if `wait_for_ready` isn't called up-front. + #[inline] + fn header(&self) -> pdpb::RequestHeader { + let mut header = pdpb::RequestHeader::default(); + header.set_cluster_id(self.cluster_id()); + header + } + + /// Might panic if `wait_for_ready` isn't called up-front. 
+ #[cfg(feature = "testexport")] + #[inline] + fn leader(&self) -> pdpb::Member { + self.cache.members.get_leader().clone() + } + + #[inline] + fn initialized(&self) -> bool { + self.cache_version != 0 + } +} + +async fn reconnect_loop( + mut client: CachedRawClient, + cfg: Config, + mut should_reconnect: broadcast::Receiver, +) { + if let Err(e) = client.connect().await { + error!("failed to connect pd"; "err" => ?e); + return; + } + let backoff = (|| { + fail_point!("pd_client_v2_backoff", |s| { + use std::str::FromStr; + + use tikv_util::config::ReadableDuration; + ReadableDuration::from_str(&s.unwrap()).unwrap().0 + }); + request_timeout() + })(); + let mut last_connect = StdInstant::now(); + loop { + if client.channel().wait_for_connected(request_timeout()).await { + let state = ConnectivityState::GRPC_CHANNEL_READY; + select! { + // Checks for leader change periodically. + _ = client + .channel() + .wait_for_state_change(state, cfg.update_interval.0) + .fuse() => {} + v = should_reconnect.recv().fuse() => { + match v { + Ok(v) if v < client.cache_version => continue, + Ok(_) => {} + Err(broadcast::error::RecvError::Lagged(_)) => continue, + Err(broadcast::error::RecvError::Closed) => break, + } + } + } + } + let target = last_connect + backoff; + if target > StdInstant::now() { + let _ = GLOBAL_TIMER_HANDLE.delay(target).compat().await; + } + last_connect = StdInstant::now(); + if let Err(e) = client.reconnect().await { + warn!("failed to reconnect pd"; "err" => ?e); + } + } +} + +#[derive(Clone)] +pub struct RpcClient { + pub raw_client: CachedRawClient, + feature_gate: FeatureGate, +} + +impl RpcClient { + pub fn new( + cfg: &Config, + shared_env: Option>, + security_mgr: Arc, + ) -> Result { + let env = shared_env.unwrap_or_else(|| { + Arc::new( + EnvBuilder::new() + .cq_count(CQ_COUNT) + .name_prefix(thd_name!(CLIENT_PREFIX)) + .build(), + ) + }); + + // Use broadcast channel for the lagging feature. 
+ let (tx, rx) = broadcast::channel(1); + let raw_client = CachedRawClient::new(cfg.clone(), env, security_mgr, tx); + raw_client + .stub() + .spawn(reconnect_loop(raw_client.clone(), cfg.clone(), rx)); + + Ok(Self { + raw_client, + feature_gate: Default::default(), + }) + } + + #[inline] + pub fn subscribe_reconnect(&self) -> broadcast::Receiver<()> { + self.raw_client.clone().on_reconnect_rx + } + + #[cfg(feature = "testexport")] + pub fn feature_gate(&self) -> &FeatureGate { + &self.feature_gate + } + + #[cfg(feature = "testexport")] + pub fn get_leader(&mut self) -> pdpb::Member { + block_on(self.raw_client.wait_for_ready()).unwrap(); + self.raw_client.leader() + } + + #[cfg(feature = "testexport")] + pub fn reconnect(&mut self) -> Result { + block_on(self.raw_client.wait_for_ready())?; + block_on(self.raw_client.reconnect()) + } + + #[cfg(feature = "testexport")] + pub fn reset_to_lame_client(&mut self) { + let env = self.raw_client.core.context.connector.env.clone(); + let lame = PdClientStub::new(Channel::lame(env, "0.0.0.0:0")); + self.raw_client.core.latest.lock().unwrap().stub = lame.clone(); + self.raw_client.cache.stub = lame; + } + + #[cfg(feature = "testexport")] + pub fn initialized(&self) -> bool { + self.raw_client.initialized() + } +} + +async fn get_region_resp_by_id( + mut raw_client: CachedRawClient, + region_id: u64, +) -> Result { + let timer = Instant::now_coarse(); + let mut req = pdpb::GetRegionByIdRequest::default(); + req.set_region_id(region_id); + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .get_region_by_id_async_opt(&req, raw_client.call_option().timeout(request_timeout())) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_region_by_id", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .get_region_by_id + .observe(timer.saturating_elapsed_secs()); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(resp) +} 
+pub trait PdClient { + type ResponseChannel: Stream>; + + fn create_region_heartbeat_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result<( + mpsc::Sender, + Self::ResponseChannel, + )>; + + fn create_report_region_buckets_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result>; + + fn create_tso_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result<(mpsc::Sender, Self::ResponseChannel)>; + + fn fetch_cluster_id(&mut self) -> Result; + + fn load_global_config(&mut self, config_path: String) -> PdFuture>; + + fn watch_global_config( + &mut self, + ) -> Result>; + + fn bootstrap_cluster( + &mut self, + stores: metapb::Store, + region: metapb::Region, + ) -> Result>; + + fn is_cluster_bootstrapped(&mut self) -> Result; + + fn alloc_id(&mut self) -> Result; + + fn is_recovering_marked(&mut self) -> Result; + + fn put_store(&mut self, store: metapb::Store) -> Result>; + + fn get_store_and_stats(&mut self, store_id: u64) + -> PdFuture<(metapb::Store, pdpb::StoreStats)>; + + fn get_store(&mut self, store_id: u64) -> Result { + block_on(self.get_store_and_stats(store_id)).map(|r| r.0) + } + + fn get_all_stores(&mut self, exclude_tombstone: bool) -> Result>; + + fn get_cluster_config(&mut self) -> Result; + + fn get_region_and_leader( + &mut self, + key: &[u8], + ) -> PdFuture<(metapb::Region, Option)>; + + fn get_region(&mut self, key: &[u8]) -> Result { + block_on(self.get_region_and_leader(key)).map(|r| r.0) + } + + fn get_region_info(&mut self, key: &[u8]) -> Result { + block_on(self.get_region_and_leader(key)).map(|r| RegionInfo::new(r.0, r.1)) + } + + fn get_region_by_id(&mut self, region_id: u64) -> PdFuture>; + + fn get_buckets_by_id(&self, region_id: u64) -> PdFuture>; + + fn get_region_leader_by_id( + &mut self, + region_id: u64, + ) -> PdFuture>; + + fn ask_split(&mut self, region: metapb::Region) -> PdFuture; + + fn ask_batch_split( + &mut self, + region: metapb::Region, + count: usize, + ) -> PdFuture; + + fn 
store_heartbeat( + &mut self, + stats: pdpb::StoreStats, + store_report: Option, + dr_autosync_status: Option, + ) -> PdFuture; + + fn report_batch_split(&mut self, regions: Vec) -> PdFuture<()>; + + fn scatter_region(&mut self, region: RegionInfo) -> Result<()>; + + fn get_gc_safe_point(&mut self) -> PdFuture; + + fn get_operator(&mut self, region_id: u64) -> Result; + + fn update_service_safe_point( + &mut self, + name: String, + safe_point: TimeStamp, + ttl: Duration, + ) -> PdFuture<()>; + + fn report_min_resolved_ts(&mut self, store_id: u64, min_resolved_ts: u64) -> PdFuture<()>; +} + +pub struct CachedDuplexResponse { + latest: tokio_mpsc::Receiver>, + cache: Option>, +} + +impl CachedDuplexResponse { + fn new() -> (tokio_mpsc::Sender>, Self) { + let (tx, rx) = tokio_mpsc::channel(1); + ( + tx, + Self { + latest: rx, + cache: None, + }, + ) + } +} + +impl Stream for CachedDuplexResponse { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + loop { + if let Some(ref mut receiver) = self.cache { + match Pin::new(receiver).poll_next(cx) { + Poll::Ready(Some(Ok(item))) => return Poll::Ready(Some(Ok(item))), + Poll::Pending => return Poll::Pending, + // If it's None or there's error, we need to update receiver. + _ => {} + } + } + + match Pin::new(&mut self.latest).poll_recv(cx) { + Poll::Ready(Some(receiver)) => self.cache = Some(receiver), + Poll::Ready(None) => return Poll::Ready(None), + Poll::Pending => return Poll::Pending, + } + } + } +} + +impl PdClient for RpcClient { + type ResponseChannel = CachedDuplexResponse; + + fn create_region_heartbeat_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result<( + mpsc::Sender, + Self::ResponseChannel, + )> { + // TODO: use bounded channel. 
+ let (tx, rx) = mpsc::unbounded(wake_policy); + let (resp_tx, resp_rx) = CachedDuplexResponse::::new(); + let mut raw_client = self.raw_client.clone(); + let mut requests = Box::pin(rx).map(|r| { + fail::fail_point!("region_heartbeat_send_failed", |_| { + Err(grpcio::Error::RemoteStopped) + }); + Ok((r, WriteFlags::default())) + }); + self.raw_client.stub().spawn(async move { + loop { + if let Err(e) = raw_client.wait_for_ready().await { + warn!("failed to acquire client for RegionHeartbeat stream"; "err" => ?e); + continue; + } + let (mut hb_tx, hb_rx) = raw_client + .stub() + .region_heartbeat_opt(raw_client.call_option()) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "region_heartbeat", e) + }); + if resp_tx.send(hb_rx).await.is_err() { + break; + } + let res = hb_tx.send_all(&mut requests).await; + if res.is_ok() { + // requests are drained. + break; + } else { + let res = raw_client.check_resp(res); + warn!("region heartbeat stream exited"; "res" => ?res); + } + let _ = hb_tx.close().await; + } + }); + Ok((tx, resp_rx)) + } + + fn create_report_region_buckets_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result> { + let (tx, rx) = mpsc::unbounded(wake_policy); + let mut raw_client = self.raw_client.clone(); + let mut requests = Box::pin(rx).map(|r| Ok((r, WriteFlags::default()))); + self.raw_client.stub().spawn(async move { + loop { + if let Err(e) = raw_client.wait_for_ready().await { + warn!("failed to acquire client for ReportRegionBuckets stream"; "err" => ?e); + continue; + } + let (mut bk_tx, bk_rx) = raw_client + .stub() + .report_buckets_opt(raw_client.call_option()) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "report_region_buckets", e) + }); + select! { + send_res = bk_tx.send_all(&mut requests).fuse() => { + if send_res.is_ok() { + // requests are drained. 
+ break; + } else { + let res = raw_client.check_resp(send_res); + warn!("region buckets stream exited: {:?}", res); + } + } + recv_res = bk_rx.fuse() => { + let res = raw_client.check_resp(recv_res); + warn!("region buckets stream exited: {:?}", res); + } + } + let _ = bk_tx.close().await; + } + }); + Ok(tx) + } + + fn create_tso_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result<(mpsc::Sender, Self::ResponseChannel)> { + let (tx, rx) = mpsc::unbounded(wake_policy); + let (resp_tx, resp_rx) = CachedDuplexResponse::::new(); + let mut raw_client = self.raw_client.clone(); + let mut requests = Box::pin(rx).map(|r| Ok((r, WriteFlags::default()))); + self.raw_client.stub().spawn(async move { + loop { + if let Err(e) = raw_client.wait_for_ready().await { + warn!("failed to acquire client for Tso stream"; "err" => ?e); + continue; + } + let (mut tso_tx, tso_rx) = raw_client + .stub() + .tso_opt(raw_client.call_option()) + .unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "tso", e)); + if resp_tx.send(tso_rx).await.is_err() { + break; + } + let res = tso_tx.send_all(&mut requests).await; + if res.is_ok() { + // requests are drained. 
+ break; + } else { + let res = raw_client.check_resp(res); + warn!("tso exited"; "res" => ?res); + } + let _ = tso_tx.close().await; + } + }); + Ok((tx, resp_rx)) + } + + fn load_global_config(&mut self, config_path: String) -> PdFuture> { + use kvproto::pdpb::LoadGlobalConfigRequest; + let mut req = LoadGlobalConfigRequest::new(); + req.set_config_path(config_path); + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + let fut = raw_client.stub().load_global_config_async(&req)?; + match fut.await { + Ok(grpc_response) => { + let mut res = HashMap::with_capacity(grpc_response.get_items().len()); + for c in grpc_response.get_items() { + res.insert(c.get_name().to_owned(), c.get_value().to_owned()); + } + Ok(res) + } + Err(err) => Err(box_err!("{:?}", err)), + } + }) + } + + fn watch_global_config( + &mut self, + ) -> Result> { + let req = pdpb::WatchGlobalConfigRequest::default(); + block_on(self.raw_client.wait_for_ready())?; + Ok(self.raw_client.stub().watch_global_config(&req)?) 
+ } + + fn fetch_cluster_id(&mut self) -> Result { + if !self.raw_client.initialized() { + block_on(self.raw_client.wait_for_ready())?; + } + let id = self.raw_client.cluster_id(); + assert!(id > 0); + Ok(id) + } + + fn bootstrap_cluster( + &mut self, + stores: metapb::Store, + region: metapb::Region, + ) -> Result> { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .bootstrap_cluster + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::BootstrapRequest::default(); + req.set_header(self.raw_client.header()); + req.set_store(stores); + req.set_region(region); + + let resp = self.raw_client.stub().bootstrap_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let mut resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(resp.replication_status.take()) + } + + fn is_cluster_bootstrapped(&mut self) -> Result { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .is_cluster_bootstrapped + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::IsBootstrappedRequest::default(); + req.set_header(self.raw_client.header()); + + let resp = self.raw_client.stub().is_bootstrapped_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp.get_bootstrapped()) + } + + fn alloc_id(&mut self) -> Result { + let _timer = PD_REQUEST_HISTOGRAM_VEC.alloc_id.start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::AllocIdRequest::default(); + req.set_header(self.raw_client.header()); + + let resp = self.raw_client.stub().alloc_id_opt( + &req, + self.raw_client + .call_option() + .timeout(Duration::from_secs(10)), + ); + let resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + let id = resp.get_id(); + if id == 0 { + return Err(box_err!("pd alloc weird id 0")); + } + Ok(id) 
+ } + + fn is_recovering_marked(&mut self) -> Result { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .is_recovering_marked + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::IsSnapshotRecoveringRequest::default(); + req.set_header(self.raw_client.header()); + + let resp = self.raw_client.stub().is_snapshot_recovering_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp.get_marked()) + } + + fn put_store(&mut self, store: metapb::Store) -> Result> { + let _timer = PD_REQUEST_HISTOGRAM_VEC.put_store.start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::PutStoreRequest::default(); + req.set_header(self.raw_client.header()); + req.set_store(store); + + let resp = self.raw_client.stub().put_store_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let mut resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp.replication_status.take()) + } + + fn get_store_and_stats( + &mut self, + store_id: u64, + ) -> PdFuture<(metapb::Store, pdpb::StoreStats)> { + let timer = Instant::now_coarse(); + + let mut req = pdpb::GetStoreRequest::default(); + req.set_store_id(store_id); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .get_store_async_opt(&req, raw_client.call_option().timeout(request_timeout())) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_store_and_stats", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .get_store_and_stats + .observe(timer.saturating_elapsed_secs()); + let mut resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + let store = resp.take_store(); + if store.get_state() != 
metapb::StoreState::Tombstone { + Ok((store, resp.take_stats())) + } else { + Err(Error::StoreTombstone(format!("{:?}", store))) + } + }) + } + + fn get_all_stores(&mut self, exclude_tombstone: bool) -> Result> { + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_all_stores.start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::GetAllStoresRequest::default(); + req.set_header(self.raw_client.header()); + req.set_exclude_tombstone_stores(exclude_tombstone); + + let resp = self.raw_client.stub().get_all_stores_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let mut resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp.take_stores().into()) + } + + fn get_cluster_config(&mut self) -> Result { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .get_cluster_config + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::GetClusterConfigRequest::default(); + req.set_header(self.raw_client.header()); + + let resp = self.raw_client.stub().get_cluster_config_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let mut resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp.take_cluster()) + } + + fn get_region_and_leader( + &mut self, + key: &[u8], + ) -> PdFuture<(metapb::Region, Option)> { + let timer = Instant::now_coarse(); + + let mut req = pdpb::GetRegionRequest::default(); + req.set_region_key(key.to_vec()); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .get_region_async_opt(&req, raw_client.call_option().timeout(request_timeout())) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_region_async_opt", e) + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .get_region + .observe(timer.saturating_elapsed_secs()); + 
let mut resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + let region = if resp.has_region() { + resp.take_region() + } else { + return Err(Error::RegionNotFound(req.region_key)); + }; + let leader = if resp.has_leader() { + Some(resp.take_leader()) + } else { + None + }; + Ok((region, leader)) + }) + } + + fn get_buckets_by_id(&self, region_id: u64) -> PdFuture> { + let pd_client = self.raw_client.clone(); + Box::pin(async move { + let mut resp = get_region_resp_by_id(pd_client, region_id).await?; + if resp.has_buckets() { + Ok(Some(resp.take_buckets())) + } else { + Ok(None) + } + }) + } + + fn get_region_by_id(&mut self, region_id: u64) -> PdFuture> { + let pd_client = self.raw_client.clone(); + Box::pin(async move { + let mut resp = get_region_resp_by_id(pd_client, region_id).await?; + if resp.has_region() { + Ok(Some(resp.take_region())) + } else { + Ok(None) + } + }) + } + + fn get_region_leader_by_id( + &mut self, + region_id: u64, + ) -> PdFuture> { + let pd_client = self.raw_client.clone(); + Box::pin(async move { + let mut resp = get_region_resp_by_id(pd_client, region_id).await?; + if resp.has_region() && resp.has_leader() { + Ok(Some((resp.take_region(), resp.take_leader()))) + } else { + Ok(None) + } + }) + } + + fn ask_split(&mut self, region: metapb::Region) -> PdFuture { + let timer = Instant::now_coarse(); + + let mut req = pdpb::AskSplitRequest::default(); + req.set_region(region); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .ask_split_async_opt(&req, raw_client.call_option().timeout(request_timeout())) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "ask_split", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .ask_split + .observe(timer.saturating_elapsed_secs()); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(resp) + 
}) + } + + fn ask_batch_split( + &mut self, + region: metapb::Region, + count: usize, + ) -> PdFuture { + let timer = Instant::now_coarse(); + + let mut req = pdpb::AskBatchSplitRequest::default(); + req.set_region(region); + req.set_split_count(count as u32); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .ask_batch_split_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "ask_batch_split", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .ask_batch_split + .observe(timer.saturating_elapsed_secs()); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(resp) + }) + } + + fn store_heartbeat( + &mut self, + mut stats: pdpb::StoreStats, + store_report: Option, + dr_autosync_status: Option, + ) -> PdFuture { + let timer = Instant::now_coarse(); + + let mut req = pdpb::StoreHeartbeatRequest::default(); + stats + .mut_interval() + .set_end_timestamp(UnixSecs::now().into_inner()); + req.set_stats(stats); + if let Some(report) = store_report { + req.set_store_report(report); + } + if let Some(status) = dr_autosync_status { + req.set_dr_autosync_status(status); + } + + let mut raw_client = self.raw_client.clone(); + let feature_gate = self.feature_gate.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .store_heartbeat_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "store_heartbeat", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .store_heartbeat + .observe(timer.saturating_elapsed_secs()); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + match 
feature_gate.set_version(resp.get_cluster_version()) { + Err(_) => warn!("invalid cluster version: {}", resp.get_cluster_version()), + Ok(true) => info!("set cluster version to {}", resp.get_cluster_version()), + _ => {} + }; + Ok(resp) + }) + } + + fn report_batch_split(&mut self, regions: Vec) -> PdFuture<()> { + let timer = Instant::now_coarse(); + + let mut req = pdpb::ReportBatchSplitRequest::default(); + req.set_regions(regions.into()); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .report_batch_split_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "report_batch_split", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .report_batch_split + .observe(timer.saturating_elapsed_secs()); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(()) + }) + } + + fn scatter_region(&mut self, mut region: RegionInfo) -> Result<()> { + let _timer = PD_REQUEST_HISTOGRAM_VEC.scatter_region.start_coarse_timer(); + + let mut req = pdpb::ScatterRegionRequest::default(); + req.set_region_id(region.get_id()); + if let Some(leader) = region.leader.take() { + req.set_leader(leader); + } + req.set_region(region.region); + + block_on(self.raw_client.wait_for_ready())?; + req.set_header(self.raw_client.header()); + let resp = self.raw_client.stub().scatter_region_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header()) + } + + fn get_gc_safe_point(&mut self) -> PdFuture { + let timer = Instant::now_coarse(); + + let mut req = pdpb::GetGcSafePointRequest::default(); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); 
+ let resp = raw_client + .stub() + .get_gc_safe_point_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_gc_saft_point", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .get_gc_safe_point + .observe(timer.saturating_elapsed_secs()); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(resp.get_safe_point()) + }) + } + + fn get_operator(&mut self, region_id: u64) -> Result { + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_operator.start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::GetOperatorRequest::default(); + req.set_header(self.raw_client.header()); + req.set_region_id(region_id); + + let resp = self.raw_client.stub().get_operator_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp) + } + + fn update_service_safe_point( + &mut self, + name: String, + safe_point: TimeStamp, + ttl: Duration, + ) -> PdFuture<()> { + let timer = Instant::now_coarse(); + let mut req = pdpb::UpdateServiceGcSafePointRequest::default(); + req.set_service_id(name.into()); + req.set_ttl(ttl.as_secs() as _); + req.set_safe_point(safe_point.into_inner()); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .update_service_gc_safe_point_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!( + "fail to request PD {} err {:?}", + "update_service_safe_point", e + ); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .update_service_safe_point + .observe(timer.saturating_elapsed_secs()); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(()) + }) + } + + fn 
report_min_resolved_ts(&mut self, store_id: u64, min_resolved_ts: u64) -> PdFuture<()> { + let timer = Instant::now_coarse(); + + let mut req = pdpb::ReportMinResolvedTsRequest::default(); + req.set_store_id(store_id); + req.set_min_resolved_ts(min_resolved_ts); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .report_min_resolved_ts_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "min_resolved_ts", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .min_resolved_ts + .observe(timer.saturating_elapsed_secs()); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(()) + }) + } +} diff --git a/components/pd_client/src/config.rs b/components/pd_client/src/config.rs index f11608117e8..a02c2272490 100644 --- a/components/pd_client/src/config.rs +++ b/components/pd_client/src/config.rs @@ -6,8 +6,8 @@ use serde_derive::{Deserialize, Serialize}; use tikv_util::config::ReadableDuration; /// The configuration for a PD Client. /// -/// By default during initialization the client will attempt to reconnect every 300s -/// for infinity, logging only every 10th duplicate error. +/// By default during initialization the client will attempt to reconnect every +/// 300s for infinity, logging only every 10th duplicate error. #[derive(Clone, Serialize, Deserialize, PartialEq, Debug)] #[serde(default)] #[serde(rename_all = "kebab-case")] @@ -24,8 +24,8 @@ pub struct Config { /// /// Default is isize::MAX, represented by -1. pub retry_max_count: isize, - /// If the client observes the same error message on retry, it can repeat the message only - /// every `n` times. + /// If the client observes the same error message on retry, it can repeat + /// the message only every `n` times. /// /// Default is 10. 
Set to 1 to disable this feature. pub retry_log_every: usize, @@ -33,7 +33,8 @@ pub struct Config { /// /// Default is 10m. pub update_interval: ReadableDuration, - /// The switch to support forwarding requests to follower when the network partition problem happens. + /// The switch to support forwarding requests to follower when the network + /// partition problem happens. /// /// Default is false. pub enable_forwarding: bool, diff --git a/components/pd_client/src/errors.rs b/components/pd_client/src/errors.rs index b86edfc6e98..5bacca03354 100644 --- a/components/pd_client/src/errors.rs +++ b/components/pd_client/src/errors.rs @@ -3,6 +3,7 @@ use std::{error, result}; use error_code::{self, ErrorCode, ErrorCodeExt}; +use futures::channel::mpsc::SendError; use thiserror::Error; #[derive(Debug, Error)] @@ -15,6 +16,8 @@ pub enum Error { Incompatible, #[error("{0}")] Grpc(#[from] grpcio::Error), + #[error("{0}")] + StreamDisconnect(#[from] SendError), #[error("unknown error {0:?}")] Other(#[from] Box), #[error("region is not found for key {}", log_wrappers::Value::key(.0))] @@ -23,6 +26,8 @@ pub enum Error { StoreTombstone(String), #[error("global config item {0} not found")] GlobalConfigNotFound(String), + #[error("required watch revision is smaller than current compact/min revision. 
{0:?}")] + DataCompacted(String), } pub type Result = result::Result; @@ -30,8 +35,12 @@ pub type Result = result::Result; impl Error { pub fn retryable(&self) -> bool { match self { - Error::Grpc(_) | Error::Other(_) | Error::ClusterNotBootstrapped(_) => true, - Error::RegionNotFound(_) + Error::Grpc(_) + | Error::ClusterNotBootstrapped(_) + | Error::StreamDisconnect(_) + | Error::DataCompacted(_) => true, + Error::Other(_) + | Error::RegionNotFound(_) | Error::StoreTombstone(_) | Error::GlobalConfigNotFound(_) | Error::ClusterBootstrapped(_) @@ -47,9 +56,11 @@ impl ErrorCodeExt for Error { Error::ClusterNotBootstrapped(_) => error_code::pd::CLUSTER_NOT_BOOTSTRAPPED, Error::Incompatible => error_code::pd::INCOMPATIBLE, Error::Grpc(_) => error_code::pd::GRPC, + Error::StreamDisconnect(_) => error_code::pd::STREAM_DISCONNECT, Error::RegionNotFound(_) => error_code::pd::REGION_NOT_FOUND, Error::StoreTombstone(_) => error_code::pd::STORE_TOMBSTONE, Error::GlobalConfigNotFound(_) => error_code::pd::GLOBAL_CONFIG_NOT_FOUND, + Error::DataCompacted(_) => error_code::pd::DATA_COMPACTED, Error::Other(_) => error_code::pd::UNKNOWN, } } diff --git a/components/pd_client/src/feature_gate.rs b/components/pd_client/src/feature_gate.rs index 64ee3067585..dc8bef853de 100644 --- a/components/pd_client/src/feature_gate.rs +++ b/components/pd_client/src/feature_gate.rs @@ -7,8 +7,8 @@ use std::sync::{ use semver::{SemVerError, Version}; -/// The function assumes only major, minor and patch are considered, and they are -/// all less than u16::MAX, which is 65535. +/// The function assumes only major, minor and patch are considered, and they +/// are all less than u16::MAX, which is 65535. const fn ver_to_val(major: u64, minor: u64, patch: u64) -> u64 { major << 32 | minor << 16 | patch } @@ -45,8 +45,8 @@ impl FeatureGate { /// /// # Safety /// - /// Correctness in FeatureGate depends on monotonic increasing of version number, - /// should use `set_version` instead. 
+ /// Correctness in FeatureGate depends on monotonic increasing of version + /// number, should use `set_version` instead. pub unsafe fn reset_version(&self, version: &str) -> Result<(), SemVerError> { let new = Version::parse(version)?; let val = ver_to_val(new.major, new.minor, new.patch); diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index c68a97f1dec..ba287621272 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -1,8 +1,12 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. + +#![feature(let_chains)] + #[allow(unused_extern_crates)] extern crate tikv_alloc; mod client; +mod client_v2; mod feature_gate; pub mod metrics; mod tso; @@ -10,20 +14,21 @@ mod util; mod config; pub mod errors; -use std::{cmp::Ordering, collections::HashMap, ops::Deref, sync::Arc, time::Duration}; +pub mod meta_storage; +use std::{cmp::Ordering, ops::Deref, sync::Arc, time::Duration}; use futures::future::BoxFuture; -use grpcio::ClientSStreamReceiver; use kvproto::{ metapb, pdpb, replication_modepb::{RegionReplicationStatus, ReplicationStatus, StoreDrAutoSyncStatus}, }; -use pdpb::{QueryStats, WatchGlobalConfigResponse}; +use pdpb::QueryStats; use tikv_util::time::{Instant, UnixSecs}; use txn_types::TimeStamp; pub use self::{ - client::{DummyPdClient, RpcClient}, + client::RpcClient, + client_v2::{PdClient as PdClientV2, RpcClient as RpcClientV2}, config::Config, errors::{Error, Result}, feature_gate::{Feature, FeatureGate}, @@ -148,6 +153,33 @@ impl BucketStat { } } + pub fn from_meta(meta: Arc) -> Self { + let stats = new_bucket_stats(&meta); + Self::new(meta, stats) + } + + pub fn set_meta(&mut self, meta: Arc) { + self.stats = new_bucket_stats(&meta); + self.meta = meta; + } + + pub fn clear_stats(&mut self) { + self.stats = new_bucket_stats(&self.meta); + } + + pub fn merge(&mut self, delta: &BucketStat) { + merge_bucket_stats( + &self.meta.keys, + &mut self.stats, + &delta.meta.keys, + 
&delta.stats, + ); + } + + pub fn add_flows>(&mut self, incoming: &[I], delta_stats: &metapb::BucketStats) { + merge_bucket_stats(&self.meta.keys, &mut self.stats, incoming, delta_stats); + } + pub fn write_key(&mut self, key: &[u8], value_size: u64) { let idx = match util::find_bucket_index(key, &self.meta.keys) { Some(idx) => idx, @@ -196,6 +228,8 @@ impl BucketStat { } pub const INVALID_ID: u64 = 0; +// TODO: Implementation of config registration for each module +pub const RESOURCE_CONTROL_CONFIG_PATH: &str = "resource_group/settings"; /// PdClient communicates with Placement Driver (PD). /// Because now one PD only supports one cluster, so it is no need to pass @@ -204,17 +238,28 @@ pub const INVALID_ID: u64 = 0; /// all the time. pub trait PdClient: Send + Sync { /// Load a list of GlobalConfig - fn load_global_config(&self, _list: Vec) -> PdFuture> { + fn load_global_config( + &self, + _config_path: String, + ) -> PdFuture<(Vec, i64)> { unimplemented!(); } /// Store a list of GlobalConfig - fn store_global_config(&self, _list: HashMap) -> PdFuture<()> { + fn store_global_config( + &self, + _config_path: String, + _items: Vec, + ) -> PdFuture<()> { unimplemented!(); } /// Watching change of GlobalConfig - fn watch_global_config(&self) -> Result> { + fn watch_global_config( + &self, + _config_path: String, + _revision: i64, + ) -> Result> { unimplemented!(); } @@ -224,10 +269,10 @@ pub trait PdClient: Send + Sync { } /// Creates the cluster with cluster ID, node, stores and first Region. - /// If the cluster is already bootstrapped, return ClusterBootstrapped error. - /// When a node starts, if it finds nothing in the node and - /// cluster is not bootstrapped, it begins to create node, stores, first Region - /// and then call bootstrap_cluster to let PD know it. + /// If the cluster is already bootstrapped, return ClusterBootstrapped + /// error. 
When a node starts, if it finds nothing in the node and + /// cluster is not bootstrapped, it begins to create node, stores, first + /// Region and then call bootstrap_cluster to let PD know it. /// It may happen that multi nodes start at same time to try to /// bootstrap, but only one can succeed, while others will fail /// and must remove their created local Region data themselves. @@ -253,6 +298,18 @@ pub trait PdClient: Send + Sync { unimplemented!(); } + /// Returns whether the cluster is marked to start with snapshot recovery. + /// + /// Cluster is marked as recovering data before start up + /// Normally, marker has been set by BR (from now), and tikv have to run in + /// recovery mode. Recovery mode will do + /// 1. update tikv cluster id from pd + /// 2. all peer apply the log to last of the leader peer which has the most + /// log appended. 3. delete data to some point of time (resolved_ts) + fn is_recovering_marked(&self) -> Result { + unimplemented!(); + } + /// Informs PD when the store starts or some store information changes. fn put_store(&self, _store: metapb::Store) -> Result> { unimplemented!(); @@ -263,11 +320,12 @@ pub trait PdClient: Send + Sync { /// - For bootstrapping, PD knows first Region with `bootstrap_cluster`. /// - For changing Peer, PD determines where to add a new Peer in some store /// for this Region. - /// - For Region splitting, PD determines the new Region id and Peer id for the - /// split Region. - /// - For Region merging, PD knows which two Regions will be merged and which Region - /// and Peers will be removed. - /// - For auto-balance, PD determines how to move the Region from one store to another. + /// - For Region splitting, PD determines the new Region id and Peer id for + /// the split Region. + /// - For Region merging, PD knows which two Regions will be merged and + /// which Region and Peers will be removed. + /// - For auto-balance, PD determines how to move the Region from one store + /// to another.
/// Gets store information if it is not a tombstone store. fn get_store(&self, _store_id: u64) -> Result { @@ -315,6 +373,11 @@ pub trait PdClient: Send + Sync { unimplemented!(); } + // Gets Buckets by Region id. + fn get_buckets_by_id(&self, _region_id: u64) -> PdFuture> { + unimplemented!(); + } + /// Gets Region and its leader by Region id. fn get_region_leader_by_id( &self, @@ -380,7 +443,8 @@ pub trait PdClient: Send + Sync { unimplemented!(); } - /// Registers a handler to the client, which will be invoked after reconnecting to PD. + /// Registers a handler to the client, which will be invoked after + /// reconnecting to PD. /// /// Please note that this method should only be called once. fn handle_reconnect(&self, _: F) @@ -409,8 +473,9 @@ pub trait PdClient: Send + Sync { } /// Gets a batch of timestamps from PD. - /// Return a timestamp with (physical, logical), indicating that timestamps allocated are: - /// [Timestamp(physical, logical - count + 1), Timestamp(physical, logical)] + /// Return a timestamp with (physical, logical), indicating that timestamps + /// allocated are: [Timestamp(physical, logical - count + 1), + /// Timestamp(physical, logical)] fn batch_get_tso(&self, _count: u32) -> PdFuture { unimplemented!() } diff --git a/components/pd_client/src/meta_storage.rs b/components/pd_client/src/meta_storage.rs new file mode 100644 index 00000000000..109986665bd --- /dev/null +++ b/components/pd_client/src/meta_storage.rs @@ -0,0 +1,302 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! `meta_storage` is the API set for storing generic KV pairs. +//! It is a trimmed version of the KV service of etcd, along with some metrics. + +use std::{pin::Pin, sync::Arc, task::ready}; + +use futures::{FutureExt, Stream}; +use kvproto::meta_storagepb as pb; +use tikv_util::{box_err, codec}; + +use crate::{Error, PdFuture, Result}; + +/// The etcd INF end key. +/// Unlike TiKV, they have chosen the slice `[0u8]` as the infinity. 
+const INF: [u8; 1] = [0u8]; + +/// A Get request to the meta storage. +#[derive(Clone, Debug)] +pub struct Get { + pub(crate) inner: pb::GetRequest, +} + +impl From for pb::GetRequest { + fn from(value: Get) -> Self { + value.inner + } +} + +impl Get { + /// Create a new get request, querying for exactly one key. + pub fn of(key: impl Into>) -> Self { + let mut inner = pb::GetRequest::default(); + inner.set_key(key.into()); + Self { inner } + } + + /// Enhance the query, make it be able to query the prefix of keys. + /// The prefix is the key passed to the method [`of`](Get::of). + pub fn prefixed(mut self) -> Self { + let mut next = codec::next_prefix_of(self.inner.key.clone()); + if next.is_empty() { + next = INF.to_vec(); + } + self.inner.set_range_end(next); + self + } + + /// Enhance the query, make it be able to query a range of keys. + /// The prefix is the key passed to the method [`of`](Get::of). + pub fn range_to(mut self, to: impl Into>) -> Self { + self.inner.set_range_end(to.into()); + self + } + + /// Specify the revision of the query. + pub fn rev(mut self, rev: i64) -> Self { + self.inner.set_revision(rev); + self + } + + pub fn limit(mut self, limit: i64) -> Self { + self.inner.set_limit(limit); + self + } +} + +/// A Put request to the meta store. +#[derive(Clone, Debug)] +pub struct Put { + pub(crate) inner: pb::PutRequest, +} + +impl Put { + /// Create a put request of the key value. + pub fn of(key: impl Into>, value: impl Into>) -> Self { + let mut inner = pb::PutRequest::default(); + inner.set_key(key.into()); + inner.set_value(value.into()); + Self { inner } + } + + /// Enhance the put request, allow it to return the previous kv pair. 
+ pub fn fetch_prev_kv(mut self) -> Self { + self.inner.prev_kv = true; + self + } +} + +impl From for pb::PutRequest { + fn from(value: Put) -> Self { + value.inner + } +} + +#[derive(Clone, Debug)] +pub struct Watch { + pub(crate) inner: pb::WatchRequest, +} + +impl Watch { + /// Create a watch request for a key. + pub fn of(key: impl Into>) -> Self { + let mut inner = pb::WatchRequest::default(); + inner.set_key(key.into()); + + Self { inner } + } + + /// Enhance the request to allow it watch keys with the same prefix. + pub fn prefixed(mut self) -> Self { + let mut next = codec::next_prefix_of(self.inner.key.clone()); + if next.is_empty() { + next = INF.to_vec(); + } + self.inner.set_range_end(next); + self + } + + /// Enhance the request to allow it watch keys until the range end. + pub fn range_to(mut self, to: impl Into>) -> Self { + self.inner.set_range_end(to.into()); + self + } + + /// Enhance the request to make it watch from a specified revision. + pub fn from_rev(mut self, rev: i64) -> Self { + self.inner.set_start_revision(rev); + self + } +} + +impl From for pb::WatchRequest { + fn from(value: Watch) -> Self { + value.inner + } +} + +/// The descriptor of source (caller) of the requests. +#[derive(Clone, Copy)] +pub enum Source { + LogBackup = 0, +} + +impl std::fmt::Display for Source { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Source::LogBackup => f.write_str("log_backup"), + } + } +} + +/// A wrapper over client which would fill the source field in the header for +/// all requests. 
+#[derive(Clone)] +pub struct Sourced { + inner: S, + source: Source, +} + +impl Sourced { + pub fn new(inner: S, source: Source) -> Self { + Self { inner, source } + } + + fn prepare_header(&self, h: &mut pb::RequestHeader) { + h.set_source(self.source.to_string()); + } +} + +impl MetaStorageClient for Sourced { + type WatchStream = S::WatchStream; + + fn get(&self, mut req: Get) -> PdFuture { + self.prepare_header(req.inner.mut_header()); + self.inner.get(req) + } + + fn put(&self, mut req: Put) -> PdFuture { + self.prepare_header(req.inner.mut_header()); + self.inner.put(req) + } + + fn watch(&self, mut req: Watch) -> Self::WatchStream { + self.prepare_header(req.inner.mut_header()); + self.inner.watch(req) + } +} + +/// A wrapper that makes every response and stream event get checked. +/// When there is an error in the header, this client would return a [`Err`] +/// variant directly. +#[derive(Clone)] +pub struct Checked(S); + +impl Checked { + pub fn new(client: S) -> Self { + Self(client) + } +} + +/// A wrapper that checks every event in the stream and returns an error +/// variant when there is error in the header. +pub struct CheckedStream(S); + +fn check_resp_header(header: &pb::ResponseHeader) -> Result<()> { + if header.has_error() { + match header.get_error().get_type() { + pb::ErrorType::Ok => Ok(()), + pb::ErrorType::Unknown => Err(Error::Other(box_err!( + "{}", + header.get_error().get_message() + ))), + pb::ErrorType::DataCompacted => Err(Error::DataCompacted( + header.get_error().get_message().to_owned(), + )), + }?; + } + Ok(()) +} + +impl>> Stream for CheckedStream { + type Item = Result; + + fn poll_next( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + // SAFETY: trivial projection. 
+ let inner = unsafe { Pin::new_unchecked(&mut self.get_unchecked_mut().0) }; + let item = ready!(inner.poll_next(cx)); + item.map(|r| { + r.and_then(|resp| { + check_resp_header(resp.get_header())?; + Ok(resp) + }) + }) + .into() + } +} + +impl MetaStorageClient for Checked { + type WatchStream = CheckedStream; + + fn get(&self, req: Get) -> PdFuture { + self.0 + .get(req) + .map(|resp| { + resp.and_then(|r| { + check_resp_header(r.get_header())?; + Ok(r) + }) + }) + .boxed() + } + + fn put(&self, req: Put) -> PdFuture { + self.0 + .put(req) + .map(|resp| { + resp.and_then(|r| { + check_resp_header(r.get_header())?; + Ok(r) + }) + }) + .boxed() + } + + fn watch(&self, req: Watch) -> Self::WatchStream { + CheckedStream(self.0.watch(req)) + } +} + +impl MetaStorageClient for Arc { + type WatchStream = S::WatchStream; + + fn get(&self, req: Get) -> PdFuture { + Arc::as_ref(self).get(req) + } + + fn put(&self, req: Put) -> PdFuture { + Arc::as_ref(self).put(req) + } + + fn watch(&self, req: Watch) -> Self::WatchStream { + Arc::as_ref(self).watch(req) + } +} + +/// A client which is able to play with the `meta_storage` service. +pub trait MetaStorageClient: Send + Sync + 'static { + // Note: Perhaps we'd better make it generic over response here, however that + // would make `CheckedStream` impossible(How can we check ALL types? Or we may + // make traits like `MetaStorageResponse` and constraint over the T), thankfully + // there is only one streaming RPC in this service. + /// The stream that yielded by the watch RPC. 
+ type WatchStream: Stream>; + + fn get(&self, req: Get) -> PdFuture; + fn put(&self, req: Put) -> PdFuture; + fn watch(&self, req: Watch) -> Self::WatchStream; +} diff --git a/components/pd_client/src/metrics.rs b/components/pd_client/src/metrics.rs index 57879a57d0e..e1f1100444a 100644 --- a/components/pd_client/src/metrics.rs +++ b/components/pd_client/src/metrics.rs @@ -2,14 +2,56 @@ use lazy_static::lazy_static; use prometheus::*; +use prometheus_static_metric::{make_static_metric, register_static_histogram_vec}; + +make_static_metric! { + pub label_enum PDRequestEventType { + get_region, + get_region_by_id, + get_region_leader_by_id, + scatter_region, + get_store, + get_store_async, + put_store, + get_all_stores, + get_store_and_stats, + store_global_config, + load_global_config, + watch_global_config, + bootstrap_cluster, + is_cluster_bootstrapped, + get_cluster_config, + ask_split, + ask_batch_split, + report_batch_split, + get_gc_safe_point, + update_service_safe_point, + min_resolved_ts, + get_operator, + alloc_id, + is_recovering_marked, + store_heartbeat, + tso, + + meta_storage_put, + meta_storage_get, + meta_storage_watch, + } + + pub struct PDRequestEventHistogramVec: Histogram { + "type" => PDRequestEventType, + } +} lazy_static! 
{ - pub static ref PD_REQUEST_HISTOGRAM_VEC: HistogramVec = register_histogram_vec!( - "tikv_pd_request_duration_seconds", - "Bucketed histogram of PD requests duration", - &["type"] - ) - .unwrap(); + pub static ref PD_REQUEST_HISTOGRAM_VEC: PDRequestEventHistogramVec = + register_static_histogram_vec!( + PDRequestEventHistogramVec, + "tikv_pd_request_duration_seconds", + "Bucketed histogram of PD requests duration", + &["type"] + ) + .unwrap(); pub static ref PD_HEARTBEAT_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( "tikv_pd_heartbeat_message_total", "Total number of PD heartbeat messages.", diff --git a/components/pd_client/src/tso.rs b/components/pd_client/src/tso.rs index ff951a3c77c..feec5061a8c 100644 --- a/components/pd_client/src/tso.rs +++ b/components/pd_client/src/tso.rs @@ -3,13 +3,15 @@ //! This module is the low-level mechanisms for getting timestamps from a PD //! cluster. It should be used via the `get_tso` API in `PdClient`. //! -//! Once a `TimestampOracle` is created, there will be two futures running in a background working -//! thread created automatically. The `get_timestamp` method creates a oneshot channel whose -//! transmitter is served as a `TimestampRequest`. `TimestampRequest`s are sent to the working -//! thread through a bounded multi-producer, single-consumer channel. Every time the first future -//! is polled, it tries to exhaust the channel to get as many requests as possible and sends a -//! single `TsoRequest` to the PD server. The other future receives `TsoResponse`s from the PD -//! server and allocates timestamps for the requests. +//! Once a `TimestampOracle` is created, there will be two futures running in a +//! background working thread created automatically. The `get_timestamp` method +//! creates a oneshot channel whose transmitter is served as a +//! `TimestampRequest`. `TimestampRequest`s are sent to the working thread +//! through a bounded multi-producer, single-consumer channel. Every time the +//! 
first future is polled, it tries to exhaust the channel to get as many +//! requests as possible and sends a single `TsoRequest` to the PD server. The +//! other future receives `TsoResponse`s from the PD server and allocates +//! timestamps for the requests. use std::{cell::RefCell, collections::VecDeque, pin::Pin, rc::Rc, thread}; @@ -21,7 +23,7 @@ use futures::{ }; use grpcio::{CallOption, WriteFlags}; use kvproto::pdpb::{PdClient, TsoRequest, TsoResponse}; -use tikv_util::{box_err, info}; +use tikv_util::{box_err, info, sys::thread::StdThreadBuildWrapper}; use tokio::sync::{mpsc, oneshot, watch}; use txn_types::TimeStamp; @@ -37,13 +39,14 @@ struct TimestampRequest { count: u32, } -/// The timestamp oracle (TSO) which provides monotonically increasing timestamps. +/// The timestamp oracle (TSO) which provides monotonically increasing +/// timestamps. pub struct TimestampOracle { - /// The transmitter of a bounded channel which transports requests of getting a single - /// timestamp to the TSO working thread. A bounded channel is used to prevent using - /// too much memory unexpectedly. - /// In the working thread, the `TimestampRequest`, which is actually a one channel sender, - /// is used to send back the timestamp result. + /// The transmitter of a bounded channel which transports requests of + /// getting a single timestamp to the TSO working thread. A bounded + /// channel is used to prevent using too much memory unexpectedly. + /// In the working thread, the `TimestampRequest`, which is actually a one + /// channel sender, is used to send back the timestamp result. 
request_tx: mpsc::Sender, close_rx: watch::Receiver<()>, } @@ -61,7 +64,7 @@ impl TimestampOracle { // Start a background thread to handle TSO requests and responses thread::Builder::new() .name("tso-worker".into()) - .spawn(move || { + .spawn_wrapper(move || { block_on(run_tso( cluster_id, rpc_sender.sink_err_into(), @@ -113,12 +116,14 @@ async fn run_tso( mut request_rx: mpsc::Receiver, close_tx: watch::Sender<()>, ) { - // The `TimestampRequest`s which are waiting for the responses from the PD server + // The `TimestampRequest`s which are waiting for the responses from the PD + // server let pending_requests = Rc::new(RefCell::new(VecDeque::with_capacity(MAX_PENDING_COUNT))); - // When there are too many pending requests, the `send_request` future will refuse to fetch - // more requests from the bounded channel. This waker is used to wake up the sending future - // if the queue containing pending requests is no longer full. + // When there are too many pending requests, the `send_request` future will + // refuse to fetch more requests from the bounded channel. This waker is + // used to wake up the sending future if the queue containing pending + // requests is no longer full. let sending_future_waker = Rc::new(AtomicWaker::new()); let mut request_stream = TsoRequestStream { @@ -139,8 +144,8 @@ async fn run_tso( while let Some(Ok(resp)) = rpc_receiver.next().await { let mut pending_requests = pending_requests.borrow_mut(); - // Wake up the sending future blocked by too many pending requests as we are consuming - // some of them here. + // Wake up the sending future blocked by too many pending requests as we are + // consuming some of them here. 
if pending_requests.len() >= MAX_PENDING_COUNT { sending_future_waker.wake(); } @@ -175,40 +180,41 @@ impl<'a> Stream for TsoRequestStream<'a> { fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let pending_requests = self.pending_requests.clone(); let mut pending_requests = pending_requests.borrow_mut(); - let mut requests = Vec::new(); - while requests.len() < MAX_BATCH_SIZE && pending_requests.len() < MAX_PENDING_COUNT { - match self.request_rx.poll_recv(cx) { - Poll::Ready(Some(sender)) => { - requests.push(sender); + if pending_requests.len() < MAX_PENDING_COUNT { + let mut requests = Vec::new(); + while requests.len() < MAX_BATCH_SIZE { + match self.request_rx.poll_recv(cx) { + Poll::Ready(Some(sender)) => { + requests.push(sender); + } + Poll::Ready(None) if requests.is_empty() => { + return Poll::Ready(None); + } + _ => break, } - Poll::Ready(None) if requests.is_empty() => { - return Poll::Ready(None); - } - _ => break, + } + if !requests.is_empty() { + let mut req = TsoRequest::default(); + req.mut_header().cluster_id = self.cluster_id; + req.count = requests.iter().map(|r| r.count).sum(); + + let request_group = RequestGroup { + tso_request: req.clone(), + requests, + }; + pending_requests.push_back(request_group); + PD_PENDING_TSO_REQUEST_GAUGE.set(pending_requests.len() as i64); + + let write_flags = WriteFlags::default().buffer_hint(false); + return Poll::Ready(Some((req, write_flags))); } } - if !requests.is_empty() { - let mut req = TsoRequest::default(); - req.mut_header().cluster_id = self.cluster_id; - req.count = requests.iter().map(|r| r.count).sum(); - - let request_group = RequestGroup { - tso_request: req.clone(), - requests, - }; - pending_requests.push_back(request_group); - PD_PENDING_TSO_REQUEST_GAUGE.set(pending_requests.len() as i64); - - let write_flags = WriteFlags::default().buffer_hint(false); - Poll::Ready(Some((req, write_flags))) - } else { - // Set the waker to the context, then the stream can be waked up 
after the pending queue - // is no longer full. - self.self_waker.register(cx.waker()); - Poll::Pending - } + // Set the waker to the context, then the stream can be waked up after the + // pending queue is no longer full. + self.self_waker.register(cx.waker()); + Poll::Pending } } @@ -216,9 +222,9 @@ fn allocate_timestamps( resp: &TsoResponse, pending_requests: &mut VecDeque, ) -> Result<()> { - // PD returns the timestamp with the biggest logical value. We can send back timestamps - // whose logical value is from `logical - count + 1` to `logical` using the senders - // in `pending`. + // PD returns the timestamp with the biggest logical value. We can send back + // timestamps whose logical value is from `logical - count + 1` to `logical` + // using the senders in `pending`. let tail_ts = resp .timestamp .as_ref() diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index 5ec629aacdb..f3a8451f321 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -22,6 +22,7 @@ use grpcio::{ Environment, Error::RpcFailure, MetadataBuilder, Result as GrpcResult, RpcStatusCode, }; use kvproto::{ + meta_storagepb::MetaStorageClient as MetaStorageStub, metapb::BucketStats, pdpb::{ ErrorType, GetMembersRequest, GetMembersResponse, Member, PdClient as PdClientStub, @@ -43,20 +44,22 @@ use super::{ const RETRY_INTERVAL: Duration = Duration::from_secs(1); // 1s const MAX_RETRY_TIMES: u64 = 5; -// The max duration when retrying to connect to leader. No matter if the MAX_RETRY_TIMES is reached. +// The max duration when retrying to connect to leader. No matter if the +// MAX_RETRY_TIMES is reached. const MAX_RETRY_DURATION: Duration = Duration::from_secs(10); // FIXME: Use a request-independent way to handle reconnection. 
const GLOBAL_RECONNECT_INTERVAL: Duration = Duration::from_millis(100); // 0.1s pub const REQUEST_RECONNECT_INTERVAL: Duration = Duration::from_secs(1); // 1s +#[derive(Clone)] pub struct TargetInfo { target_url: String, via: String, } impl TargetInfo { - fn new(target_url: String, via: &str) -> TargetInfo { + pub(crate) fn new(target_url: String, via: &str) -> TargetInfo { TargetInfo { target_url, via: trim_http_prefix(via).to_string(), @@ -102,6 +105,7 @@ pub struct Inner { pub pending_heartbeat: Arc, pub pending_buckets: Arc, pub tso: TimestampOracle, + pub meta_storage: MetaStorageStub, last_try_reconnect: Instant, } @@ -179,6 +183,8 @@ impl Client { let (buckets_tx, buckets_resp) = client_stub .report_buckets_opt(target.call_option()) .unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "report_buckets", e)); + let meta_storage = + kvproto::meta_storagepb::MetaStorageClient::new(client_stub.client.channel().clone()); Client { timer: GLOBAL_TIMER_HANDLE.clone(), inner: RwLock::new(Inner { @@ -196,6 +202,7 @@ impl Client { pending_buckets: Arc::default(), last_try_reconnect: Instant::now(), tso, + meta_storage, }), feature_gate: FeatureGate::default(), enable_forwarding, @@ -236,6 +243,7 @@ impl Client { inner.buckets_sender = Either::Left(Some(buckets_tx)); inner.buckets_resp = Some(buckets_resp); + inner.meta_storage = MetaStorageStub::new(client_stub.client.channel().clone()); inner.client_stub = client_stub; inner.members = members; inner.tso = tso; @@ -317,7 +325,8 @@ impl Client { /// Re-establishes connection with PD leader in asynchronized fashion. /// /// If `force` is false, it will reconnect only when members change. - /// Note: Retrying too quickly will return an error due to cancellation. Please always try to reconnect after sending the request first. + /// Note: Retrying too quickly will return an error due to cancellation. + /// Please always try to reconnect after sending the request first. 
pub async fn reconnect(&self, force: bool) -> Result<()> { PD_RECONNECT_COUNTER_VEC.with_label_values(&["try"]).inc(); let start = Instant::now(); @@ -338,7 +347,13 @@ impl Client { async move { let direct_connected = self.inner.rl().target_info().direct_connected(); connector - .reconnect_pd(members, direct_connected, force, self.enable_forwarding) + .reconnect_pd( + members, + direct_connected, + force, + self.enable_forwarding, + true, + ) .await } }; @@ -381,7 +396,7 @@ impl Client { fail_point!("pd_client_reconnect", |_| Ok(())); - self.update_client(client, target_info, members, tso); + self.update_client(client, target_info, members, tso.unwrap()); info!("trying to update PD client done"; "spend" => ?start.saturating_elapsed()); Ok(()) } @@ -470,18 +485,30 @@ where } } +pub fn call_option_inner(inner: &Inner) -> CallOption { + inner + .target_info() + .call_option() + .timeout(Duration::from_secs(REQUEST_TIMEOUT)) +} + /// Do a request in synchronized fashion. pub fn sync_request(client: &Client, mut retry: usize, func: F) -> Result where - F: Fn(&PdClientStub) -> GrpcResult, + F: Fn(&PdClientStub, CallOption) -> GrpcResult, { loop { let ret = { - // Drop the read lock immediately to prevent the deadlock between the caller thread - // which may hold the read lock and wait for PD client thread completing the request - // and the PD client thread which may block on acquiring the write lock. - let client_stub = client.inner.rl().client_stub.clone(); - func(&client_stub).map_err(Error::Grpc) + // Drop the read lock immediately to prevent the deadlock between the caller + // thread which may hold the read lock and wait for PD client thread + // completing the request and the PD client thread which may block + // on acquiring the write lock. 
+ let (client_stub, option) = { + let inner = client.inner.rl(); + (inner.client_stub.clone(), call_option_inner(&inner)) + }; + + func(&client_stub, option).map_err(Error::Grpc) }; match ret { Ok(r) => { @@ -507,11 +534,13 @@ pub type StubTuple = ( PdClientStub, TargetInfo, GetMembersResponse, - TimestampOracle, + // Only used by RpcClient, not by RpcClientV2. + Option, ); +#[derive(Clone)] pub struct PdConnector { - env: Arc, + pub(crate) env: Arc, security_mgr: Arc, } @@ -520,7 +549,7 @@ impl PdConnector { PdConnector { env, security_mgr } } - pub async fn validate_endpoints(&self, cfg: &Config) -> Result { + pub async fn validate_endpoints(&self, cfg: &Config, build_tso: bool) -> Result { let len = cfg.endpoints.len(); let mut endpoints_set = HashSet::with_capacity_and_hasher(len, Default::default()); let mut members = None; @@ -561,7 +590,7 @@ impl PdConnector { match members { Some(members) => { let res = self - .reconnect_pd(members, true, true, cfg.enable_forwarding) + .reconnect_pd(members, true, true, cfg.enable_forwarding, build_tso) .await? 
.unwrap(); info!("all PD endpoints are consistent"; "endpoints" => ?cfg.endpoints); @@ -579,10 +608,18 @@ impl PdConnector { .max_send_message_len(-1) .max_receive_message_len(-1) .keepalive_time(Duration::from_secs(10)) - .keepalive_timeout(Duration::from_secs(3)); + .keepalive_timeout(Duration::from_secs(3)) + .max_reconnect_backoff(Duration::from_secs(5)) + .initial_reconnect_backoff(Duration::from_secs(1)); self.security_mgr.connect(cb, addr_trim) }; - let client = PdClientStub::new(channel); + fail_point!("cluster_id_is_not_ready", |_| { + Ok(( + PdClientStub::new(channel.clone()), + GetMembersResponse::default(), + )) + }); + let client = PdClientStub::new(channel.clone()); let option = CallOption::default().timeout(Duration::from_secs(REQUEST_TIMEOUT)); let response = client .get_members_async_opt(&GetMembersRequest::default(), option) @@ -594,6 +631,13 @@ impl PdConnector { } } + // load_members returns the PD members by calling getMember, there are two + // abnormal scenes for the response: + // 1. header has an error: the PD is not ready to serve. + // 2. cluster id is zero: etcd start server but the follower did not get + // cluster id yet. + // In this case, load_members should return an error, so the client + // will not update client address. pub async fn load_members(&self, previous: &GetMembersResponse) -> Result { let previous_leader = previous.get_leader(); let members = previous.get_members(); @@ -608,17 +652,30 @@ impl PdConnector { for ep in m.get_client_urls() { match self.connect(ep.as_str()).await { Ok((_, r)) => { - let new_cluster_id = r.get_header().get_cluster_id(); - if new_cluster_id == cluster_id { - // check whether the response have leader info, otherwise continue to loop the rest members - if r.has_leader() { - return Ok(r); - } + let header = r.get_header(); + // Try next follower endpoint if the cluster is not ready since this pr: + // pd#5412.
+ if let Err(e) = check_resp_header(header) { + error!("connect pd failed";"endpoints" => ep, "error" => ?e); } else { - panic!( - "{} no longer belongs to cluster {}, it is in {}", - ep, cluster_id, new_cluster_id - ); + let new_cluster_id = header.get_cluster_id(); + // it is new cluster if the new cluster id is zero. + if cluster_id == 0 || new_cluster_id == cluster_id { + // check whether the response have leader info, otherwise continue + // to loop the rest members + if r.has_leader() { + return Ok(r); + } + // Try next endpoint if PD server returns the + // cluster id is zero without any error. + } else if new_cluster_id == 0 { + error!("{} connect success, but cluster id is not ready", ep); + } else { + panic!( + "{} no longer belongs to cluster {}, it is in {}", + ep, cluster_id, new_cluster_id + ); + } } } Err(e) => { @@ -635,15 +692,18 @@ impl PdConnector { } // There are 3 kinds of situations we will return the new client: - // 1. the force is true which represents the client is newly created or the original connection has some problem - // 2. the previous forwarded host is not empty and it can connect the leader now which represents the network partition problem to leader may be recovered - // 3. the member information of PD has been changed - async fn reconnect_pd( + // 1. the force is true which represents the client is newly created or the + // original connection has some problem 2. the previous forwarded host is + // not empty and it can connect the leader now which represents the network + // partition problem to leader may be recovered 3. 
the member information of + // PD has been changed + pub async fn reconnect_pd( &self, members_resp: GetMembersResponse, direct_connected: bool, force: bool, enable_forwarding: bool, + build_tso: bool, ) -> Result> { let resp = self.load_members(&members_resp).await?; let leader = resp.get_leader(); @@ -657,11 +717,15 @@ impl PdConnector { match res { Some((client, target_url)) => { let info = TargetInfo::new(target_url, ""); - let tso = TimestampOracle::new( - resp.get_header().get_cluster_id(), - &client, - info.call_option(), - )?; + let tso = if build_tso { + Some(TimestampOracle::new( + resp.get_header().get_cluster_id(), + &client, + info.call_option(), + )?) + } else { + None + }; return Ok(Some((client, info, resp, tso))); } None => { @@ -672,11 +736,15 @@ impl PdConnector { } if enable_forwarding && has_network_error { if let Ok(Some((client, info))) = self.try_forward(members, leader).await { - let tso = TimestampOracle::new( - resp.get_header().get_cluster_id(), - &client, - info.call_option(), - )?; + let tso = if build_tso { + Some(TimestampOracle::new( + resp.get_header().get_cluster_id(), + &client, + info.call_option(), + )?) 
+ } else { + None + }; return Ok(Some((client, info, resp, tso))); } } @@ -732,7 +800,9 @@ impl PdConnector { loop { let (res, has_network_err) = self.connect_member(leader).await?; match res { - Some((client, ep, _)) => return Ok((Some((client, ep)), has_network_err)), + Some((client, ep, _)) => { + return Ok((Some((client, ep)), has_network_err)); + } None => { if has_network_err && retry_times > 0 @@ -806,11 +876,14 @@ pub fn check_resp_header(header: &ResponseHeader) -> Result<()> { ErrorType::IncompatibleVersion => Err(Error::Incompatible), ErrorType::StoreTombstone => Err(Error::StoreTombstone(err.get_message().to_owned())), ErrorType::RegionNotFound => Err(Error::RegionNotFound(vec![])), - ErrorType::Unknown => Err(box_err!(err.get_message())), ErrorType::GlobalConfigNotFound => { Err(Error::GlobalConfigNotFound(err.get_message().to_owned())) } + ErrorType::DataCompacted => Err(Error::DataCompacted(err.get_message().to_owned())), ErrorType::Ok => Ok(()), + ErrorType::DuplicatedEntry | ErrorType::EntryNotFound => Err(box_err!(err.get_message())), + ErrorType::Unknown => Err(box_err!(err.get_message())), + ErrorType::InvalidValue => Err(box_err!(err.get_message())), } } @@ -844,8 +917,9 @@ pub fn find_bucket_index>(key: &[u8], bucket_keys: &[S]) -> Optio ) } -/// Merge incoming bucket stats. If a range in new buckets overlaps with multiple ranges in -/// current buckets, stats of the new range will be added to all stats of current ranges. +/// Merge incoming bucket stats. If a range in new buckets overlaps with +/// multiple ranges in current buckets, stats of the new range will be added to +/// all stats of current ranges. 
pub fn merge_bucket_stats, I: AsRef<[u8]>>( cur: &[C], cur_stats: &mut BucketStats, diff --git a/components/profiler/Cargo.toml b/components/profiler/Cargo.toml index f0879722b1b..e5583a631d5 100644 --- a/components/profiler/Cargo.toml +++ b/components/profiler/Cargo.toml @@ -8,7 +8,7 @@ publish = false profiling = ["lazy_static", "gperftools", "callgrind", "valgrind_request"] [dependencies] -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } [target.'cfg(unix)'.dependencies] lazy_static = { version = "1.3.0", optional = true } @@ -18,4 +18,5 @@ valgrind_request = { version = "1.1.0", optional = true } [[example]] name = "prime" +path = "examples/prime.rs" required-features = ["profiling"] diff --git a/components/profiler/examples/prime.rs b/components/profiler/examples/prime.rs index fa54b2b2658..ede351acea5 100644 --- a/components/profiler/examples/prime.rs +++ b/components/profiler/examples/prime.rs @@ -24,7 +24,8 @@ //! valgrind --tool=callgrind --instr-atstart=no ../../target/debug/examples/prime //! ``` //! -//! You must not run example via `valgrind cargo run ...`. The framework won't detect Callgrind! +//! You must not run example via `valgrind cargo run ...`. The framework won't +//! detect Callgrind! #[inline(never)] fn is_prime_number(v: usize, prime_numbers: &[usize]) -> bool { diff --git a/components/profiler/src/lib.rs b/components/profiler/src/lib.rs index e3ea0d43a6a..2734d8f7877 100644 --- a/components/profiler/src/lib.rs +++ b/components/profiler/src/lib.rs @@ -30,11 +30,12 @@ //! //! Then, compile the code with `profiling` feature enabled. //! -//! By default, a profile called `app.profile` will be generated by CPU Profiler. -//! You can then analyze the profile using [pprof](https://github.com/google/pprof). +//! By default, a profile called `app.profile` will be generated by CPU +//! Profiler. You can then analyze the profile using [pprof](https://github.com/google/pprof). //! -//! 
If the application is running in Callgrind, a Callgrind profile dump will be generated instead. -//! Notice that you should run Callgrind with command line option `--instr-atstart=no`, e.g.: +//! If the application is running in Callgrind, a Callgrind profile dump will be +//! generated instead. Notice that you should run Callgrind with command line +//! option `--instr-atstart=no`, e.g.: //! //! ```bash //! valgrind --tool=callgrind --instr-atstart=no ./my_example diff --git a/components/profiler/src/profiler_unix.rs b/components/profiler/src/profiler_unix.rs index 822b89619a9..c53f32b3b44 100644 --- a/components/profiler/src/profiler_unix.rs +++ b/components/profiler/src/profiler_unix.rs @@ -16,14 +16,15 @@ lazy_static::lazy_static! { static ref ACTIVE_PROFILER: Mutex = Mutex::new(Profiler::None); } -/// Start profiling. Returns false if failed, i.e. there is already a profiling in progress. +/// Start profiling. Returns false if failed, i.e. there is already a profiling +/// in progress. /// -/// When `profiling` feature is not enabled, this function will do nothing and there is totally -/// zero cost. +/// When `profiling` feature is not enabled, this function will do nothing and +/// there is totally zero cost. /// /// When running in Callgrind, Callgrind instrumentation will be started -/// (`CALLGRIND_START_INSTRUMENTATION`). Otherwise, the CPU Profiler will be started and profile -/// will be generated to the file specified by `name`. +/// (`CALLGRIND_START_INSTRUMENTATION`). Otherwise, the CPU Profiler will be +/// started and profile will be generated to the file specified by `name`. // TODO: Better multi-thread support. #[inline] pub fn start(name: impl AsRef) -> bool { @@ -49,10 +50,11 @@ pub fn start(name: impl AsRef) -> bool { true } -/// Stop profiling. Returns false if failed, i.e. there is no profiling in progress. +/// Stop profiling. Returns false if failed, i.e. there is no profiling in +/// progress. 
/// -/// When `profiling` feature is not enabled, this function will do nothing and there is totally -/// zero cost. +/// When `profiling` feature is not enabled, this function will do nothing and +/// there is totally zero cost. #[inline] pub fn stop() -> bool { let mut profiler = ACTIVE_PROFILER.lock().unwrap(); diff --git a/components/raft_log_engine/Cargo.toml b/components/raft_log_engine/Cargo.toml index 5df8d5f3852..cbccea9dbe0 100644 --- a/components/raft_log_engine/Cargo.toml +++ b/components/raft_log_engine/Cargo.toml @@ -5,19 +5,24 @@ publish = false edition = "2018" [dependencies] -encryption = { path = "../encryption" } -engine_traits = { path = "../engine_traits", default-features = false } -file_system = { path = "../file_system" } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +encryption = { workspace = true } +engine_traits = { workspace = true } +codec = { workspace = true } +file_system = { workspace = true } +kvproto = { workspace = true } lazy_static = "1.4.0" num_cpus = "1" -online_config = { path = "../online_config" } +online_config = { workspace = true } protobuf = "2" raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft-engine = { git = "https://github.com/tikv/raft-engine.git", features = ["swap"] } serde = "1.0" serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../tikv_util", default-features = false } +slog = { workspace = true } +slog-global = { workspace = true } +tikv_util = { workspace = true } time = "0.1" +tracker = { workspace = true } + +[dev-dependencies] +tempfile = "3.0" diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 145a122802d..621d708b057 100644 --- a/components/raft_log_engine/src/engine.rs +++ 
b/components/raft_log_engine/src/engine.rs @@ -7,21 +7,33 @@ use std::{ sync::Arc, }; +use codec::number::NumberCodec; use encryption::{DataKeyManager, DecrypterReader, EncrypterWriter}; use engine_traits::{ - CacheStats, EncryptionKeyManager, RaftEngine, RaftEngineDebug, RaftEngineReadOnly, - RaftLogBatch as RaftLogBatchTrait, RaftLogGCTask, Result, + CacheStats, EncryptionKeyManager, EncryptionMethod, PerfContextExt, PerfContextKind, PerfLevel, + RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch as RaftLogBatchTrait, Result, + CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, +}; +use file_system::{IoOp, IoRateLimiter, IoType, WithIoType}; +use kvproto::{ + metapb::Region, + raft_serverpb::{ + RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent, StoreRecoverState, + }, }; -use file_system::{IOOp, IORateLimiter, IOType}; -use kvproto::raft_serverpb::RaftLocalState; use raft::eraftpb::Entry; use raft_engine::{ - env::{DefaultFileSystem, FileSystem, Handle, WriteExt}, + env::{DefaultFileSystem, FileSystem, Handle, Permission, WriteExt}, Command, Engine as RawRaftEngine, Error as RaftEngineError, LogBatch, MessageExt, }; pub use raft_engine::{Config as RaftEngineConfig, ReadableSize, RecoveryMode}; use tikv_util::Either; +use crate::perf_context::RaftEnginePerfContext; + +// A special region ID representing store state. 
+const STORE_STATE_ID: u64 = 0; + #[derive(Clone)] pub struct MessageExtTyped; @@ -33,12 +45,12 @@ impl MessageExt for MessageExtTyped { } } -struct ManagedReader { +pub struct ManagedReader { inner: Either< ::Reader, DecrypterReader<::Reader>, >, - rate_limiter: Option>, + rate_limiter: Option>, } impl Seek for ManagedReader { @@ -54,7 +66,8 @@ impl Read for ManagedReader { fn read(&mut self, buf: &mut [u8]) -> IoResult { let mut size = buf.len(); if let Some(ref mut limiter) = self.rate_limiter { - size = limiter.request(IOType::ForegroundRead, IOOp::Read, size); + let io_type = file_system::get_io_type(); + size = limiter.request(io_type, IoOp::Read, size); } match self.inner.as_mut() { Either::Left(reader) => reader.read(&mut buf[..size]), @@ -63,12 +76,12 @@ impl Read for ManagedReader { } } -struct ManagedWriter { +pub struct ManagedWriter { inner: Either< ::Writer, EncrypterWriter<::Writer>, >, - rate_limiter: Option>, + rate_limiter: Option>, } impl Seek for ManagedWriter { @@ -84,7 +97,8 @@ impl Write for ManagedWriter { fn write(&mut self, buf: &[u8]) -> IoResult { let mut size = buf.len(); if let Some(ref mut limiter) = self.rate_limiter { - size = limiter.request(IOType::ForegroundWrite, IOOp::Write, size); + let io_type = file_system::get_io_type(); + size = limiter.request(io_type, IoOp::Write, size); } match self.inner.as_mut() { Either::Left(writer) => writer.write(&buf[..size]), @@ -106,13 +120,6 @@ impl WriteExt for ManagedWriter { } } - fn sync(&mut self) -> IoResult<()> { - match self.inner.as_mut() { - Either::Left(writer) => writer.sync(), - Either::Right(writer) => writer.inner_mut().sync(), - } - } - fn allocate(&mut self, offset: usize, size: usize) -> IoResult<()> { match self.inner.as_mut() { Either::Left(writer) => writer.allocate(offset, size), @@ -121,26 +128,26 @@ impl WriteExt for ManagedWriter { } } -struct ManagedFileSystem { - base_level_file_system: DefaultFileSystem, +pub struct ManagedFileSystem { + base_file_system: 
DefaultFileSystem, key_manager: Option>, - rate_limiter: Option>, + rate_limiter: Option>, } impl ManagedFileSystem { - fn new( + pub fn new( key_manager: Option>, - rate_limiter: Option>, + rate_limiter: Option>, ) -> Self { Self { - base_level_file_system: DefaultFileSystem, + base_file_system: DefaultFileSystem, key_manager, rate_limiter, } } } -struct ManagedHandle { +pub struct ManagedHandle { path: PathBuf, base: Arc<::Handle>, } @@ -153,6 +160,10 @@ impl Handle for ManagedHandle { fn file_size(&self) -> IoResult { self.base.file_size() } + + fn sync(&self) -> IoResult<()> { + self.base.sync() + } } impl FileSystem for ManagedFileSystem { @@ -161,7 +172,7 @@ impl FileSystem for ManagedFileSystem { type Writer = ManagedWriter; fn create>(&self, path: P) -> IoResult { - let base = Arc::new(self.base_level_file_system.create(path.as_ref())?); + let base = Arc::new(self.base_file_system.create(path.as_ref())?); if let Some(ref manager) = self.key_manager { manager.new_file(path.as_ref().to_str().unwrap())?; } @@ -171,17 +182,83 @@ impl FileSystem for ManagedFileSystem { }) } - fn open>(&self, path: P) -> IoResult { + fn open>(&self, path: P, perm: Permission) -> IoResult { Ok(ManagedHandle { path: path.as_ref().to_path_buf(), - base: Arc::new(self.base_level_file_system.open(path.as_ref())?), + base: Arc::new(self.base_file_system.open(path.as_ref(), perm)?), }) } + fn delete>(&self, path: P) -> IoResult<()> { + if let Some(ref manager) = self.key_manager { + manager.delete_file(path.as_ref().to_str().unwrap())?; + } + self.base_file_system.delete(path) + } + + fn rename>(&self, src_path: P, dst_path: P) -> IoResult<()> { + if let Some(ref manager) = self.key_manager { + // Note: `rename` will reuse the old encryption info from `src_path`. 
+ let src_str = src_path.as_ref().to_str().unwrap(); + let dst_str = dst_path.as_ref().to_str().unwrap(); + manager.link_file(src_str, dst_str)?; + let r = self + .base_file_system + .rename(src_path.as_ref(), dst_path.as_ref()); + let del_file = if r.is_ok() { src_str } else { dst_str }; + if let Err(e) = manager.delete_file(del_file) { + warn!("fail to remove encryption metadata during 'rename'"; "err" => ?e); + } + r + } else { + self.base_file_system.rename(src_path, dst_path) + } + } + + fn reuse>(&self, src_path: P, dst_path: P) -> IoResult<()> { + if let Some(ref manager) = self.key_manager { + // Note: In contrast to `rename`, `reuse` will make sure the encryption + // metadata is properly updated by rotating the encryption key for safety, + // when encryption flag is true. It won't rewrite the data blocks with + // the updated encryption metadata. Therefore, the old encrypted data + // won't be accessible after this calling. + let src_str = src_path.as_ref().to_str().unwrap(); + let dst_str = dst_path.as_ref().to_str().unwrap(); + manager.new_file(dst_path.as_ref().to_str().unwrap())?; + let r = self + .base_file_system + .rename(src_path.as_ref(), dst_path.as_ref()); + let del_file = if r.is_ok() { src_str } else { dst_str }; + if let Err(e) = manager.delete_file(del_file) { + warn!("fail to remove encryption metadata during 'reuse'"; "err" => ?e); + } + r + } else { + self.base_file_system.rename(src_path, dst_path) + } + } + + fn exists_metadata>(&self, path: P) -> bool { + if let Some(ref manager) = self.key_manager { + if let Ok(info) = manager.get_file(path.as_ref().to_str().unwrap()) { + if info.method != EncryptionMethod::Plaintext { + return true; + } + } + } + self.base_file_system.exists_metadata(path) + } + + fn delete_metadata>(&self, path: P) -> IoResult<()> { + if let Some(ref manager) = self.key_manager { + // Note: no error if the file doesn't exist. 
+ manager.delete_file(path.as_ref().to_str().unwrap())?; + } + self.base_file_system.delete_metadata(path) + } + fn new_reader(&self, handle: Arc) -> IoResult { - let base_reader = self - .base_level_file_system - .new_reader(handle.base.clone())?; + let base_reader = self.base_file_system.new_reader(handle.base.clone())?; if let Some(ref key_manager) = self.key_manager { Ok(ManagedReader { inner: Either::Right(key_manager.open_file_with_reader(&handle.path, base_reader)?), @@ -196,9 +273,7 @@ impl FileSystem for ManagedFileSystem { } fn new_writer(&self, handle: Arc) -> IoResult { - let base_writer = self - .base_level_file_system - .new_writer(handle.base.clone())?; + let base_writer = self.base_file_system.new_writer(handle.base.clone())?; if let Some(ref key_manager) = self.key_manager { Ok(ManagedWriter { @@ -218,6 +293,37 @@ impl FileSystem for ManagedFileSystem { } } +/// Convert a cf to id for encoding. +fn cf_to_id(cf: &str) -> u8 { + match cf { + CF_DEFAULT => 0, + CF_LOCK => 1, + CF_WRITE => 2, + CF_RAFT => 3, + _ => panic!("unrecognized cf {}", cf), + } +} +const MAX_CF_ID: u8 = 3; + +/// Encode a key in the format `{prefix}{num}`. +fn encode_key(prefix: &'static [u8], num: u64) -> [u8; 9] { + debug_assert_eq!(prefix.len(), 1); + let mut buf = [0; 9]; + buf[..prefix.len()].copy_from_slice(prefix); + NumberCodec::encode_u64(&mut buf[prefix.len()..], num); + buf +} + +/// Encode a flush key in the format `{flush key prefix}{cf_id}{tablet_index}`. 
+fn encode_flushed_key(cf: &str, tablet_index: u64) -> [u8; 10] { + debug_assert_eq!(FLUSH_STATE_KEY.len(), 1); + let mut buf = [0; 10]; + buf[..FLUSH_STATE_KEY.len()].copy_from_slice(FLUSH_STATE_KEY); + buf[FLUSH_STATE_KEY.len()] = cf_to_id(cf); + NumberCodec::encode_u64(&mut buf[FLUSH_STATE_KEY.len() + 1..], tablet_index); + buf +} + #[derive(Clone)] pub struct RaftLogEngine(Arc>); @@ -225,7 +331,7 @@ impl RaftLogEngine { pub fn new( config: RaftEngineConfig, key_manager: Option>, - rate_limiter: Option>, + rate_limiter: Option>, ) -> Result { let file_system = Arc::new(ManagedFileSystem::new(key_manager, rate_limiter)); Ok(RaftLogEngine(Arc::new( @@ -239,7 +345,7 @@ impl RaftLogEngine { if !path.exists() || !path.is_dir() { return false; } - fs::read_dir(&path).unwrap().next().is_some() + fs::read_dir(path).unwrap().next().is_some() } pub fn raft_groups(&self) -> Vec { @@ -255,22 +361,41 @@ impl RaftLogEngine { } } +impl PerfContextExt for RaftLogEngine { + type PerfContext = RaftEnginePerfContext; + + fn get_perf_context(_level: PerfLevel, _kind: PerfContextKind) -> Self::PerfContext { + RaftEnginePerfContext + } +} + #[derive(Default)] pub struct RaftLogBatch(LogBatch); const RAFT_LOG_STATE_KEY: &[u8] = b"R"; +const STORE_IDENT_KEY: &[u8] = &[0x01]; +const PREPARE_BOOTSTRAP_REGION_KEY: &[u8] = &[0x02]; +const REGION_STATE_KEY: &[u8] = &[0x03]; +const APPLY_STATE_KEY: &[u8] = &[0x04]; +const RECOVER_STATE_KEY: &[u8] = &[0x05]; +const FLUSH_STATE_KEY: &[u8] = &[0x06]; +const DIRTY_MARK_KEY: &[u8] = &[0x07]; +// All keys are of the same length. +const KEY_PREFIX_LEN: usize = RAFT_LOG_STATE_KEY.len(); impl RaftLogBatchTrait for RaftLogBatch { - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { + fn append( + &mut self, + raft_group_id: u64, + _overwrite_to: Option, + entries: Vec, + ) -> Result<()> { + // overwrite is handled within raft log engine. 
self.0 .add_entries::(raft_group_id, &entries) .map_err(transfer_error) } - fn cut_logs(&mut self, _: u64, _: u64, _: u64) { - // It's unnecessary because overlapped entries can be handled in `append`. - } - fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { self.0 .put_message(raft_group_id, RAFT_LOG_STATE_KEY.to_vec(), state) @@ -288,6 +413,85 @@ impl RaftLogBatchTrait for RaftLogBatch { fn merge(&mut self, mut src: Self) -> Result<()> { self.0.merge(&mut src.0).map_err(transfer_error) } + + fn put_store_ident(&mut self, ident: &StoreIdent) -> Result<()> { + self.0 + .put_message(STORE_STATE_ID, STORE_IDENT_KEY.to_vec(), ident) + .map_err(transfer_error) + } + + fn put_prepare_bootstrap_region(&mut self, region: &Region) -> Result<()> { + self.0 + .put_message( + STORE_STATE_ID, + PREPARE_BOOTSTRAP_REGION_KEY.to_vec(), + region, + ) + .map_err(transfer_error) + } + + fn remove_prepare_bootstrap_region(&mut self) -> Result<()> { + self.0 + .delete(STORE_STATE_ID, PREPARE_BOOTSTRAP_REGION_KEY.to_vec()); + Ok(()) + } + + fn put_region_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RegionLocalState, + ) -> Result<()> { + let key = encode_key(REGION_STATE_KEY, apply_index); + self.0 + .put_message(raft_group_id, key.to_vec(), state) + .map_err(transfer_error) + } + + fn put_apply_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RaftApplyState, + ) -> Result<()> { + let key = encode_key(APPLY_STATE_KEY, apply_index); + self.0 + .put_message(raft_group_id, key.to_vec(), state) + .map_err(transfer_error) + } + + fn put_flushed_index( + &mut self, + raft_group_id: u64, + cf: &str, + tablet_index: u64, + apply_index: u64, + ) -> Result<()> { + let key = encode_flushed_key(cf, tablet_index); + let mut value = vec![0; 8]; + NumberCodec::encode_u64(&mut value, apply_index); + self.0 + .put(raft_group_id, key.to_vec(), value) + .map_err(transfer_error) + } + + fn put_dirty_mark(&mut self, 
raft_group_id: u64, tablet_index: u64, dirty: bool) -> Result<()> { + let key = encode_key(DIRTY_MARK_KEY, tablet_index); + if dirty { + self.0 + .put(raft_group_id, key.to_vec(), vec![]) + .map_err(transfer_error) + } else { + self.0.delete(raft_group_id, key.to_vec()); + Ok(()) + } + } + + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { + self.0 + .put_message(STORE_STATE_ID, RECOVER_STATE_KEY.to_vec(), state) + .map_err(transfer_error) + } } impl RaftEngineReadOnly for RaftLogEngine { @@ -324,6 +528,101 @@ impl RaftEngineReadOnly for RaftLogEngine { } Ok(()) } + + fn is_empty(&self) -> Result { + self.get_store_ident().map(|i| i.is_none()) + } + + fn get_store_ident(&self) -> Result> { + self.0 + .get_message(STORE_STATE_ID, STORE_IDENT_KEY) + .map_err(transfer_error) + } + + fn get_prepare_bootstrap_region(&self) -> Result> { + self.0 + .get_message(STORE_STATE_ID, PREPARE_BOOTSTRAP_REGION_KEY) + .map_err(transfer_error) + } + + fn get_region_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result> { + let mut state = None; + self.0 + .scan_messages( + raft_group_id, + Some(REGION_STATE_KEY), + Some(APPLY_STATE_KEY), + true, + |key, value| { + let index = NumberCodec::decode_u64(&key[REGION_STATE_KEY.len()..]); + if index > apply_index { + true + } else { + state = Some(value); + false + } + }, + ) + .map_err(transfer_error)?; + Ok(state) + } + + fn get_apply_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result> { + let mut state = None; + self.0 + .scan_messages( + raft_group_id, + Some(APPLY_STATE_KEY), + Some(RECOVER_STATE_KEY), + true, + |key, value| { + let index = NumberCodec::decode_u64(&key[REGION_STATE_KEY.len()..]); + if index > apply_index { + true + } else { + state = Some(value); + false + } + }, + ) + .map_err(transfer_error)?; + Ok(state) + } + + fn get_flushed_index(&self, raft_group_id: u64, cf: &str) -> Result> { + let mut start = [0; 2]; + 
start[..FLUSH_STATE_KEY.len()].copy_from_slice(FLUSH_STATE_KEY); + start[FLUSH_STATE_KEY.len()] = cf_to_id(cf); + let mut end = start; + end[FLUSH_STATE_KEY.len()] += 1; + let mut index = None; + self.0 + .scan_raw_messages(raft_group_id, Some(&start), Some(&end), true, |_, v| { + index = Some(NumberCodec::decode_u64(v)); + false + }) + .map_err(transfer_error)?; + Ok(index) + } + + fn get_dirty_mark(&self, raft_group_id: u64, tablet_index: u64) -> Result { + let key = encode_key(DIRTY_MARK_KEY, tablet_index); + Ok(self.0.get(raft_group_id, &key).is_some()) + } + + fn get_recover_state(&self) -> Result> { + self.0 + .get_message(STORE_STATE_ID, RECOVER_STATE_KEY) + .map_err(transfer_error) + } } impl RaftEngineDebug for RaftLogEngine { @@ -356,6 +655,8 @@ impl RaftEngine for RaftLogEngine { } fn consume(&self, batch: &mut Self::LogBatch, sync: bool) -> Result { + // Always use ForegroundWrite as all `consume` calls share the same write queue. + let _guard = WithIoType::new(IoType::ForegroundWrite); self.0.write(&mut batch.0, sync).map_err(transfer_error) } @@ -366,6 +667,8 @@ impl RaftEngine for RaftLogEngine { _: usize, _: usize, ) -> Result { + // Always use ForegroundWrite as all `consume` calls share the same write queue. 
+ let _guard = WithIoType::new(IoType::ForegroundWrite); self.0.write(&mut batch.0, sync).map_err(transfer_error) } @@ -380,72 +683,95 @@ impl RaftEngine for RaftLogEngine { Ok(()) } - fn append(&self, raft_group_id: u64, entries: Vec) -> Result { - let mut batch = Self::LogBatch::default(); + fn gc( + &self, + raft_group_id: u64, + _from: u64, + to: u64, + batch: &mut Self::LogBatch, + ) -> Result<()> { batch .0 - .add_entries::(raft_group_id, &entries) - .map_err(transfer_error)?; - self.0.write(&mut batch.0, false).map_err(transfer_error) + .add_command(raft_group_id, Command::Compact { index: to }); + Ok(()) } - fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { - let mut batch = Self::LogBatch::default(); - batch - .0 - .put_message(raft_group_id, RAFT_LOG_STATE_KEY.to_vec(), state) + fn delete_all_but_one_states_before( + &self, + raft_group_id: u64, + apply_index: u64, + batch: &mut Self::LogBatch, + ) -> Result<()> { + // Makes sure REGION_STATE_KEY is the smallest and FLUSH_STATE_KEY is the + // largest. 
+ debug_assert!(REGION_STATE_KEY < APPLY_STATE_KEY); + debug_assert!(APPLY_STATE_KEY < FLUSH_STATE_KEY); + + let mut end = [0; KEY_PREFIX_LEN + 1]; + end[..KEY_PREFIX_LEN].copy_from_slice(FLUSH_STATE_KEY); + end[KEY_PREFIX_LEN] = MAX_CF_ID + 1; + let mut found_region_state = false; + let mut found_apply_state = false; + let mut found_flush_state = [false; MAX_CF_ID as usize + 1]; + self.0 + .scan_raw_messages( + raft_group_id, + Some(REGION_STATE_KEY), + Some(&end), + true, + |key, _| { + match &key[..KEY_PREFIX_LEN] { + REGION_STATE_KEY + if NumberCodec::decode_u64(&key[KEY_PREFIX_LEN..]) <= apply_index => + { + if found_region_state { + batch.0.delete(raft_group_id, key.to_vec()); + } else { + found_region_state = true; + } + } + APPLY_STATE_KEY + if NumberCodec::decode_u64(&key[KEY_PREFIX_LEN..]) <= apply_index => + { + if found_apply_state { + batch.0.delete(raft_group_id, key.to_vec()); + } else { + found_apply_state = true; + } + } + FLUSH_STATE_KEY => { + let cf_id = key[KEY_PREFIX_LEN]; + let tablet_index = NumberCodec::decode_u64(&key[KEY_PREFIX_LEN + 1..]); + if cf_id <= MAX_CF_ID && tablet_index <= apply_index { + if found_flush_state[cf_id as usize] { + batch.0.delete(raft_group_id, key.to_vec()); + } else { + found_flush_state[cf_id as usize] = true; + } + } + } + _ => {} + } + true + }, + ) .map_err(transfer_error)?; - self.0.write(&mut batch.0, false).map_err(transfer_error)?; Ok(()) } - fn gc(&self, raft_group_id: u64, from: u64, to: u64) -> Result { - self.batch_gc(vec![RaftLogGCTask { - raft_group_id, - from, - to, - }]) + fn need_manual_purge(&self) -> bool { + true } - fn batch_gc(&self, tasks: Vec) -> Result { - let mut batch = self.log_batch(tasks.len()); - let mut old_first_index = Vec::with_capacity(tasks.len()); - for task in &tasks { - batch - .0 - .add_command(task.raft_group_id, Command::Compact { index: task.to }); - old_first_index.push(self.0.first_index(task.raft_group_id)); - } - - self.0.write(&mut batch.0, 
false).map_err(transfer_error)?; - - let mut total = 0; - for (old_first_index, task) in old_first_index.iter().zip(tasks) { - let new_first_index = self.0.first_index(task.raft_group_id); - if let (Some(old), Some(new)) = (old_first_index, new_first_index) { - total += new.saturating_sub(*old); - } - } - Ok(total as usize) - } - - fn purge_expired_files(&self) -> Result> { + fn manual_purge(&self) -> Result> { self.0.purge_expired_files().map_err(transfer_error) } - fn has_builtin_entry_cache(&self) -> bool { - false - } - - fn gc_entry_cache(&self, _raft_group_id: u64, _to: u64) {} - /// Flush current cache stats. fn flush_stats(&self) -> Option { None } - fn stop(&self) {} - fn dump_stats(&self) -> Result { // Raft engine won't dump anything. Ok("".to_owned()) @@ -454,6 +780,23 @@ impl RaftEngine for RaftLogEngine { fn get_engine_size(&self) -> Result { Ok(self.0.get_used_size() as u64) } + + fn get_engine_path(&self) -> &str { + self.0.path() + } + + fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> + where + F: FnMut(u64) -> std::result::Result<(), E>, + E: From, + { + for id in self.0.raft_groups() { + if id != STORE_STATE_ID { + f(id)?; + } + } + Ok(()) + } } fn transfer_error(e: RaftEngineError) -> engine_traits::Error { @@ -466,3 +809,67 @@ fn transfer_error(e: RaftEngineError) -> engine_traits::Error { } } } + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + + use engine_traits::ALL_CFS; + + use super::*; + + #[test] + fn test_apply_related_states() { + let dir = tempfile::tempdir().unwrap(); + let cfg = RaftEngineConfig { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let engine = RaftLogEngine::new(cfg, None, None).unwrap(); + assert_matches!(engine.get_region_state(2, u64::MAX), Ok(None)); + assert_matches!(engine.get_apply_state(2, u64::MAX), Ok(None)); + for cf in ALL_CFS { + assert_matches!(engine.get_flushed_index(2, cf), Ok(None)); + } + + let mut wb = engine.log_batch(10); 
+ let mut region_state = RegionLocalState::default(); + region_state.mut_region().set_id(3); + wb.put_region_state(2, 1, &region_state).unwrap(); + let mut apply_state = RaftApplyState::default(); + apply_state.set_applied_index(3); + wb.put_apply_state(2, 3, &apply_state).unwrap(); + for cf in ALL_CFS.iter().take(2) { + wb.put_flushed_index(2, cf, 5, 4).unwrap(); + } + engine.consume(&mut wb, false).unwrap(); + + for cf in ALL_CFS.iter().take(2) { + assert_matches!(engine.get_flushed_index(2, cf), Ok(Some(4))); + } + for cf in ALL_CFS.iter().skip(2) { + assert_matches!(engine.get_flushed_index(2, cf), Ok(None)); + } + + let mut region_state2 = region_state.clone(); + region_state2.mut_region().set_id(5); + wb.put_region_state(2, 4, &region_state2).unwrap(); + let mut apply_state2 = apply_state.clone(); + apply_state2.set_applied_index(5); + wb.put_apply_state(2, 5, &apply_state2).unwrap(); + for cf in ALL_CFS { + wb.put_flushed_index(2, cf, 6, 5).unwrap(); + } + engine.consume(&mut wb, false).unwrap(); + + assert_matches!(engine.get_region_state(2, 0), Ok(None)); + assert_matches!(engine.get_region_state(2, 1), Ok(Some(s)) if s == region_state); + assert_matches!(engine.get_region_state(2, 4), Ok(Some(s)) if s == region_state2); + assert_matches!(engine.get_apply_state(2, 0), Ok(None)); + assert_matches!(engine.get_apply_state(2, 3), Ok(Some(s)) if s == apply_state); + assert_matches!(engine.get_apply_state(2, 5), Ok(Some(s)) if s == apply_state2); + for cf in ALL_CFS { + assert_matches!(engine.get_flushed_index(2, cf), Ok(Some(5))); + } + } +} diff --git a/components/raft_log_engine/src/lib.rs b/components/raft_log_engine/src/lib.rs index 8b83acfe6be..25899ddf2bb 100644 --- a/components/raft_log_engine/src/lib.rs +++ b/components/raft_log_engine/src/lib.rs @@ -10,15 +10,20 @@ //! Because there are so many similarly named types across the TiKV codebase, //! and so much "import renaming", this crate consistently explicitly names type //! 
that implement a trait as `RocksTraitname`, to avoid the need for import -//! renaming and make it obvious what type any particular module is working with. +//! renaming and make it obvious what type any particular module is working +//! with. //! //! Please read the engine_trait crate docs before hacking. #![cfg_attr(test, feature(test))] -#![feature(generic_associated_types)] +#![feature(assert_matches)] #[macro_use] extern crate tikv_util; mod engine; -pub use engine::{RaftEngineConfig, RaftLogBatch, RaftLogEngine, ReadableSize, RecoveryMode}; +mod perf_context; + +pub use engine::{ + ManagedFileSystem, RaftEngineConfig, RaftLogBatch, RaftLogEngine, ReadableSize, RecoveryMode, +}; diff --git a/components/raft_log_engine/src/perf_context.rs b/components/raft_log_engine/src/perf_context.rs new file mode 100644 index 00000000000..87946e2f48e --- /dev/null +++ b/components/raft_log_engine/src/perf_context.rs @@ -0,0 +1,29 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use raft_engine::get_perf_context; +use tracker::{TrackerToken, GLOBAL_TRACKERS}; + +#[derive(Debug)] +pub struct RaftEnginePerfContext; + +impl engine_traits::PerfContext for RaftEnginePerfContext { + fn start_observe(&mut self) { + raft_engine::set_perf_context(Default::default()); + } + + fn report_metrics(&mut self, trackers: &[TrackerToken]) { + let perf_context = get_perf_context(); + for token in trackers { + GLOBAL_TRACKERS.with_tracker(*token, |t| { + t.metrics.store_thread_wait_nanos = + perf_context.write_wait_duration.as_nanos() as u64; + t.metrics.store_write_wal_nanos = (perf_context.log_write_duration + + perf_context.log_sync_duration + + perf_context.log_rotate_duration) + .as_nanos() as u64; + t.metrics.store_write_memtable_nanos = + perf_context.apply_duration.as_nanos() as u64; + }); + } + } +} diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml new file mode 100644 index 00000000000..84daa4c40b5 --- /dev/null +++ b/components/raftstore-v2/Cargo.toml @@ -0,0 +1,84 @@ +[package] +name = "raftstore-v2" +version = "0.1.0" +edition = "2021" + +[features] +default = ["testexport", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] +failpoints = ["raftstore/failpoints"] +testexport = ["raftstore/testexport"] +test-engine-kv-rocksdb = [ + "raftstore/test-engine-kv-rocksdb", + "engine_test/test-engine-kv-rocksdb", +] +test-engine-raft-raft-engine = [ + "raftstore/test-engine-raft-raft-engine", + "engine_test/test-engine-raft-raft-engine", +] +test-engines-rocksdb = [ + "raftstore/test-engines-rocksdb", + "engine_test/test-engines-rocksdb", +] +test-engines-panic = [ + "raftstore/test-engines-panic", + "engine_test/test-engines-panic", +] + +cloud-aws = ["raftstore/cloud-aws"] +cloud-gcp = ["raftstore/cloud-gcp"] +cloud-azure = ["raftstore/cloud-azure"] + +[dependencies] +batch-system = { workspace = true } +bytes = "1.0" +causal_ts = { workspace = true } +collections = { workspace = true } +concurrency_manager 
= { workspace = true } +crossbeam = "0.8" +encryption_export = { workspace = true } +engine_traits = { workspace = true } +error_code = { workspace = true } +fail = "0.5" +file_system = { workspace = true } +fs2 = "0.4" +futures = { version = "0.3", features = ["compat"] } +keys = { workspace = true } +kvproto = { workspace = true } +log_wrappers = { workspace = true } +parking_lot = "0.12" +pd_client = { workspace = true } +prometheus = { version = "0.13", features = ["nightly"] } +protobuf = { version = "2.8", features = ["bytes"] } +raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } +raft-proto = { version = "0.7.0" } +raftstore = { workspace = true } +rand = "0.8.3" +resource_control = { workspace = true } +resource_metering = { workspace = true } +slog = "2.3" +smallvec = "1.4" +sst_importer = { workspace = true } +thiserror = "1.0" +tikv_util = { workspace = true } +time = "0.1" +tracker = { workspace = true } +txn_types = { workspace = true } +yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } + +[dev-dependencies] +engine_rocks = { workspace = true } +engine_test = { workspace = true } +slog-global = { workspace = true } +tempfile = "3.0" +test_pd = { workspace = true } +test_util = { workspace = true } + +[[test]] +name = "raftstore-v2-failpoints" +path = "tests/failpoints/mod.rs" +required-features = ["failpoints", "testexport", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] + +[[test]] +name = "raftstore-v2-integrations" +path = "tests/integrations/mod.rs" +required-features = ["testexport", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] diff --git a/components/raftstore-v2/src/batch/mod.rs b/components/raftstore-v2/src/batch/mod.rs new file mode 100644 index 00000000000..7daeebaa8f0 --- /dev/null +++ b/components/raftstore-v2/src/batch/mod.rs @@ -0,0 +1,10 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! 
This module contains the specialized implementation of batch systems. +//! +//! StoreSystem is used for polling raft state machines, ApplySystem is used for +//! applying logs. + +mod store; + +pub use store::{create_store_batch_system, StoreContext, StoreRouter, StoreSystem}; diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs new file mode 100644 index 00000000000..1f6245cc010 --- /dev/null +++ b/components/raftstore-v2/src/batch/store.rs @@ -0,0 +1,831 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + cmp, + ops::{Deref, DerefMut}, + path::Path, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, Mutex, + }, + time::Duration, +}; + +use batch_system::{ + BasicMailbox, BatchRouter, BatchSystem, HandleResult, HandlerBuilder, PollHandler, +}; +use causal_ts::CausalTsProviderImpl; +use collections::HashMap; +use concurrency_manager::ConcurrencyManager; +use crossbeam::channel::TrySendError; +use encryption_export::DataKeyManager; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; +use file_system::{set_io_type, IoType, WithIoType}; +use kvproto::{disk_usage::DiskUsage, raft_serverpb::RaftMessage}; +use pd_client::PdClient; +use raft::{StateRole, INVALID_ID}; +use raftstore::{ + coprocessor::{CoprocessorHost, RegionChangeEvent}, + store::{ + fsm::{ + store::{PeerTickBatch, ENTRY_CACHE_EVICT_TICK_DURATION}, + GlobalStoreStat, LocalStoreStat, + }, + local_metrics::RaftMetrics, + AutoSplitController, Config, ReadRunner, ReadTask, SplitCheckRunner, SplitCheckTask, + StoreWriters, TabletSnapManager, Transport, WriteSenders, + }, +}; +use resource_metering::CollectorRegHandle; +use slog::{warn, Logger}; +use sst_importer::SstImporter; +use tikv_util::{ + box_err, + config::{Tracker, VersionTrack}, + log::SlogFormat, + sys::SysQuota, + time::{duration_to_sec, Instant as TiInstant}, + timer::SteadyTimer, + worker::{LazyWorker, Scheduler, Worker}, + yatp_pool::{DefaultTicker, 
FuturePool, YatpPoolBuilder}, + Either, +}; +use time::Timespec; + +use crate::{ + fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, StoreMeta}, + operation::{SharedReadTablet, MERGE_IN_PROGRESS_PREFIX, MERGE_SOURCE_PREFIX, SPLIT_PREFIX}, + raft::Storage, + router::{PeerMsg, PeerTick, StoreMsg}, + worker::{pd, tablet}, + Error, Result, +}; + +/// A per-thread context shared by the [`StoreFsm`] and multiple [`PeerFsm`]s. +pub struct StoreContext { + /// A logger without any KV. It's clean for creating new PeerFSM. + pub logger: Logger, + pub store_id: u64, + pub coprocessor_host: CoprocessorHost, + /// The transport for sending messages to peers on other stores. + pub trans: T, + pub current_time: Option, + pub has_ready: bool, + pub raft_metrics: RaftMetrics, + /// The latest configuration. + pub cfg: Config, + pub router: StoreRouter, + /// The tick batch for delay ticking. It will be flushed at the end of every + /// round. + pub tick_batch: Vec, + /// The precise timer for scheduling tick. + pub timer: SteadyTimer, + pub schedulers: Schedulers, + /// store meta + pub store_meta: Arc>>, + pub shutdown: Arc, + pub engine: ER, + pub tablet_registry: TabletRegistry, + pub apply_pool: FuturePool, + + /// Disk usage for the store itself. 
+ pub self_disk_usage: DiskUsage, + + pub snap_mgr: TabletSnapManager, + pub global_stat: GlobalStoreStat, + pub store_stat: LocalStoreStat, + pub sst_importer: Arc, + pub key_manager: Option>, +} + +impl StoreContext { + pub fn update_ticks_timeout(&mut self) { + self.tick_batch[PeerTick::Raft as usize].wait_duration = self.cfg.raft_base_tick_interval.0; + self.tick_batch[PeerTick::CompactLog as usize].wait_duration = + self.cfg.raft_log_gc_tick_interval.0; + self.tick_batch[PeerTick::EntryCacheEvict as usize].wait_duration = + ENTRY_CACHE_EVICT_TICK_DURATION; + self.tick_batch[PeerTick::PdHeartbeat as usize].wait_duration = + self.cfg.pd_heartbeat_tick_interval.0; + self.tick_batch[PeerTick::SplitRegionCheck as usize].wait_duration = + self.cfg.split_region_check_tick_interval.0; + self.tick_batch[PeerTick::CheckPeerStaleState as usize].wait_duration = + self.cfg.peer_stale_state_check_interval.0; + self.tick_batch[PeerTick::CheckMerge as usize].wait_duration = + self.cfg.merge_check_tick_interval.0; + self.tick_batch[PeerTick::CheckLeaderLease as usize].wait_duration = + self.cfg.check_leader_lease_interval.0; + self.tick_batch[PeerTick::ReactivateMemoryLock as usize].wait_duration = + self.cfg.reactive_memory_lock_tick_interval.0; + self.tick_batch[PeerTick::ReportBuckets as usize].wait_duration = + self.cfg.report_region_buckets_tick_interval.0; + self.tick_batch[PeerTick::CheckLongUncommitted as usize].wait_duration = + self.cfg.check_long_uncommitted_interval.0; + self.tick_batch[PeerTick::GcPeer as usize].wait_duration = + 60 * cmp::min(Duration::from_secs(1), self.cfg.raft_base_tick_interval.0); + } +} + +/// A [`PollHandler`] that handles updates of [`StoreFsm`]s and [`PeerFsm`]s. +/// +/// It is responsible for: +/// +/// - Keeping the local [`StoreContext`] up-to-date. +/// - Receiving and sending messages in and out of these FSMs. +struct StorePoller { + poll_ctx: StoreContext, + cfg_tracker: Tracker, + /// Buffers to hold in-coming messages. 
+ store_msg_buf: Vec, + peer_msg_buf: Vec, + timer: tikv_util::time::Instant, + /// These fields controls the timing of flushing messages generated by + /// FSMs. + last_flush_time: TiInstant, + need_flush_events: bool, +} + +impl StorePoller { + pub fn new(poll_ctx: StoreContext, cfg_tracker: Tracker) -> Self { + Self { + poll_ctx, + cfg_tracker, + store_msg_buf: Vec::new(), + peer_msg_buf: Vec::new(), + timer: tikv_util::time::Instant::now(), + last_flush_time: TiInstant::now(), + need_flush_events: false, + } + } + + /// Updates the internal buffer to match the latest configuration. + fn apply_buf_capacity(&mut self) { + let new_cap = self.messages_per_tick(); + tikv_util::set_vec_capacity(&mut self.store_msg_buf, new_cap); + tikv_util::set_vec_capacity(&mut self.peer_msg_buf, new_cap); + } + + #[inline] + fn messages_per_tick(&self) -> usize { + self.poll_ctx.cfg.messages_per_tick + } + + fn flush_events(&mut self) { + self.schedule_ticks(); + self.poll_ctx.raft_metrics.maybe_flush(); + self.poll_ctx.store_stat.flush(); + } + + fn schedule_ticks(&mut self) { + assert_eq!(PeerTick::all_ticks().len(), self.poll_ctx.tick_batch.len()); + for batch in &mut self.poll_ctx.tick_batch { + batch.schedule(&self.poll_ctx.timer); + } + } +} + +impl PollHandler, StoreFsm> + for StorePoller +{ + fn begin(&mut self, _batch_size: usize, update_cfg: F) + where + for<'a> F: FnOnce(&'a batch_system::Config), + { + if self.store_msg_buf.capacity() == 0 || self.peer_msg_buf.capacity() == 0 { + self.apply_buf_capacity(); + } + // Apply configuration changes. 
+ if let Some(cfg) = self.cfg_tracker.any_new().map(|c| c.clone()) { + let last_messages_per_tick = self.messages_per_tick(); + self.poll_ctx.cfg = cfg; + if self.poll_ctx.cfg.messages_per_tick != last_messages_per_tick { + self.apply_buf_capacity(); + } + update_cfg(&self.poll_ctx.cfg.store_batch_system); + self.poll_ctx.update_ticks_timeout(); + } + self.poll_ctx.has_ready = false; + self.poll_ctx.current_time = None; + self.timer = tikv_util::time::Instant::now(); + } + + fn handle_control(&mut self, fsm: &mut StoreFsm) -> Option { + debug_assert!(self.store_msg_buf.is_empty()); + let batch_size = self.messages_per_tick(); + let received_cnt = fsm.recv(&mut self.store_msg_buf, batch_size); + let expected_msg_count = if received_cnt == batch_size { + None + } else { + Some(0) + }; + let mut delegate = StoreFsmDelegate::new(fsm, &mut self.poll_ctx); + delegate.handle_msgs(&mut self.store_msg_buf); + expected_msg_count + } + + fn handle_normal(&mut self, fsm: &mut impl DerefMut>) -> HandleResult { + debug_assert!(self.peer_msg_buf.is_empty()); + let batch_size = self.messages_per_tick(); + let received_cnt = fsm.recv(&mut self.peer_msg_buf, batch_size); + let handle_result = if received_cnt == batch_size { + HandleResult::KeepProcessing + } else { + HandleResult::stop_at(0, false) + }; + let mut delegate = PeerFsmDelegate::new(fsm, &mut self.poll_ctx); + delegate.on_msgs(&mut self.peer_msg_buf); + delegate + .fsm + .peer_mut() + .handle_raft_ready(delegate.store_ctx); + handle_result + } + + fn light_end(&mut self, _batch: &mut [Option>>]) { + if self.poll_ctx.trans.need_flush() { + self.poll_ctx.trans.flush(); + } + + let now = TiInstant::now(); + if now.saturating_duration_since(self.last_flush_time) >= Duration::from_millis(1) { + self.last_flush_time = now; + self.need_flush_events = false; + self.flush_events(); + } else { + self.need_flush_events = true; + } + } + + fn end(&mut self, _batch: &mut [Option>>]) { + let dur = self.timer.saturating_elapsed(); + 
self.poll_ctx + .raft_metrics + .process_ready + .observe(duration_to_sec(dur)); + } + + fn pause(&mut self) { + if self.poll_ctx.trans.need_flush() { + self.poll_ctx.trans.flush(); + } + + if self.need_flush_events { + self.last_flush_time = TiInstant::now(); + self.need_flush_events = false; + self.flush_events(); + } + } +} + +struct StorePollerBuilder { + cfg: Arc>, + coprocessor_host: CoprocessorHost, + store_id: u64, + engine: ER, + tablet_registry: TabletRegistry, + trans: T, + router: StoreRouter, + schedulers: Schedulers, + apply_pool: FuturePool, + logger: Logger, + store_meta: Arc>>, + shutdown: Arc, + snap_mgr: TabletSnapManager, + global_stat: GlobalStoreStat, + sst_importer: Arc, + key_manager: Option>, +} + +impl StorePollerBuilder { + pub fn new( + cfg: Arc>, + store_id: u64, + engine: ER, + tablet_registry: TabletRegistry, + trans: T, + router: StoreRouter, + schedulers: Schedulers, + logger: Logger, + store_meta: Arc>>, + shutdown: Arc, + snap_mgr: TabletSnapManager, + coprocessor_host: CoprocessorHost, + sst_importer: Arc, + key_manager: Option>, + ) -> Self { + let pool_size = cfg.value().apply_batch_system.pool_size; + let max_pool_size = std::cmp::max( + pool_size, + std::cmp::max(4, SysQuota::cpu_cores_quota() as usize), + ); + let apply_pool = YatpPoolBuilder::new(DefaultTicker::default()) + .thread_count(1, pool_size, max_pool_size) + .after_start(move || set_io_type(IoType::ForegroundWrite)) + .name_prefix("apply") + .build_future_pool(); + let global_stat = GlobalStoreStat::default(); + StorePollerBuilder { + cfg, + store_id, + engine, + tablet_registry, + trans, + router, + apply_pool, + logger, + schedulers, + store_meta, + snap_mgr, + shutdown, + coprocessor_host, + global_stat, + sst_importer, + key_manager, + } + } + + /// Initializes all the existing raft machines and cleans up stale tablets. 
+ fn init(&self) -> Result>> { + let mut regions = HashMap::default(); + let cfg = self.cfg.value(); + let mut meta = self.store_meta.lock().unwrap(); + self.engine + .for_each_raft_group::(&mut |region_id| { + assert_ne!(region_id, INVALID_ID); + let storage = match Storage::new( + region_id, + self.store_id, + self.engine.clone(), + self.schedulers.read.clone(), + &self.logger, + )? { + Some(p) => p, + None => return Ok(()), + }; + + if storage.is_initialized() { + self.coprocessor_host.on_region_changed( + storage.region(), + RegionChangeEvent::Create, + StateRole::Follower, + ); + } + meta.set_region(storage.region(), storage.is_initialized(), &self.logger); + + let (sender, peer_fsm) = PeerFsm::new( + &cfg, + &self.tablet_registry, + self.key_manager.as_deref(), + &self.snap_mgr, + storage, + )?; + meta.region_read_progress + .insert(region_id, peer_fsm.as_ref().peer().read_progress().clone()); + + let prev = regions.insert(region_id, (sender, peer_fsm)); + if let Some((_, p)) = prev { + return Err(box_err!( + "duplicate region {} vs {}", + SlogFormat(p.logger()), + SlogFormat(regions[®ion_id].1.logger()) + )); + } + Ok(()) + })?; + self.clean_up_tablets(®ions)?; + Ok(regions) + } + + #[inline] + fn remove_dir(&self, p: &Path) -> Result<()> { + if let Some(m) = &self.key_manager { + m.remove_dir(p, None)?; + } + file_system::remove_dir_all(p)?; + Ok(()) + } + + fn clean_up_tablets(&self, peers: &HashMap>) -> Result<()> { + for entry in file_system::read_dir(self.tablet_registry.tablet_root())? { + let entry = entry?; + let path = entry.path(); + if path.extension().map_or(false, |s| s == "tmp") { + // The directory may be generated by an aborted checkpoint. + self.remove_dir(&path)?; + continue; + } + let Some((prefix, region_id, tablet_index)) = self.tablet_registry.parse_tablet_name(&path) else { continue }; + // Keep the checkpoint even if source is destroyed. 
+ if prefix == MERGE_SOURCE_PREFIX { + continue; + } + let fsm = match peers.get(®ion_id) { + Some((_, fsm)) => fsm, + None => { + // The peer is either destroyed or not created yet. It will be + // recovered by leader heartbeats. + self.remove_dir(&path)?; + continue; + } + }; + // Valid split tablet should be installed during recovery. + if prefix == SPLIT_PREFIX { + self.remove_dir(&path)?; + continue; + } else if prefix == MERGE_IN_PROGRESS_PREFIX { + continue; + } else if prefix.is_empty() { + // Stale split data can be deleted. + if fsm.peer().storage().tablet_index() > tablet_index { + self.remove_dir(&path)?; + } + } else { + debug_assert!(false, "unexpected tablet prefix: {}", path.display()); + warn!(self.logger, "unexpected tablet prefix"; "path" => %path.display()); + } + } + // TODO: list all available tablets and destroy those which are not in the + // peers. + Ok(()) + } +} + +impl HandlerBuilder, StoreFsm> for StorePollerBuilder +where + ER: RaftEngine, + EK: KvEngine, + T: Transport + 'static, +{ + type Handler = StorePoller; + + fn build(&mut self, _priority: batch_system::Priority) -> Self::Handler { + let cfg = self.cfg.value().clone(); + let mut poll_ctx = StoreContext { + logger: self.logger.clone(), + store_id: self.store_id, + trans: self.trans.clone(), + current_time: None, + has_ready: false, + raft_metrics: RaftMetrics::new(cfg.waterfall_metrics), + cfg, + router: self.router.clone(), + tick_batch: vec![PeerTickBatch::default(); PeerTick::VARIANT_COUNT], + timer: SteadyTimer::default(), + schedulers: self.schedulers.clone(), + store_meta: self.store_meta.clone(), + shutdown: self.shutdown.clone(), + engine: self.engine.clone(), + tablet_registry: self.tablet_registry.clone(), + apply_pool: self.apply_pool.clone(), + self_disk_usage: DiskUsage::Normal, + snap_mgr: self.snap_mgr.clone(), + coprocessor_host: self.coprocessor_host.clone(), + global_stat: self.global_stat.clone(), + store_stat: self.global_stat.local(), + sst_importer: 
self.sst_importer.clone(), + key_manager: self.key_manager.clone(), + }; + poll_ctx.update_ticks_timeout(); + let cfg_tracker = self.cfg.clone().tracker("raftstore".to_string()); + StorePoller::new(poll_ctx, cfg_tracker) + } +} + +#[derive(Clone)] +pub struct Schedulers { + pub read: Scheduler>, + pub pd: Scheduler, + pub tablet: Scheduler>, + pub write: WriteSenders, + + // Following is not maintained by raftstore itself. + pub split_check: Scheduler, +} + +impl Schedulers { + fn stop(&self) { + self.read.stop(); + self.pd.stop(); + self.tablet.stop(); + self.split_check.stop(); + } +} + +/// A set of background threads that will processing offloaded work from +/// raftstore. +struct Workers { + /// Worker for fetching raft logs asynchronously + async_read: Worker, + pd: LazyWorker, + tablet: Worker, + async_write: StoreWriters, + purge: Option, + + // Following is not maintained by raftstore itself. + background: Worker, +} + +impl Workers { + fn new(background: Worker, pd: LazyWorker, purge: Option) -> Self { + Self { + async_read: Worker::new("async-read-worker"), + pd, + tablet: Worker::new("tablet-worker"), + async_write: StoreWriters::new(None), + purge, + background, + } + } + + fn stop(mut self) { + self.async_write.shutdown(); + self.async_read.stop(); + self.pd.stop(); + self.tablet.stop(); + if let Some(w) = self.purge { + w.stop(); + } + } +} + +/// The system used for polling Raft activities. 
+pub struct StoreSystem { + system: BatchSystem, StoreFsm>, + workers: Option>, + schedulers: Option>, + logger: Logger, + shutdown: Arc, +} + +impl StoreSystem { + pub fn start( + &mut self, + store_id: u64, + cfg: Arc>, + raft_engine: ER, + tablet_registry: TabletRegistry, + trans: T, + pd_client: Arc, + router: &StoreRouter, + store_meta: Arc>>, + snap_mgr: TabletSnapManager, + concurrency_manager: ConcurrencyManager, + causal_ts_provider: Option>, // used for rawkv apiv2 + coprocessor_host: CoprocessorHost, + auto_split_controller: AutoSplitController, + collector_reg_handle: CollectorRegHandle, + background: Worker, + pd_worker: LazyWorker, + sst_importer: Arc, + key_manager: Option>, + ) -> Result<()> + where + T: Transport + 'static, + C: PdClient + 'static, + { + let sync_router = Mutex::new(router.clone()); + pd_client.handle_reconnect(move || { + sync_router + .lock() + .unwrap() + .broadcast_normal(|| PeerMsg::Tick(PeerTick::PdHeartbeat)); + }); + + let purge_worker = if raft_engine.need_manual_purge() + && !cfg.value().raft_engine_purge_interval.0.is_zero() + { + let worker = Worker::new("purge-worker"); + let raft_clone = raft_engine.clone(); + let logger = self.logger.clone(); + let router = router.clone(); + worker.spawn_interval_task(cfg.value().raft_engine_purge_interval.0, move || { + let _guard = WithIoType::new(IoType::RewriteLog); + match raft_clone.manual_purge() { + Ok(regions) => { + for r in regions { + let _ = router.send(r, PeerMsg::ForceCompactLog); + } + } + Err(e) => { + warn!(logger, "purge expired files"; "err" => %e); + } + }; + }); + Some(worker) + } else { + None + }; + + let mut workers = Workers::new(background, pd_worker, purge_worker); + workers + .async_write + .spawn(store_id, raft_engine.clone(), None, router, &trans, &cfg)?; + + let mut read_runner = ReadRunner::new(router.clone(), raft_engine.clone()); + read_runner.set_snap_mgr(snap_mgr.clone()); + let read_scheduler = workers.async_read.start("async-read-worker", 
read_runner); + + workers.pd.start(pd::Runner::new( + store_id, + pd_client, + raft_engine.clone(), + tablet_registry.clone(), + snap_mgr.clone(), + router.clone(), + workers.pd.remote(), + concurrency_manager, + causal_ts_provider, + workers.pd.scheduler(), + auto_split_controller, + store_meta.lock().unwrap().region_read_progress.clone(), + collector_reg_handle, + self.logger.clone(), + self.shutdown.clone(), + cfg.clone(), + )?); + + let split_check_scheduler = workers.background.start( + "split-check", + SplitCheckRunner::with_registry( + tablet_registry.clone(), + router.clone(), + coprocessor_host.clone(), + ), + ); + + let tablet_gc_scheduler = workers.tablet.start_with_timer( + "tablet-worker", + tablet::Runner::new( + tablet_registry.clone(), + sst_importer.clone(), + self.logger.clone(), + ), + ); + + let schedulers = Schedulers { + read: read_scheduler, + pd: workers.pd.scheduler(), + tablet: tablet_gc_scheduler, + write: workers.async_write.senders(), + split_check: split_check_scheduler, + }; + + let builder = StorePollerBuilder::new( + cfg.clone(), + store_id, + raft_engine, + tablet_registry, + trans, + router.clone(), + schedulers.clone(), + self.logger.clone(), + store_meta.clone(), + self.shutdown.clone(), + snap_mgr, + coprocessor_host, + sst_importer, + key_manager, + ); + self.workers = Some(workers); + self.schedulers = Some(schedulers); + let peers = builder.init()?; + // Choose a different name so we know what version is actually used. rs stands + // for raft store. 
+ let tag = format!("rs-{}", store_id); + self.system.spawn(tag, builder); + + let mut mailboxes = Vec::with_capacity(peers.len()); + let mut address = Vec::with_capacity(peers.len()); + { + let mut meta = store_meta.as_ref().lock().unwrap(); + for (region_id, (tx, mut fsm)) in peers { + if let Some(tablet) = fsm.peer_mut().tablet() { + let read_tablet = SharedReadTablet::new(tablet.clone()); + meta.readers.insert( + region_id, + (fsm.peer().generate_read_delegate(), read_tablet), + ); + } + + address.push(region_id); + mailboxes.push(( + region_id, + BasicMailbox::new(tx, fsm, router.state_cnt().clone()), + )); + } + } + router.register_all(mailboxes); + + // Make sure Msg::Start is the first message each FSM received. + for addr in address { + router.force_send(addr, PeerMsg::Start).unwrap(); + } + router.send_control(StoreMsg::Start).unwrap(); + Ok(()) + } + + pub fn shutdown(&mut self) { + self.shutdown.store(true, Ordering::Relaxed); + + if self.workers.is_none() { + return; + } + let workers = self.workers.take().unwrap(); + + // TODO: gracefully shutdown future apply pool + + // Stop schedulers first, so all background future worker pool will be stopped + // gracefully. 
+ self.schedulers.take().unwrap().stop(); + self.system.shutdown(); + + workers.stop(); + } +} + +#[derive(Clone)] +pub struct StoreRouter { + router: BatchRouter, StoreFsm>, + logger: Logger, +} + +impl StoreRouter { + #[inline] + pub fn logger(&self) -> &Logger { + &self.logger + } + + #[inline] + pub fn check_send(&self, addr: u64, msg: PeerMsg) -> crate::Result<()> { + match self.router.send(addr, msg) { + Ok(()) => Ok(()), + Err(e) => Err(raftstore::router::handle_send_error(addr, e)), + } + } + + pub fn send_raft_message( + &self, + msg: Box, + ) -> std::result::Result<(), TrySendError>> { + let id = msg.get_region_id(); + let peer_msg = PeerMsg::RaftMessage(msg); + let store_msg = match self.router.try_send(id, peer_msg) { + Either::Left(Ok(())) => return Ok(()), + Either::Left(Err(TrySendError::Full(PeerMsg::RaftMessage(m)))) => { + return Err(TrySendError::Full(m)); + } + Either::Left(Err(TrySendError::Disconnected(PeerMsg::RaftMessage(m)))) => { + return Err(TrySendError::Disconnected(m)); + } + Either::Right(PeerMsg::RaftMessage(m)) => StoreMsg::RaftMessage(m), + _ => unreachable!(), + }; + match self.router.send_control(store_msg) { + Ok(()) => Ok(()), + Err(TrySendError::Full(StoreMsg::RaftMessage(m))) => Err(TrySendError::Full(m)), + Err(TrySendError::Disconnected(StoreMsg::RaftMessage(m))) => { + Err(TrySendError::Disconnected(m)) + } + _ => unreachable!(), + } + } +} + +impl Deref for StoreRouter { + type Target = BatchRouter, StoreFsm>; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.router + } +} + +impl DerefMut for StoreRouter { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.router + } +} + +/// Creates the batch system for polling raft activities. 
+pub fn create_store_batch_system( + cfg: &Config, + store_id: u64, + logger: Logger, +) -> (StoreRouter, StoreSystem) +where + EK: KvEngine, + ER: RaftEngine, +{ + let (store_tx, store_fsm) = StoreFsm::new(cfg, store_id, logger.clone()); + let (router, system) = + batch_system::create_system(&cfg.store_batch_system, store_tx, store_fsm, None); + let system = StoreSystem { + system, + workers: None, + schedulers: None, + logger: logger.clone(), + shutdown: Arc::new(AtomicBool::new(false)), + }; + (StoreRouter { router, logger }, system) +} diff --git a/components/raftstore-v2/src/bootstrap.rs b/components/raftstore-v2/src/bootstrap.rs new file mode 100644 index 00000000000..62bc9e4b8c5 --- /dev/null +++ b/components/raftstore-v2/src/bootstrap.rs @@ -0,0 +1,259 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{thread, time::Duration}; + +use engine_traits::{RaftEngine, RaftLogBatch}; +use error_code::ErrorCodeExt; +use fail::fail_point; +use kvproto::{ + metapb::{Region, Store}, + raft_serverpb::{RaftLocalState, StoreIdent}, +}; +use pd_client::PdClient; +use raft::INVALID_ID; +use raftstore::store::initial_region; +use slog::{debug, error, info, warn, Logger}; +use tikv_util::{box_err, box_try}; + +use crate::{operation::write_initial_states, Result}; + +const MAX_CHECK_CLUSTER_BOOTSTRAPPED_RETRY_COUNT: u64 = 60; +const CHECK_CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL: Duration = Duration::from_secs(3); + +/// A struct for bootstrapping the store. +/// +/// A typical bootstrap process should take the following steps: +/// +/// 1. Calls `bootstrap_store` to bootstrap the store. +/// 2. Calls `bootstrap_first_region` to bootstrap the first region using store +/// ID returned from last step. +/// +/// # Safety +/// +/// These steps are re-entrant, i.e. the caller can redo any steps whether or +/// not they fail or succeed. +pub struct Bootstrap<'a, ER: RaftEngine> { + engine: &'a ER, + cluster_id: u64, + // It's not performance critical. 
+ pd_client: &'a dyn PdClient, + logger: Logger, +} + +// Although all methods won't change internal state, but they still receive +// `&mut self` as it's not thread safe to bootstrap concurrently. +impl<'a, ER: RaftEngine> Bootstrap<'a, ER> { + pub fn new( + engine: &'a ER, + cluster_id: u64, + pd_client: &'a impl PdClient, + logger: Logger, + ) -> Self { + Self { + engine, + cluster_id, + pd_client, + logger, + } + } + + /// Gets and validates the store ID from engine if it's already + /// bootstrapped. + fn check_store_id_in_engine(&mut self) -> Result> { + let ident = match self.engine.get_store_ident()? { + Some(ident) => ident, + None => return Ok(None), + }; + if ident.get_cluster_id() != self.cluster_id { + return Err(box_err!( + "cluster ID mismatch, local {} != remote {}, \ + you are trying to connect to another cluster, \ + please reconnect to the correct PD", + ident.get_cluster_id(), + self.cluster_id + )); + } + if ident.get_store_id() == INVALID_ID { + return Err(box_err!("invalid store ident {:?}", ident)); + } + Ok(Some(ident.get_store_id())) + } + + /// Bootstraps the store and returns the store ID. + /// + /// The bootstrapping basically allocates a new store ID from PD and writes + /// it to engine with sync=true. + /// + /// If the store is already bootstrapped, return the store ID directly. + pub fn bootstrap_store(&mut self) -> Result { + if let Some(id) = self.check_store_id_in_engine()? { + return Ok(id); + } + if !self.engine.is_empty()? 
{ + return Err(box_err!("store is not empty and has already had data")); + } + let id = self.pd_client.alloc_id()?; + debug!(self.logger, "alloc store id"; "store_id" => id); + let mut ident = StoreIdent::default(); + ident.set_cluster_id(self.cluster_id); + ident.set_store_id(id); + let mut lb = self.engine.log_batch(1); + lb.put_store_ident(&ident)?; + self.engine.consume(&mut lb, true)?; + fail_point!("node_after_bootstrap_store", |_| Err(box_err!( + "injected error: node_after_bootstrap_store" + ))); + Ok(id) + } + + fn prepare_bootstrap_first_region(&mut self, store_id: u64) -> Result { + let region_id = self.pd_client.alloc_id()?; + debug!( + self.logger, + "alloc first region id"; + "region_id" => region_id, + "cluster_id" => self.cluster_id, + "store_id" => store_id + ); + let peer_id = self.pd_client.alloc_id()?; + debug!( + self.logger, + "alloc first peer id for first region"; + "peer_id" => peer_id, + "region_id" => region_id, + ); + + let region = initial_region(store_id, region_id, peer_id); + + let mut wb = self.engine.log_batch(10); + wb.put_prepare_bootstrap_region(®ion)?; + write_initial_states(&mut wb, region.clone())?; + box_try!(self.engine.consume(&mut wb, true)); + + Ok(region) + } + + fn check_pd_first_region_bootstrapped(&mut self) -> Result { + for _ in 0..MAX_CHECK_CLUSTER_BOOTSTRAPPED_RETRY_COUNT { + match self.pd_client.is_cluster_bootstrapped() { + Ok(b) => return Ok(b), + Err(e) => { + warn!(self.logger, "check cluster bootstrapped failed"; "err" => ?e); + } + } + thread::sleep(CHECK_CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL); + } + Err(box_err!("check cluster bootstrapped failed")) + } + + fn clear_prepare_bootstrap(&mut self, first_region_id: Option) -> Result<()> { + let mut wb = self.engine.log_batch(10); + wb.remove_prepare_bootstrap_region()?; + if let Some(id) = first_region_id { + box_try!( + self.engine + .clean(id, 0, &RaftLocalState::default(), &mut wb) + ); + } + box_try!(self.engine.consume(&mut wb, true)); + Ok(()) + } + + /// 
Bootstraps the first region of this cluster. + /// + /// The bootstrapping starts by allocating a region ID from PD. Then it + /// initializes the region's state and writes a preparing marker to the + /// engine. After attempting to register itself as the first region to PD, + /// the preparing marker is deleted from the engine. + /// + /// In the event that someone else bootstraps the first region + /// before us, the region state is cleared and `None` is returned. + pub fn bootstrap_first_region( + &mut self, + store: &Store, + store_id: u64, + ) -> Result> { + let first_region = match self.engine.get_prepare_bootstrap_region()? { + // The last bootstrap was aborted. We need to resume or clean it up. + Some(r) => r, + None => { + if self.check_pd_first_region_bootstrapped()? { + // If another node has bootstrapped the cluster, skip to avoid + // useless ID allocation and disk writes. + return Ok(None); + } + self.prepare_bootstrap_first_region(store_id)? + } + }; + + info!( + self.logger, + "trying to bootstrap first region"; + "store_id" => store_id, + "region" => ?first_region + ); + // cluster is not bootstrapped, and we choose first store to bootstrap + fail_point!("node_after_prepare_bootstrap_cluster", |_| Err(box_err!( + "injected error: node_after_prepare_bootstrap_cluster" + ))); + + let region_id = first_region.get_id(); + let mut retry = 0; + while retry < MAX_CHECK_CLUSTER_BOOTSTRAPPED_RETRY_COUNT { + match self + .pd_client + .bootstrap_cluster(store.clone(), first_region.clone()) + { + Ok(_) => { + info!( + self.logger, + "bootstrap cluster ok"; + "cluster_id" => self.cluster_id + ); + fail_point!("node_after_bootstrap_cluster", |_| Err(box_err!( + "injected error: node_after_bootstrap_cluster" + ))); + self.clear_prepare_bootstrap(None)?; + return Ok(Some(first_region)); + } + Err(pd_client::Error::ClusterBootstrapped(_)) => { + match self.pd_client.get_region(b"") { + Ok(region) => { + if region == first_region { + // It was bootstrapped by us before. 
+ self.clear_prepare_bootstrap(None)?; + return Ok(Some(first_region)); + } else { + info!( + self.logger, + "cluster is already bootstrapped"; + "cluster_id" => self.cluster_id + ); + self.clear_prepare_bootstrap(Some(region_id))?; + return Ok(None); + } + } + Err(e) => { + warn!(self.logger, "get the first region failed"; "err" => ?e); + } + } + } + Err(e) => { + error!( + self.logger, + "bootstrap cluster failed once"; + "cluster_id" => self.cluster_id, + "err" => ?e, + "err_code" => %e.error_code() + ); + } + } + retry += 1; + thread::sleep(CHECK_CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL); + } + Err(box_err!( + "bootstrapped cluster failed after {} attempts", + retry + )) + } +} diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs new file mode 100644 index 00000000000..08d7f7946ec --- /dev/null +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -0,0 +1,163 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; + +use batch_system::{Fsm, FsmScheduler, Mailbox}; +use crossbeam::channel::TryRecvError; +use engine_traits::{FlushState, KvEngine, TabletRegistry}; +use futures::{compat::Future01CompatExt, FutureExt, StreamExt}; +use kvproto::{metapb, raft_serverpb::RegionLocalState}; +use pd_client::BucketStat; +use raftstore::{ + coprocessor::CoprocessorHost, + store::{Config, ReadTask}, +}; +use slog::Logger; +use sst_importer::SstImporter; +use tikv_util::{ + mpsc::future::{self, Receiver, Sender, WakePolicy}, + timer::GLOBAL_TIMER_HANDLE, + worker::Scheduler, +}; + +use crate::{ + operation::{CatchUpLogs, DataTrace}, + raft::Apply, + router::{ApplyRes, ApplyTask, PeerMsg}, +}; + +/// A trait for reporting apply result. +/// +/// Using a trait to make signature simpler. 
+pub trait ApplyResReporter { + fn report(&self, apply_res: ApplyRes); + + fn redirect_catch_up_logs(&self, c: CatchUpLogs); +} + +impl, S: FsmScheduler> ApplyResReporter for Mailbox { + fn report(&self, apply_res: ApplyRes) { + // TODO: check shutdown. + let _ = self.force_send(PeerMsg::ApplyRes(apply_res)); + } + + fn redirect_catch_up_logs(&self, c: CatchUpLogs) { + let msg = PeerMsg::RedirectCatchUpLogs(c); + let _ = self.force_send(msg); + } +} + +/// Schedule task to `ApplyFsm`. +#[derive(Clone)] +pub struct ApplyScheduler { + sender: Sender, +} + +impl ApplyScheduler { + #[inline] + pub fn send(&self, task: ApplyTask) { + // TODO: ignore error when shutting down. + self.sender.send(task).unwrap(); + } +} + +pub struct ApplyFsm { + apply: Apply, + receiver: Receiver, +} + +impl ApplyFsm { + pub fn new( + cfg: &Config, + peer: metapb::Peer, + region_state: RegionLocalState, + res_reporter: R, + tablet_registry: TabletRegistry, + read_scheduler: Scheduler>, + flush_state: Arc, + log_recovery: Option>, + applied_term: u64, + buckets: Option, + sst_importer: Arc, + coprocessor_host: CoprocessorHost, + logger: Logger, + ) -> (ApplyScheduler, Self) { + let (tx, rx) = future::unbounded(WakePolicy::Immediately); + let apply = Apply::new( + cfg, + peer, + region_state, + res_reporter, + tablet_registry, + read_scheduler, + flush_state, + log_recovery, + applied_term, + buckets, + sst_importer, + coprocessor_host, + logger, + ); + ( + ApplyScheduler { sender: tx }, + Self { + apply, + receiver: rx, + }, + ) + } +} + +impl ApplyFsm { + pub async fn handle_all_tasks(&mut self) { + loop { + let timeout = GLOBAL_TIMER_HANDLE + .delay(Instant::now() + Duration::from_secs(10)) + .compat(); + let res = futures::select! 
{ + res = self.receiver.next().fuse() => res, + _ = timeout.fuse() => None, + }; + self.apply.on_start_apply(); + let mut task = match res { + Some(r) => r, + None => { + self.apply.release_memory(); + match self.receiver.next().await { + Some(t) => t, + None => return, + } + } + }; + loop { + match task { + // TODO: flush by buffer size. + ApplyTask::CommittedEntries(ce) => self.apply.apply_committed_entries(ce).await, + ApplyTask::Snapshot(snap_task) => self.apply.schedule_gen_snapshot(snap_task), + ApplyTask::UnsafeWrite(raw_write) => self.apply.apply_unsafe_write(raw_write), + ApplyTask::ManualFlush => self.apply.on_manual_flush().await, + ApplyTask::RefreshBucketStat(bucket_meta) => { + self.apply.on_refresh_buckets(bucket_meta) + } + ApplyTask::CaptureApply(capture_change) => { + self.apply.on_capture_apply(capture_change) + } + } + + self.apply.maybe_flush().await; + + // Perhaps spin sometime? + match self.receiver.try_recv() { + Ok(t) => task = t, + Err(TryRecvError::Empty) => break, + Err(TryRecvError::Disconnected) => return, + } + } + let written_bytes = self.apply.flush(); + self.apply.maybe_reschedule(written_bytes).await; + } + } +} diff --git a/components/raftstore-v2/src/fsm/mod.rs b/components/raftstore-v2/src/fsm/mod.rs new file mode 100644 index 00000000000..b3d0e0483ba --- /dev/null +++ b/components/raftstore-v2/src/fsm/mod.rs @@ -0,0 +1,14 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! FSM is short for finite state machine. There are three types of FSMs, +//! - StoreFsm, used for handling control messages and global initialization. +//! - PeerFsm, used for handling messages specific for one raft peer. +//! - ApplyFsm, used for handling apply task for one raft peer. 
+ +mod apply; +mod peer; +mod store; + +pub use apply::{ApplyFsm, ApplyResReporter, ApplyScheduler}; +pub use peer::{PeerFsm, PeerFsmDelegate, SenderFsmPair}; +pub use store::{Store, StoreFsm, StoreFsmDelegate, StoreMeta}; diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs new file mode 100644 index 00000000000..3af66c4f81c --- /dev/null +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -0,0 +1,389 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains the peer implementation for batch system. + +use std::borrow::Cow; + +use batch_system::{BasicMailbox, Fsm}; +use crossbeam::channel::TryRecvError; +use encryption_export::DataKeyManager; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; +use kvproto::{errorpb, raft_cmdpb::RaftCmdResponse}; +use raftstore::store::{Config, TabletSnapManager, Transport}; +use slog::{debug, info, trace, Logger}; +use tikv_util::{ + is_zero_duration, + mpsc::{self, LooseBoundedSender, Receiver}, + slog_panic, + time::{duration_to_sec, Instant}, +}; + +use crate::{ + batch::StoreContext, + raft::{Peer, Storage}, + router::{PeerMsg, PeerTick, QueryResult}, + Result, +}; + +pub type SenderFsmPair = (LooseBoundedSender, Box>); + +pub struct PeerFsm { + peer: Peer, + mailbox: Option>>, + receiver: Receiver, + /// A registry for all scheduled ticks. This can avoid scheduling ticks + /// twice accidentally. 
+ tick_registry: [bool; PeerTick::VARIANT_COUNT], + is_stopped: bool, +} + +impl PeerFsm { + pub fn new( + cfg: &Config, + tablet_registry: &TabletRegistry, + key_manager: Option<&DataKeyManager>, + snap_mgr: &TabletSnapManager, + storage: Storage, + ) -> Result> { + let peer = Peer::new(cfg, tablet_registry, key_manager, snap_mgr, storage)?; + info!(peer.logger, "create peer"; + "raft_state" => ?peer.storage().raft_state(), + "apply_state" => ?peer.storage().apply_state(), + "region_state" => ?peer.storage().region_state() + ); + let (tx, rx) = mpsc::loose_bounded(cfg.notify_capacity); + let fsm = Box::new(PeerFsm { + peer, + mailbox: None, + receiver: rx, + tick_registry: [false; PeerTick::VARIANT_COUNT], + is_stopped: false, + }); + Ok((tx, fsm)) + } + + #[inline] + pub fn peer(&self) -> &Peer { + &self.peer + } + + #[inline] + pub fn peer_mut(&mut self) -> &mut Peer { + &mut self.peer + } + + #[inline] + pub fn logger(&self) -> &Logger { + &self.peer.logger + } + + /// Fetches messages to `peer_msg_buf`. It will stop when the buffer + /// capacity is reached or there is no more pending messages. + /// + /// Returns how many messages are fetched. + pub fn recv(&mut self, peer_msg_buf: &mut Vec, batch_size: usize) -> usize { + let l = peer_msg_buf.len(); + for i in l..batch_size { + match self.receiver.try_recv() { + Ok(msg) => peer_msg_buf.push(msg), + Err(e) => { + if let TryRecvError::Disconnected = e { + self.is_stopped = true; + } + return i - l; + } + } + } + batch_size - l + } +} + +impl Fsm for PeerFsm { + type Message = PeerMsg; + + #[inline] + fn is_stopped(&self) -> bool { + self.is_stopped + } + + /// Set a mailbox to FSM, which should be used to send message to itself. + fn set_mailbox(&mut self, mailbox: Cow<'_, BasicMailbox>) + where + Self: Sized, + { + self.mailbox = Some(mailbox.into_owned()); + } + + /// Take the mailbox from FSM. Implementation should ensure there will be + /// no reference to mailbox after calling this method. 
+ fn take_mailbox(&mut self) -> Option> + where + Self: Sized, + { + self.mailbox.take() + } +} + +pub struct PeerFsmDelegate<'a, EK: KvEngine, ER: RaftEngine, T> { + pub fsm: &'a mut PeerFsm, + pub store_ctx: &'a mut StoreContext, +} + +impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { + pub fn new(fsm: &'a mut PeerFsm, store_ctx: &'a mut StoreContext) -> Self { + Self { fsm, store_ctx } + } + + #[inline] + fn schedule_pending_ticks(&mut self) { + let pending_ticks = self.fsm.peer.take_pending_ticks(); + for tick in pending_ticks { + self.schedule_tick(tick); + } + } + + pub fn schedule_tick(&mut self, tick: PeerTick) { + assert!(PeerTick::VARIANT_COUNT <= u16::BITS as usize); + let idx = tick as usize; + if self.fsm.tick_registry[idx] { + return; + } + if is_zero_duration(&self.store_ctx.tick_batch[idx].wait_duration) { + return; + } + trace!( + self.fsm.logger(), + "schedule tick"; + "tick" => ?tick, + "timeout" => ?self.store_ctx.tick_batch[idx].wait_duration, + ); + + let region_id = self.fsm.peer.region_id(); + let mb = match self.store_ctx.router.mailbox(region_id) { + Some(mb) => mb, + None => { + if !self.fsm.peer.serving() || self.store_ctx.router.is_shutdown() { + return; + } + slog_panic!(self.fsm.logger(), "failed to get mailbox"; "tick" => ?tick); + } + }; + self.fsm.tick_registry[idx] = true; + let logger = self.fsm.logger().clone(); + // TODO: perhaps following allocation can be removed. + let cb = Box::new(move || { + // This can happen only when the peer is about to be destroyed + // or the node is shutting down. So it's OK to not to clean up + // registry. 
+ if let Err(e) = mb.force_send(PeerMsg::Tick(tick)) { + debug!( + logger, + "failed to schedule peer tick"; + "tick" => ?tick, + "err" => %e, + ); + } + }); + self.store_ctx.tick_batch[idx].ticks.push(cb); + } + + fn on_start(&mut self) { + if !self.fsm.peer.maybe_pause_for_recovery(self.store_ctx) { + self.schedule_tick(PeerTick::Raft); + } + self.schedule_tick(PeerTick::SplitRegionCheck); + self.schedule_tick(PeerTick::PdHeartbeat); + self.schedule_tick(PeerTick::CompactLog); + if self.fsm.peer.storage().is_initialized() { + self.fsm.peer.schedule_apply_fsm(self.store_ctx); + } + self.fsm.peer.maybe_gen_approximate_buckets(self.store_ctx); + // Speed up setup if there is only one peer. + if self.fsm.peer.is_leader() { + self.fsm.peer.set_has_ready(); + } + } + + #[inline] + fn on_receive_command(&self, send_time: Instant) { + self.store_ctx + .raft_metrics + .propose_wait_time + .observe(duration_to_sec(send_time.saturating_elapsed())); + } + + fn on_tick(&mut self, tick: PeerTick) { + self.fsm.tick_registry[tick as usize] = false; + match tick { + PeerTick::Raft => self.on_raft_tick(), + PeerTick::PdHeartbeat => self.on_pd_heartbeat(), + PeerTick::CompactLog => self.on_compact_log_tick(false), + PeerTick::SplitRegionCheck => self.on_split_region_check(), + PeerTick::CheckMerge => self.fsm.peer_mut().on_check_merge(self.store_ctx), + PeerTick::CheckPeerStaleState => unimplemented!(), + PeerTick::EntryCacheEvict => self.on_entry_cache_evict(), + PeerTick::CheckLeaderLease => unimplemented!(), + PeerTick::ReactivateMemoryLock => { + self.fsm.peer.on_reactivate_memory_lock_tick(self.store_ctx) + } + PeerTick::ReportBuckets => self.on_report_region_buckets_tick(), + PeerTick::CheckLongUncommitted => self.on_check_long_uncommitted(), + PeerTick::GcPeer => self.fsm.peer_mut().on_gc_peer_tick(self.store_ctx), + } + } + + pub fn on_msgs(&mut self, peer_msgs_buf: &mut Vec) { + for msg in peer_msgs_buf.drain(..) 
{ + match msg { + PeerMsg::RaftMessage(msg) => { + self.fsm.peer.on_raft_message(self.store_ctx, msg); + } + PeerMsg::RaftQuery(cmd) => { + self.on_receive_command(cmd.send_time); + self.on_query(cmd.request, cmd.ch) + } + PeerMsg::AdminCommand(cmd) => { + self.on_receive_command(cmd.send_time); + self.fsm + .peer_mut() + .on_admin_command(self.store_ctx, cmd.request, cmd.ch) + } + PeerMsg::SimpleWrite(write) => { + self.on_receive_command(write.send_time); + self.fsm.peer_mut().on_simple_write( + self.store_ctx, + write.header, + write.data, + write.ch, + ); + } + PeerMsg::UnsafeWrite(write) => { + self.on_receive_command(write.send_time); + self.fsm + .peer_mut() + .on_unsafe_write(self.store_ctx, write.data); + } + PeerMsg::Tick(tick) => self.on_tick(tick), + PeerMsg::ApplyRes(res) => self.fsm.peer.on_apply_res(self.store_ctx, res), + PeerMsg::SplitInit(msg) => self.fsm.peer.on_split_init(self.store_ctx, msg), + PeerMsg::SplitInitFinish(region_id) => { + self.fsm.peer.on_split_init_finish(region_id) + } + PeerMsg::Start => self.on_start(), + PeerMsg::Noop => unimplemented!(), + PeerMsg::Persisted { + peer_id, + ready_number, + } => self + .fsm + .peer_mut() + .on_persisted(self.store_ctx, peer_id, ready_number), + PeerMsg::LogsFetched(fetched_logs) => { + self.fsm.peer_mut().on_logs_fetched(fetched_logs) + } + PeerMsg::SnapshotGenerated(snap_res) => { + self.fsm.peer_mut().on_snapshot_generated(snap_res) + } + PeerMsg::QueryDebugInfo(ch) => self.fsm.peer_mut().on_query_debug_info(ch), + PeerMsg::DataFlushed { + cf, + tablet_index, + flushed_index, + } => { + self.fsm + .peer_mut() + .on_data_flushed(cf, tablet_index, flushed_index); + } + PeerMsg::PeerUnreachable { to_peer_id } => { + self.fsm.peer_mut().on_peer_unreachable(to_peer_id) + } + PeerMsg::StoreUnreachable { to_store_id } => { + self.fsm.peer_mut().on_store_unreachable(to_store_id) + } + PeerMsg::SnapshotSent { to_peer_id, status } => { + self.fsm.peer_mut().on_snapshot_sent(to_peer_id, status) + } + 
PeerMsg::RequestSplit { request, ch } => { + self.fsm + .peer_mut() + .on_request_split(self.store_ctx, request, ch) + } + PeerMsg::RefreshRegionBuckets { + region_epoch, + buckets, + bucket_ranges, + } => self.on_refresh_region_buckets(region_epoch, buckets, bucket_ranges), + PeerMsg::RequestHalfSplit { request, ch } => self + .fsm + .peer_mut() + .on_request_half_split(self.store_ctx, request, ch), + PeerMsg::UpdateRegionSize { size } => { + self.fsm.peer_mut().on_update_region_size(size) + } + PeerMsg::UpdateRegionKeys { keys } => { + self.fsm.peer_mut().on_update_region_keys(keys) + } + PeerMsg::ClearRegionSize => self.fsm.peer_mut().on_clear_region_size(), + PeerMsg::ForceCompactLog => self.on_compact_log_tick(true), + PeerMsg::TabletTrimmed { tablet_index } => { + self.fsm.peer_mut().on_tablet_trimmed(tablet_index) + } + PeerMsg::CleanupImportSst(ssts) => self + .fsm + .peer_mut() + .on_cleanup_import_sst(self.store_ctx, ssts), + PeerMsg::AskCommitMerge(req) => { + self.fsm.peer_mut().on_ask_commit_merge(self.store_ctx, req) + } + PeerMsg::AckCommitMerge { index, target_id } => { + self.fsm.peer_mut().on_ack_commit_merge(index, target_id) + } + PeerMsg::RejectCommitMerge { index } => { + self.fsm.peer_mut().on_reject_commit_merge(index) + } + PeerMsg::RedirectCatchUpLogs(c) => self + .fsm + .peer_mut() + .on_redirect_catch_up_logs(self.store_ctx, c), + PeerMsg::CatchUpLogs(c) => self.fsm.peer_mut().on_catch_up_logs(self.store_ctx, c), + PeerMsg::CaptureChange(capture_change) => self.on_capture_change(capture_change), + PeerMsg::LeaderCallback(ch) => self.on_leader_callback(ch), + #[cfg(feature = "testexport")] + PeerMsg::WaitFlush(ch) => self.fsm.peer_mut().on_wait_flush(ch), + } + } + // TODO: instead of propose pending commands immediately, we should use timeout. 
+ self.fsm.peer.propose_pending_writes(self.store_ctx); + self.schedule_pending_ticks(); + } +} + +impl Drop for PeerFsm { + fn drop(&mut self) { + self.peer_mut().pending_reads_mut().clear_all(None); + + let region_id = self.peer().region_id(); + + let build_resp = || { + let mut err = errorpb::Error::default(); + err.set_message("region is not found".to_owned()); + err.mut_region_not_found().set_region_id(region_id); + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(err); + resp + }; + while let Ok(msg) = self.receiver.try_recv() { + match msg { + // Only these messages need to be responded explicitly as they rely on + // deterministic response. + PeerMsg::RaftQuery(query) => { + query.ch.set_result(QueryResult::Response(build_resp())); + } + PeerMsg::SimpleWrite(w) => { + w.ch.set_result(build_resp()); + } + _ => continue, + } + } + } +} diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs new file mode 100644 index 00000000000..e9b224b7375 --- /dev/null +++ b/components/raftstore-v2/src/fsm/store.rs @@ -0,0 +1,294 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + collections::BTreeMap, + ops::Bound::{Excluded, Unbounded}, + time::{Duration, SystemTime}, +}; + +use batch_system::Fsm; +use collections::HashMap; +use engine_traits::{KvEngine, RaftEngine}; +use futures::{compat::Future01CompatExt, FutureExt}; +use keys::{data_end_key, data_key}; +use kvproto::metapb::Region; +use raftstore::store::{ + fsm::store::StoreRegionMeta, Config, ReadDelegate, RegionReadProgressRegistry, Transport, +}; +use slog::{info, o, Logger}; +use tikv_util::{ + future::poll_future_notify, + is_zero_duration, + log::SlogFormat, + mpsc::{self, LooseBoundedSender, Receiver}, + slog_panic, +}; + +use crate::{ + batch::StoreContext, + operation::ReadDelegatePair, + router::{StoreMsg, StoreTick}, +}; + +pub struct StoreMeta { + pub store_id: u64, + /// region_id -> reader + pub readers: HashMap>, + /// region_id -> `RegionReadProgress` + pub region_read_progress: RegionReadProgressRegistry, + /// (region_end_key, epoch.version) -> region_id + /// + /// Unlike v1, ranges in v2 may overlap. So we use version + /// to avoid end key conflict. + pub(crate) region_ranges: BTreeMap<(Vec, u64), u64>, + /// region_id -> (region, initialized) + pub(crate) regions: HashMap, +} + +impl StoreMeta { + pub fn new(store_id: u64) -> Self { + Self { + store_id, + readers: HashMap::default(), + region_read_progress: RegionReadProgressRegistry::default(), + region_ranges: BTreeMap::default(), + regions: HashMap::default(), + } + } + + pub fn set_region(&mut self, region: &Region, initialized: bool, logger: &Logger) { + let region_id = region.get_id(); + let version = region.get_region_epoch().get_version(); + let prev = self + .regions + .insert(region_id, (region.clone(), initialized)); + // `prev` only makes sense when it's initialized. 
+ if let Some((prev, prev_init)) = prev && prev_init { + assert!(initialized, "{} region corrupted", SlogFormat(logger)); + if prev.get_region_epoch().get_version() != version { + let prev_id = self.region_ranges.remove(&(data_end_key(prev.get_end_key()), prev.get_region_epoch().get_version())); + assert_eq!(prev_id, Some(region_id), "{} region corrupted", SlogFormat(logger)); + } else { + assert!(self.region_ranges.get(&(data_end_key(prev.get_end_key()), version)).is_some(), "{} region corrupted", SlogFormat(logger)); + return; + } + } + if initialized { + assert!( + self.region_ranges + .insert((data_end_key(region.get_end_key()), version), region_id) + .is_none(), + "{} region corrupted", + SlogFormat(logger) + ); + } + } + + pub fn remove_region(&mut self, region_id: u64) { + let prev = self.regions.remove(&region_id); + if let Some((prev, initialized)) = prev { + if initialized { + let key = ( + data_end_key(prev.get_end_key()), + prev.get_region_epoch().get_version(), + ); + let prev_id = self.region_ranges.remove(&key); + assert_eq!(prev_id, Some(prev.get_id())); + } + } + } +} + +impl StoreRegionMeta for StoreMeta { + #[inline] + fn store_id(&self) -> u64 { + self.store_id + } + + #[inline] + fn region_read_progress(&self) -> &RegionReadProgressRegistry { + &self.region_read_progress + } + + #[inline] + fn search_region( + &self, + start_key: &[u8], + end_key: &[u8], + mut visitor: impl FnMut(&kvproto::metapb::Region), + ) { + let start_key = data_key(start_key); + for (_, id) in self + .region_ranges + .range((Excluded((start_key, 0)), Unbounded::<(Vec, u64)>)) + { + let (region, initialized) = &self.regions[id]; + if !initialized { + continue; + } + if end_key.is_empty() || end_key > region.get_start_key() { + visitor(region); + } else { + break; + } + } + } + + #[inline] + fn reader(&self, region_id: u64) -> Option<&ReadDelegate> { + self.readers.get(&region_id).map(|e| &e.0) + } +} + +pub struct Store { + id: u64, + // Unix time when it's started. 
+ start_time: Option, + logger: Logger, +} + +impl Store { + pub fn new(id: u64, logger: Logger) -> Store { + Store { + id, + start_time: None, + logger: logger.new(o!("store_id" => id)), + } + } + + pub fn store_id(&self) -> u64 { + self.id + } + + pub fn start_time(&self) -> Option { + self.start_time + } + + pub fn logger(&self) -> &Logger { + &self.logger + } +} + +pub struct StoreFsm { + pub store: Store, + receiver: Receiver, +} + +impl StoreFsm { + pub fn new( + cfg: &Config, + store_id: u64, + logger: Logger, + ) -> (LooseBoundedSender, Box) { + let (tx, rx) = mpsc::loose_bounded(cfg.notify_capacity); + let fsm = Box::new(StoreFsm { + store: Store::new(store_id, logger), + receiver: rx, + }); + (tx, fsm) + } + + /// Fetches messages to `store_msg_buf`. It will stop when the buffer + /// capacity is reached or there is no more pending messages. + /// + /// Returns how many messages are fetched. + pub fn recv(&self, store_msg_buf: &mut Vec, batch_size: usize) -> usize { + let l = store_msg_buf.len(); + for i in l..batch_size { + match self.receiver.try_recv() { + Ok(msg) => store_msg_buf.push(msg), + Err(_) => return i - l, + } + } + batch_size - l + } +} + +impl Fsm for StoreFsm { + type Message = StoreMsg; + + #[inline] + fn is_stopped(&self) -> bool { + false + } +} + +pub struct StoreFsmDelegate<'a, EK: KvEngine, ER: RaftEngine, T> { + pub fsm: &'a mut StoreFsm, + pub store_ctx: &'a mut StoreContext, +} + +impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { + pub fn new(fsm: &'a mut StoreFsm, store_ctx: &'a mut StoreContext) -> Self { + Self { fsm, store_ctx } + } + + fn on_start(&mut self) { + if self.fsm.store.start_time.is_some() { + slog_panic!(self.fsm.store.logger, "store is already started"); + } + + self.fsm.store.start_time = Some( + SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .map_or(0, |d| d.as_secs()), + ); + + self.on_pd_store_heartbeat(); + self.schedule_tick( + StoreTick::CleanupImportSst, + 
self.store_ctx.cfg.cleanup_import_sst_interval.0, + ); + } + + pub fn schedule_tick(&mut self, tick: StoreTick, timeout: Duration) { + if !is_zero_duration(&timeout) { + let mb = self.store_ctx.router.control_mailbox(); + let logger = self.fsm.store.logger().clone(); + let delay = self.store_ctx.timer.delay(timeout).compat().map(move |_| { + if let Err(e) = mb.force_send(StoreMsg::Tick(tick)) { + info!( + logger, + "failed to schedule store tick, are we shutting down?"; + "tick" => ?tick, + "err" => ?e + ); + } + }); + poll_future_notify(delay); + } + } + + fn on_tick(&mut self, tick: StoreTick) { + match tick { + StoreTick::PdStoreHeartbeat => self.on_pd_store_heartbeat(), + StoreTick::CleanupImportSst => self.on_cleanup_import_sst(), + _ => unimplemented!(), + } + } + + pub fn handle_msgs(&mut self, store_msg_buf: &mut Vec) + where + T: Transport, + { + for msg in store_msg_buf.drain(..) { + match msg { + StoreMsg::Start => self.on_start(), + StoreMsg::Tick(tick) => self.on_tick(tick), + StoreMsg::RaftMessage(msg) => self.fsm.store.on_raft_message(self.store_ctx, msg), + StoreMsg::SplitInit(msg) => self.fsm.store.on_split_init(self.store_ctx, msg), + StoreMsg::StoreUnreachable { to_store_id } => self + .fsm + .store + .on_store_unreachable(self.store_ctx, to_store_id), + StoreMsg::AskCommitMerge(req) => { + self.fsm.store.on_ask_commit_merge(self.store_ctx, req) + } + #[cfg(feature = "testexport")] + StoreMsg::WaitFlush { region_id, ch } => { + self.fsm.store.on_wait_flush(self.store_ctx, region_id, ch) + } + } + } + } +} diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs new file mode 100644 index 00000000000..bcfaf383024 --- /dev/null +++ b/components/raftstore-v2/src/lib.rs @@ -0,0 +1,48 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! Raftstore is the place where we implement multi-raft. +//! +//! The thread module of raftstore is batch-system, more check +//! components/batch-system. 
All state machines are defined in [`fsm`] module. +//! Everything that wrapping raft is implemented in [`raft`] module. And the +//! commands, including split/merge/confchange/read/write, are implemented in +//! [`operation`] module. All state machines are expected to communicate with +//! messages. They are defined in [`router`] module. + +// You may get confused about the peer, or other structs like apply, in fsm and +// peer in raft module. The guideline is that if any field doesn't depend on +// the details of batch system, then it should be defined for peer in raft +// module. +// +// If we change to other concurrent programming solution, we can easily just +// change the peer in fsm. +// +// Any accessors should be defined in the file where the struct is defined. +// Functionalities like read, write, etc should be implemented in [`operation`] +// using a standalone modules. + +#![feature(let_chains)] +#![feature(array_windows)] +#![feature(div_duration)] +#![feature(box_into_inner)] +#![feature(assert_matches)] +#![feature(option_get_or_insert_default)] + +mod batch; +mod bootstrap; +mod fsm; +mod operation; +mod raft; +pub mod router; +mod worker; + +pub(crate) use batch::StoreContext; +pub use batch::{create_store_batch_system, StoreRouter, StoreSystem}; +pub use bootstrap::Bootstrap; +pub use fsm::StoreMeta; +pub use operation::{write_initial_states, SimpleWriteBinary, SimpleWriteEncoder, StateStorage}; +pub use raftstore::{store::Config, Error, Result}; +pub use worker::{ + pd::{PdReporter, Task as PdTask}, + tablet::Task as TabletTask, +}; diff --git a/components/raftstore-v2/src/operation/bucket.rs b/components/raftstore-v2/src/operation/bucket.rs new file mode 100644 index 00000000000..317ed89ef8d --- /dev/null +++ b/components/raftstore-v2/src/operation/bucket.rs @@ -0,0 +1,358 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module implements the interactions with bucket. 
+ +use std::sync::Arc; + +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::metapb::{self, RegionEpoch}; +use pd_client::{BucketMeta, BucketStat}; +use raftstore::{ + coprocessor::RegionChangeEvent, + store::{util, Bucket, BucketRange, ReadProgress, SplitCheckTask, Transport}, +}; +use slog::{error, warn}; + +use crate::{ + batch::StoreContext, + fsm::PeerFsmDelegate, + raft::Peer, + router::{ApplyTask, PeerTick}, + worker::pd, +}; + +#[derive(Debug, Clone, Default)] +pub struct BucketStatsInfo { + bucket_stat: Option, + // the last bucket stat holds the stats that were replaced by the most recent refresh. + last_bucket_stat: Option, + // the report bucket stat records the incremental stats since the last report to PD. + // it is reset after each report to PD. + report_bucket_stat: Option, +} + +impl BucketStatsInfo { + /// Returns all bucket ranges whose write_bytes exceed the given + /// diff_size_threshold. + pub fn gen_bucket_range_for_update( + &self, + diff_size_threshold: u64, + ) -> Option> { + let region_buckets = self.bucket_stat.as_ref()?; + let stats = &region_buckets.stats; + let keys = &region_buckets.meta.keys; + + let empty_last_keys = vec![]; + let empty_last_stats = metapb::BucketStats::default(); + let (last_keys, last_stats, stats_reset) = self + .last_bucket_stat + .as_ref() + .map(|b| { + ( + &b.meta.keys, + &b.stats, + region_buckets.create_time != b.create_time, + ) + }) + .unwrap_or((&empty_last_keys, &empty_last_stats, false)); + + let mut bucket_ranges = vec![]; + let mut j = 0; + assert_eq!(keys.len(), stats.write_bytes.len() + 1); + for i in 0..stats.write_bytes.len() { + let mut diff_in_bytes = stats.write_bytes[i]; + while j < last_keys.len() && keys[i] > last_keys[j] { + j += 1; + } + if j < last_keys.len() && keys[i] == last_keys[j] { + if !stats_reset { + diff_in_bytes -= last_stats.write_bytes[j]; + } + j += 1; + } + if diff_in_bytes >= diff_size_threshold { + bucket_ranges.push(BucketRange(keys[i].clone(), keys[i + 1].clone())); + } + } + Some(bucket_ranges) 
+ } + + #[inline] + pub fn version(&self) -> u64 { + self.bucket_stat + .as_ref() + .or(self.last_bucket_stat.as_ref()) + .map(|b| b.meta.version) + .unwrap_or_default() + } + #[inline] + pub fn add_bucket_flow(&mut self, delta: &Option) { + if let (Some(buckets), Some(report_buckets), Some(delta)) = ( + self.bucket_stat.as_mut(), + self.report_bucket_stat.as_mut(), + delta, + ) { + buckets.merge(delta); + report_buckets.merge(delta); + } + } + + #[inline] + pub fn set_bucket_stat(&mut self, buckets: Option) { + if let Some(b) = self.bucket_stat.take() { + self.last_bucket_stat = Some(b); + } + self.report_bucket_stat = buckets.clone(); + self.bucket_stat = buckets; + } + + #[inline] + pub fn clear_bucket_stat(&mut self) { + if let Some(bucket) = self.report_bucket_stat.as_mut() { + bucket.clear_stats(); + } + } + + #[inline] + pub fn report_bucket_stat(&mut self) -> BucketStat { + let current = self.report_bucket_stat.as_mut().unwrap(); + let delta = current.clone(); + current.clear_stats(); + delta + } + + #[inline] + pub fn bucket_stat(&self) -> &Option { + &self.bucket_stat + } +} + +impl Peer { + #[inline] + pub fn on_refresh_region_buckets( + &mut self, + store_ctx: &mut StoreContext, + region_epoch: RegionEpoch, + mut buckets: Vec, + bucket_ranges: Option>, + ) { + // bucket version layout + // term logical counter + // |-----------|-----------| + // high bits low bits + // term: given 10s election timeout, the 32 bit means 1362 year running time + let gen_bucket_version = |term, current_version| { + let current_version_term = current_version >> 32; + let bucket_version: u64 = if current_version_term == term { + current_version + 1 + } else { + if term > u32::MAX.into() { + error!( + self.logger, + "unexpected term {} more than u32::MAX. 
Bucket + version will be backward.", + term + ); + } + term << 32 + }; + bucket_version + }; + + let region = self.region(); + let current_version = self.region_buckets_info().version(); + let mut region_buckets: BucketStat; + // The region buckets reset after this region happened split or merge. + // The message should be dropped if it's epoch is lower than the regions. + // The bucket ranges is none when the region buckets is also none. + // So this condition indicates that the region buckets needs to refresh not + // renew. + if let (Some(bucket_ranges), Some(peer_region_buckets)) = + (bucket_ranges, self.region_buckets_info().bucket_stat()) + { + assert_eq!(buckets.len(), bucket_ranges.len()); + let mut meta_idx = 0; + region_buckets = peer_region_buckets.clone(); + let mut meta = (*region_buckets.meta).clone(); + if !buckets.is_empty() { + meta.version = gen_bucket_version(self.term(), current_version); + } + meta.region_epoch = region_epoch; + for (bucket, bucket_range) in buckets.into_iter().zip(bucket_ranges) { + // the bucket ranges maybe need to split or merge not all the meta keys, so it + // needs to find the first keys. 
+ while meta_idx < meta.keys.len() && meta.keys[meta_idx] != bucket_range.0 { + meta_idx += 1; + } + // meta_idx can't be not the last entry (which is end key) + if meta_idx >= meta.keys.len() - 1 { + warn!( + self.logger, + "can't find the bucket key"; + "bucket_range_key" => log_wrappers::Value::key(&bucket_range.0)); + break; + } + // the bucket size is small and does not have split keys, + // then it should be merged with its left neighbor + let region_bucket_merge_size = store_ctx + .coprocessor_host + .cfg + .region_bucket_merge_size_ratio + * (store_ctx.coprocessor_host.cfg.region_bucket_size.0 as f64); + if bucket.keys.is_empty() && bucket.size <= (region_bucket_merge_size as u64) { + meta.sizes[meta_idx] = bucket.size; + // the region has more than one bucket + // and the left neighbor + current bucket size is not very big + if meta.keys.len() > 2 + && meta_idx != 0 + && meta.sizes[meta_idx - 1] + bucket.size + < store_ctx.coprocessor_host.cfg.region_bucket_size.0 * 2 + { + // bucket is too small + region_buckets.left_merge(meta_idx); + meta.left_merge(meta_idx); + continue; + } + } else { + // update size + meta.sizes[meta_idx] = bucket.size / (bucket.keys.len() + 1) as u64; + // insert new bucket keys (split the original bucket) + for bucket_key in bucket.keys { + meta_idx += 1; + region_buckets.split(meta_idx); + meta.split(meta_idx, bucket_key); + } + } + meta_idx += 1; + } + region_buckets.meta = Arc::new(meta); + } else { + // when the region buckets is none, the exclusive buckets includes all the + // bucket keys. + assert_eq!(buckets.len(), 1); + let bucket_keys = buckets.pop().unwrap().keys; + let bucket_count = bucket_keys.len() + 1; + let mut meta = BucketMeta { + region_id: self.region_id(), + region_epoch, + version: gen_bucket_version(self.term(), current_version), + keys: bucket_keys, + sizes: vec![store_ctx.coprocessor_host.cfg.region_bucket_size.0; bucket_count], + }; + // padding the boundary keys and initialize the flow. 
+ meta.keys.insert(0, region.get_start_key().to_vec()); + meta.keys.push(region.get_end_key().to_vec()); + region_buckets = BucketStat::from_meta(Arc::new(meta)); + } + + let buckets_count = region_buckets.meta.keys.len() - 1; + store_ctx.coprocessor_host.on_region_changed( + region, + RegionChangeEvent::UpdateBuckets(buckets_count), + self.state_role(), + ); + let meta = region_buckets.meta.clone(); + self.region_buckets_info_mut() + .set_bucket_stat(Some(region_buckets.clone())); + { + let mut store_meta = store_ctx.store_meta.lock().unwrap(); + if let Some(reader) = store_meta.readers.get_mut(&self.region_id()) { + reader.0.update(ReadProgress::region_buckets(meta)); + } + } + // it's possible that apply_scheduler is not initialized yet + if let Some(apply_scheduler) = self.apply_scheduler() { + apply_scheduler.send(ApplyTask::RefreshBucketStat(region_buckets.meta.clone())); + } + } + + #[inline] + pub fn report_region_buckets_pd(&mut self, ctx: &StoreContext) { + let delta = self.region_buckets_info_mut().report_bucket_stat(); + let task = pd::Task::ReportBuckets(delta); + if let Err(e) = ctx.schedulers.pd.schedule(task) { + error!( + self.logger, + "failed to report buckets to pd"; + "err" => ?e, + ); + } + } + + pub fn maybe_gen_approximate_buckets(&self, ctx: &StoreContext) { + if ctx.coprocessor_host.cfg.enable_region_bucket() && self.storage().is_initialized() { + if let Err(e) = ctx + .schedulers + .split_check + .schedule(SplitCheckTask::ApproximateBuckets(self.region().clone())) + { + error!( + self.logger, + "failed to schedule check approximate buckets"; + "err" => %e, + ); + } + } + } + + // generate bucket range list to run split-check (to further split buckets) + // It will return the suspected bucket ranges whose write bytes exceed the + // threshold. 
+ pub fn gen_bucket_range_for_update( + &self, + ctx: &StoreContext, + ) -> Option> { + if !ctx.coprocessor_host.cfg.enable_region_bucket() { + return None; + } + let bucket_update_diff_size_threshold = ctx.coprocessor_host.cfg.region_bucket_size.0 / 2; + self.region_buckets_info() + .gen_bucket_range_for_update(bucket_update_diff_size_threshold) + } +} + +impl<'a, EK, ER, T: Transport> PeerFsmDelegate<'a, EK, ER, T> +where + EK: KvEngine, + ER: RaftEngine, +{ + #[inline] + pub fn on_report_region_buckets_tick(&mut self) { + if !self.fsm.peer().is_leader() + || self + .fsm + .peer() + .region_buckets_info() + .bucket_stat() + .is_none() + { + return; + } + self.fsm.peer_mut().report_region_buckets_pd(self.store_ctx); + self.schedule_tick(PeerTick::ReportBuckets); + } + + pub fn on_refresh_region_buckets( + &mut self, + region_epoch: RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + ) { + if util::is_epoch_stale(®ion_epoch, self.fsm.peer().region().get_region_epoch()) { + error!( + self.fsm.peer().logger, + "receive a stale refresh region bucket message"; + "epoch" => ?region_epoch, + "current_epoch" => ?self.fsm.peer().region().get_region_epoch(), + ); + return; + } + self.fsm.peer_mut().on_refresh_region_buckets( + self.store_ctx, + region_epoch, + buckets, + bucket_ranges, + ); + self.schedule_tick(PeerTick::ReportBuckets); + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs new file mode 100644 index 00000000000..383b54aa3b4 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -0,0 +1,547 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains processing logic of the following: +//! +//! # `CompactLog` and `EntryCacheEvict` ticks +//! +//! On region leader, periodically compacts useless Raft logs from the +//! 
underlying log engine, and evicts logs from entry cache if it reaches memory +//! limit. +//! +//! # `CompactLog` command +//! +//! Updates truncated index, and compacts logs if the corresponding changes have +//! been persisted in kvdb. + +use std::path::PathBuf; + +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; +use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest}; +use protobuf::Message; +use raftstore::{ + store::{ + fsm::new_admin_request, metrics::REGION_MAX_LOG_LAG, needs_evict_entry_cache, Transport, + WriteTask, RAFT_INIT_LOG_INDEX, + }, + Result, +}; +use slog::{debug, error, info}; +use tikv_util::{box_err, log::SlogFormat}; + +use crate::{ + batch::StoreContext, + fsm::{ApplyResReporter, PeerFsmDelegate}, + operation::AdminCmdResult, + raft::{Apply, Peer}, + router::{CmdResChannel, PeerTick}, + worker::tablet, +}; + +#[derive(Debug)] +pub struct CompactLogContext { + skipped_ticks: usize, + approximate_log_size: u64, + last_applying_index: u64, + /// Tombstone tablets can only be destroyed when the tablet that replaces it + /// is persisted. This is a list of tablet index that awaits to be + /// persisted. When persisted_apply is advanced, we need to notify tablet + /// worker to destroy them. 
+ tombstone_tablets_wait_index: Vec, +} + +impl CompactLogContext { + pub fn new(last_applying_index: u64) -> CompactLogContext { + CompactLogContext { + skipped_ticks: 0, + approximate_log_size: 0, + last_applying_index, + tombstone_tablets_wait_index: vec![], + } + } + + #[inline] + pub fn maybe_skip_compact_log(&mut self, max_skip_ticks: usize) -> bool { + if self.skipped_ticks < max_skip_ticks { + self.skipped_ticks += 1; + true + } else { + false + } + } + + pub fn add_log_size(&mut self, size: u64) { + self.approximate_log_size += size; + } + + pub fn set_last_applying_index(&mut self, index: u64) { + self.last_applying_index = index; + } + + #[inline] + pub fn last_applying_index(&self) -> u64 { + self.last_applying_index + } +} + +impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { + pub fn on_compact_log_tick(&mut self, force: bool) { + if !self.fsm.peer().is_leader() { + // `compact_cache_to` is called when apply, there is no need to call + // `compact_to` here, snapshot generating has already been cancelled + // when the role becomes follower. + return; + } + self.schedule_tick(PeerTick::CompactLog); + + self.fsm + .peer_mut() + .maybe_propose_compact_log(self.store_ctx, force); + + self.on_entry_cache_evict(); + } + + pub fn on_entry_cache_evict(&mut self) { + if needs_evict_entry_cache(self.store_ctx.cfg.evict_cache_on_memory_ratio) { + self.fsm + .peer_mut() + .entry_storage_mut() + .evict_entry_cache(true); + if !self.fsm.peer().entry_storage().is_entry_cache_empty() { + self.schedule_tick(PeerTick::EntryCacheEvict); + } + } + } +} + +impl Peer { + // Mirrors v1::on_raft_gc_log_tick. + fn maybe_propose_compact_log( + &mut self, + store_ctx: &mut StoreContext, + force: bool, + ) { + // As leader, we would not keep caches for the peers that didn't response + // heartbeat in the last few seconds. That happens probably because + // another TiKV is down. 
In this case if we do not clean up the cache, + // it may keep growing. + let drop_cache_duration = + store_ctx.cfg.raft_heartbeat_interval() + store_ctx.cfg.raft_entry_cache_life_time.0; + let cache_alive_limit = std::time::Instant::now() - drop_cache_duration; + + // Leader will replicate the compact log command to followers, + // If we use current replicated_index (like 10) as the compact index, + // when we replicate this log, the newest replicated_index will be 11, + // but we only compact the log to 10, not 11, at that time, + // the first index is 10, and replicated_index is 11, with an extra log, + // and we will do compact again with compact index 11, in cycles... + // So we introduce a threshold, if replicated index - first index > threshold, + // we will try to compact log. + // raft log entries[..............................................] + // ^ ^ + // |-----------------threshold------------ | + // first_index replicated_index + // `alive_cache_idx` is the smallest `replicated_index` of healthy up nodes. + // `alive_cache_idx` is only used to gc cache. + let applied_idx = self.entry_storage().applied_index(); + let truncated_idx = self.entry_storage().truncated_index(); + let first_idx = self.entry_storage().first_index(); + let last_idx = self.entry_storage().last_index(); + + let (mut replicated_idx, mut alive_cache_idx) = (last_idx, last_idx); + for (peer_id, p) in self.raft_group().raft.prs().iter() { + if replicated_idx > p.matched { + replicated_idx = p.matched; + } + if self.peer_heartbeat_is_fresh(*peer_id, &cache_alive_limit) { + if alive_cache_idx > p.matched && p.matched >= truncated_idx { + alive_cache_idx = p.matched; + } else if p.matched == 0 { + // the new peer is still applying snapshot, do not compact cache now + alive_cache_idx = 0; + } + } + } + + // When an election happened or a new peer is added, replicated_idx can be 0. 
+ if replicated_idx > 0 { + assert!( + last_idx >= replicated_idx, + "expect last index {} >= replicated index {}", + last_idx, + replicated_idx + ); + REGION_MAX_LOG_LAG.observe((last_idx - replicated_idx) as f64); + } + + // leader may call `get_term()` on the latest replicated index, so compact + // entries before `alive_cache_idx` instead of `alive_cache_idx + 1`. + self.entry_storage_mut() + .compact_entry_cache(std::cmp::min(alive_cache_idx, applied_idx + 1)); + + let mut compact_idx = if force && replicated_idx > first_idx { + replicated_idx + } else if applied_idx > first_idx + && applied_idx - first_idx >= store_ctx.cfg.raft_log_gc_count_limit() + || self.compact_log_context().approximate_log_size + >= store_ctx.cfg.raft_log_gc_size_limit().0 + { + std::cmp::max(first_idx + (last_idx - first_idx) / 2, replicated_idx) + } else if replicated_idx < first_idx || last_idx - first_idx < 3 { + store_ctx.raft_metrics.raft_log_gc_skipped.reserve_log.inc(); + return; + } else if replicated_idx - first_idx < store_ctx.cfg.raft_log_gc_threshold + && self + .compact_log_context_mut() + .maybe_skip_compact_log(store_ctx.cfg.raft_log_reserve_max_ticks) + { + store_ctx + .raft_metrics + .raft_log_gc_skipped + .threshold_limit + .inc(); + return; + } else { + replicated_idx + }; + assert!(compact_idx >= first_idx); + // Have no idea why subtract 1 here, but original code did this by magic. + compact_idx -= 1; + if compact_idx < first_idx { + // In case compact_idx == first_idx before subtraction. + store_ctx + .raft_metrics + .raft_log_gc_skipped + .compact_idx_too_small + .inc(); + return; + } + + // Create a compact log request and notify directly. 
+ // TODO: move this into a function + let term = self.raft_group().raft.raft_log.term(compact_idx).unwrap(); + + let mut req = new_admin_request(self.region_id(), self.peer().clone()); + let mut admin = AdminRequest::default(); + admin.set_cmd_type(AdminCmdType::CompactLog); + admin.mut_compact_log().set_compact_index(compact_idx); + admin.mut_compact_log().set_compact_term(term); + req.set_admin_request(admin); + + let (ch, _) = CmdResChannel::pair(); + self.on_admin_command(store_ctx, req, ch); + + self.compact_log_context_mut().skipped_ticks = 0; + } +} + +#[derive(Debug)] +pub struct CompactLogResult { + index: u64, + compact_index: u64, + compact_term: u64, +} + +impl Peer { + pub fn propose_compact_log( + &mut self, + store_ctx: &mut StoreContext, + req: RaftCmdRequest, + ) -> Result { + let compact_log = req.get_admin_request().get_compact_log(); + // TODO: add unit tests to cover all the message integrity checks. + if compact_log.get_compact_term() == 0 { + info!( + self.logger, + "compact term missing, skip"; + "command" => ?compact_log + ); + // old format compact log command, safe to ignore. 
+ return Err(box_err!( + "command format is outdated, please upgrade leader" + )); + } + + let data = req.write_to_bytes().unwrap(); + self.propose(store_ctx, data) + } +} + +impl Apply { + pub fn apply_compact_log( + &mut self, + req: &AdminRequest, + index: u64, + ) -> Result<(AdminResponse, AdminCmdResult)> { + Ok(( + AdminResponse::default(), + AdminCmdResult::CompactLog(CompactLogResult { + index, + compact_index: req.get_compact_log().get_compact_index(), + compact_term: req.get_compact_log().get_compact_term(), + }), + )) + } +} + +impl Peer { + #[inline] + pub fn record_tombstone_tablet( + &mut self, + ctx: &StoreContext, + old_tablet: EK, + new_tablet_index: u64, + ) { + info!( + self.logger, + "record tombstone tablet"; + "prev_tablet_path" => old_tablet.path(), + "new_tablet_index" => new_tablet_index + ); + let compact_log_context = self.compact_log_context_mut(); + compact_log_context + .tombstone_tablets_wait_index + .push(new_tablet_index); + let _ = ctx + .schedulers + .tablet + .schedule(tablet::Task::prepare_destroy( + old_tablet, + self.region_id(), + new_tablet_index, + )); + } + + #[inline] + pub fn record_tombstone_tablet_path( + &mut self, + ctx: &StoreContext, + old_tablet: PathBuf, + new_tablet_index: u64, + ) { + info!( + self.logger, + "record tombstone tablet"; + "prev_tablet_path" => old_tablet.display(), + "new_tablet_index" => new_tablet_index + ); + let compact_log_context = self.compact_log_context_mut(); + compact_log_context + .tombstone_tablets_wait_index + .push(new_tablet_index); + let _ = ctx + .schedulers + .tablet + .schedule(tablet::Task::prepare_destroy_path( + old_tablet, + self.region_id(), + new_tablet_index, + )); + } + + /// Returns if there's any tombstone being removed. 
+ #[inline] + pub fn remove_tombstone_tablets(&mut self, persisted: u64) -> bool { + let compact_log_context = self.compact_log_context_mut(); + let removed = compact_log_context + .tombstone_tablets_wait_index + .iter() + .take_while(|i| **i <= persisted) + .count(); + if removed > 0 { + compact_log_context + .tombstone_tablets_wait_index + .drain(..removed); + true + } else { + false + } + } + + pub fn has_pending_tombstone_tablets(&self) -> bool { + !self + .compact_log_context() + .tombstone_tablets_wait_index + .is_empty() + } + + #[inline] + pub fn record_tombstone_tablet_for_destroy( + &mut self, + ctx: &StoreContext, + task: &mut WriteTask, + ) { + assert!( + !self.has_pending_tombstone_tablets(), + "{} all tombstone should be cleared before being destroyed.", + SlogFormat(&self.logger) + ); + let tablet = match self.tablet() { + Some(tablet) => tablet.clone(), + None => return, + }; + let region_id = self.region_id(); + let applied_index = self.entry_storage().applied_index(); + let sched = ctx.schedulers.tablet.clone(); + let _ = sched.schedule(tablet::Task::prepare_destroy( + tablet, + self.region_id(), + applied_index, + )); + task.persisted_cbs.push(Box::new(move || { + let _ = sched.schedule(tablet::Task::destroy(region_id, applied_index)); + })); + } + + pub fn on_apply_res_compact_log( + &mut self, + store_ctx: &mut StoreContext, + mut res: CompactLogResult, + ) { + let first_index = self.entry_storage().first_index(); + if let Some(i) = self.merge_context().and_then(|c| c.max_compact_log_index()) + && res.compact_index > i + { + info!( + self.logger, + "in merging mode, adjust compact index"; + "old_index" => res.compact_index, + "new_index" => i, + ); + res.compact_index = i; + } + if res.compact_index <= first_index { + debug!( + self.logger, + "compact index <= first index, no need to compact"; + "compact_index" => res.compact_index, + "first_index" => first_index, + ); + return; + } + assert!( + res.compact_index < 
self.compact_log_context().last_applying_index, + "{}: {}, {}", + SlogFormat(&self.logger), + res.compact_index, + self.compact_log_context().last_applying_index + ); + // TODO: check entry_cache_warmup_state + self.entry_storage_mut() + .compact_entry_cache(res.compact_index); + self.storage_mut() + .cancel_generating_snap_due_to_compacted(res.compact_index); + + let truncated_state = self + .entry_storage_mut() + .apply_state_mut() + .mut_truncated_state(); + let old_truncated = truncated_state.get_index(); + truncated_state.set_index(res.compact_index); + truncated_state.set_term(res.compact_term); + + let region_id = self.region_id(); + // TODO: get around this clone. + let apply_state = self.entry_storage().apply_state().clone(); + self.state_changes_mut() + .put_apply_state(region_id, res.index, &apply_state) + .unwrap(); + self.set_has_extra_write(); + + // All logs < perssited_apply will be deleted, so should check with +1. + if old_truncated + 1 < self.storage().apply_trace().persisted_apply_index() + && let Some(index) = self.compact_log_index() + { + // Raft Engine doesn't care about first index. + if let Err(e) = store_ctx + .engine + .gc(self.region_id(), 0, index, self.state_changes_mut()) + { + error!(self.logger, "failed to compact raft logs"; "err" => ?e); + } + // Extra write set right above. + } + + let context = self.compact_log_context_mut(); + let applied = context.last_applying_index; + let total_cnt = applied - old_truncated; + let remain_cnt = applied - res.compact_index; + context.approximate_log_size = + (context.approximate_log_size as f64 * (remain_cnt as f64 / total_cnt as f64)) as u64; + } + + /// Called when apply index is persisted. 
+ #[inline] + pub fn on_advance_persisted_apply_index( + &mut self, + store_ctx: &mut StoreContext, + old_persisted: u64, + task: &mut WriteTask, + ) { + let new_persisted = self.storage().apply_trace().persisted_apply_index(); + if old_persisted < new_persisted { + let region_id = self.region_id(); + // TODO: batch it. + // TODO: avoid allocation if there is nothing to delete. + if let Err(e) = store_ctx.engine.delete_all_but_one_states_before( + region_id, + new_persisted, + task.extra_write + .ensure_v2(|| self.entry_storage().raft_engine().log_batch(0)), + ) { + error!(self.logger, "failed to delete raft states"; "err" => ?e); + } + // If it's snapshot, logs are gc already. + if !task.has_snapshot + && old_persisted < self.entry_storage().truncated_index() + 1 + && let Some(index) = self.compact_log_index() + { + let batch = task.extra_write.ensure_v2(|| self.entry_storage().raft_engine().log_batch(0)); + // Raft Engine doesn't care about first index. + if let Err(e) = + store_ctx + .engine + .gc(self.region_id(), 0, index, batch) + { + error!(self.logger, "failed to compact raft logs"; "err" => ?e); + } + } + if self.remove_tombstone_tablets(new_persisted) { + let sched = store_ctx.schedulers.tablet.clone(); + if !task.has_snapshot { + task.persisted_cbs.push(Box::new(move || { + let _ = sched.schedule(tablet::Task::destroy(region_id, new_persisted)); + })); + } else { + // In snapshot, the index is persisted, tablet can be destroyed directly. + let _ = sched.schedule(tablet::Task::destroy(region_id, new_persisted)); + } + } + } + } + + fn compact_log_index(&mut self) -> Option { + let truncated = self.entry_storage().truncated_index() + 1; + let persisted_applied = self.storage().apply_trace().persisted_apply_index(); + let compact_index = std::cmp::min(truncated, persisted_applied); + if compact_index == RAFT_INIT_LOG_INDEX + 1 { + // There is no logs at RAFT_INIT_LOG_INDEX, nothing to delete. 
+ return None; + } + assert!( + compact_index <= self.raft_group().raft.raft_log.committed, + "{}: compact_index={}, committed={}", + SlogFormat(&self.logger), + compact_index, + self.raft_group().raft.raft_log.committed, + ); + // TODO: make this debug when stable. + info!( + self.logger, + "compact log"; + "index" => compact_index, + "apply_trace" => ?self.storage().apply_trace(), + "truncated" => ?self.entry_storage().apply_state() + ); + Some(compact_index) + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs new file mode 100644 index 00000000000..b2bea379299 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -0,0 +1,609 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module implements the configuration change command. +//! +//! The command will go through the following steps: +//! - Propose conf change +//! - Apply after conf change is committed +//! - Update raft state using the result of conf change + +use std::time::Instant; + +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; +use kvproto::{ + metapb::{self, PeerRole}, + raft_cmdpb::{AdminRequest, AdminResponse, ChangePeerRequest, RaftCmdRequest}, + raft_serverpb::{PeerState, RegionLocalState}, +}; +use protobuf::Message; +use raft::prelude::*; +use raftstore::{ + coprocessor::{RegionChangeEvent, RegionChangeReason}, + store::{ + metrics::{PEER_ADMIN_CMD_COUNTER_VEC, PEER_PROPOSE_LOG_SIZE_HISTOGRAM}, + util::{self, ChangePeerI, ConfChangeKind}, + ProposalContext, + }, + Error, Result, +}; +use slog::{error, info, warn}; +use tikv_util::{box_err, slog_panic}; + +use super::AdminCmdResult; +use crate::{ + batch::StoreContext, + raft::{Apply, Peer}, +}; + +/// The apply result of conf change. +#[derive(Default, Debug)] +pub struct ConfChangeResult { + pub index: u64, + // The proposed ConfChangeV2 or (legacy) ConfChange. 
+ // ConfChange (if it is) will be converted to ConfChangeV2. + pub conf_change: ConfChangeV2, + // The change peer requests come along with ConfChangeV2 + // or (legacy) ConfChange. For ConfChange, it only contains + // one element. + pub changes: Vec, + pub region_state: RegionLocalState, +} + +#[derive(Debug)] +pub struct UpdateGcPeersResult { + index: u64, + region_state: RegionLocalState, +} + +impl Peer { + #[inline] + pub fn propose_conf_change( + &mut self, + ctx: &mut StoreContext, + req: RaftCmdRequest, + ) -> Result { + if self.raft_group().raft.has_pending_conf() { + info!( + self.logger, + "there is a pending conf change, try later"; + ); + return Err(box_err!("there is a pending conf change, try later")); + } + let data = req.write_to_bytes()?; + let admin = req.get_admin_request(); + if admin.has_change_peer() { + self.propose_conf_change_imp(ctx, admin.get_change_peer(), data) + } else if admin.has_change_peer_v2() { + self.propose_conf_change_imp(ctx, admin.get_change_peer_v2(), data) + } else { + unreachable!() + } + } + + /// Fails in following cases: + /// + /// 1. A pending conf change has not been applied yet; + /// 2. Removing the leader is not allowed in the configuration; + /// 3. The conf change makes the raft group not healthy; + /// 4. The conf change is dropped by raft group internally. + /// 5. There is a same peer on the same store in history record (TODO). + fn propose_conf_change_imp( + &mut self, + ctx: &mut StoreContext, + change_peer: impl ChangePeerI, + data: Vec, + ) -> Result { + let data_size = data.len(); + let cc = change_peer.to_confchange(data); + let changes = change_peer.get_change_peers(); + + util::check_conf_change( + &ctx.cfg, + self.raft_group(), + self.region(), + self.peer(), + changes.as_ref(), + &cc, + false, + )?; + + // TODO: check if the new peer is already in history record. 
+ + ctx.raft_metrics.propose.conf_change.inc(); + // TODO: use local histogram metrics + PEER_PROPOSE_LOG_SIZE_HISTOGRAM.observe(data_size as f64); + info!( + self.logger, + "propose conf change peer"; + "changes" => ?changes.as_ref(), + "kind" => ?ConfChangeKind::confchange_kind(changes.as_ref().len()), + ); + + let last_index = self.raft_group().raft.raft_log.last_index(); + self.raft_group_mut() + .propose_conf_change(ProposalContext::SYNC_LOG.to_vec(), cc)?; + let proposal_index = self.raft_group().raft.raft_log.last_index(); + if proposal_index == last_index { + // The message is dropped silently, this usually due to leader absence + // or transferring leader. Both cases can be considered as NotLeader error. + return Err(Error::NotLeader(self.region_id(), None)); + } + + Ok(proposal_index) + } + + pub fn on_apply_res_conf_change( + &mut self, + ctx: &mut StoreContext, + conf_change: ConfChangeResult, + ) { + // TODO: cancel generating snapshot. + + // Snapshot is applied in memory without waiting for all entries being + // applied. So it's possible conf_change.index < first_index. + if conf_change.index >= self.raft_group().raft.raft_log.first_index() { + match self.raft_group_mut().apply_conf_change(&conf_change.conf_change) { + Ok(_) + // PD could dispatch redundant conf changes. + | Err(raft::Error::NotExists { .. }) | Err(raft::Error::Exists { .. 
}) => (), + _ => unreachable!(), + } + } + + let remove_self = conf_change.region_state.get_state() == PeerState::Tombstone; + self.storage_mut() + .set_region_state(conf_change.region_state.clone()); + if self.is_leader() { + info!( + self.logger, + "notify pd with change peer region"; + "region" => ?self.region(), + ); + self.region_heartbeat_pd(ctx); + let demote_self = tikv_util::store::is_learner(self.peer()); + if remove_self || demote_self { + warn!(self.logger, "removing or demoting leader"; "remove" => remove_self, "demote" => demote_self); + let term = self.term(); + self.raft_group_mut() + .raft + .become_follower(term, raft::INVALID_ID); + } + let mut has_new_peer = None; + for c in conf_change.changes { + let peer_id = c.get_peer().get_id(); + match c.get_change_type() { + ConfChangeType::AddNode | ConfChangeType::AddLearnerNode => { + if has_new_peer.is_none() { + has_new_peer = Some(Instant::now()); + } + self.add_peer_heartbeat(peer_id, has_new_peer.unwrap()); + } + ConfChangeType::RemoveNode => { + self.remove_peer_heartbeat(peer_id); + } + } + } + if self.is_leader() { + if has_new_peer.is_some() { + // Speed up snapshot instead of waiting another heartbeat. + self.raft_group_mut().ping(); + self.set_has_ready(); + } + self.maybe_schedule_gc_peer_tick(); + } + } + ctx.store_meta + .lock() + .unwrap() + .set_region(self.region(), true, &self.logger); + ctx.coprocessor_host.on_region_changed( + self.region(), + RegionChangeEvent::Update(RegionChangeReason::ChangePeer), + self.raft_group().raft.state, + ); + if remove_self { + // When self is destroyed, all metas will be cleaned in `start_destroy`. 
+ self.mark_for_destroy(None); + } else { + let region_id = self.region_id(); + self.state_changes_mut() + .put_region_state(region_id, conf_change.index, &conf_change.region_state) + .unwrap(); + self.set_has_extra_write(); + } + } + + pub fn on_apply_res_update_gc_peers(&mut self, result: UpdateGcPeersResult) { + let region_id = self.region_id(); + self.state_changes_mut() + .put_region_state(region_id, result.index, &result.region_state) + .unwrap(); + self.set_has_extra_write(); + self.storage_mut().set_region_state(result.region_state); + } +} + +impl Apply { + #[inline] + pub fn apply_conf_change( + &mut self, + index: u64, + req: &AdminRequest, + cc: ConfChangeV2, + ) -> Result<(AdminResponse, AdminCmdResult)> { + assert!(req.has_change_peer()); + self.apply_conf_change_imp(index, std::slice::from_ref(req.get_change_peer()), cc, true) + } + + #[inline] + pub fn apply_conf_change_v2( + &mut self, + index: u64, + req: &AdminRequest, + cc: ConfChangeV2, + ) -> Result<(AdminResponse, AdminCmdResult)> { + assert!(req.has_change_peer_v2()); + self.apply_conf_change_imp( + index, + req.get_change_peer_v2().get_change_peers(), + cc, + false, + ) + } + + #[inline] + fn apply_conf_change_imp( + &mut self, + index: u64, + changes: &[ChangePeerRequest], + cc: ConfChangeV2, + legacy: bool, + ) -> Result<(AdminResponse, AdminCmdResult)> { + let region = self.region(); + let change_kind = ConfChangeKind::confchange_kind(changes.len()); + info!(self.logger, "exec ConfChangeV2"; "kind" => ?change_kind, "legacy" => legacy, "epoch" => ?region.get_region_epoch(), "index" => index); + let mut new_region = region.clone(); + match change_kind { + ConfChangeKind::LeaveJoint => self.apply_leave_joint(&mut new_region), + kind => { + debug_assert!(!legacy || kind == ConfChangeKind::Simple, "{:?}", kind); + debug_assert!( + kind != ConfChangeKind::Simple || changes.len() == 1, + "{:?}", + changes + ); + for cp in changes { + let res = if legacy { + self.apply_single_change_legacy(cp, 
&mut new_region) + } else { + self.apply_single_change(kind, cp, &mut new_region) + }; + if let Err(e) = res { + error!(self.logger, "failed to apply conf change"; + "changes" => ?changes, + "legacy" => legacy, + "original region" => ?region, "err" => ?e); + return Err(e); + } + } + let conf_ver = region.get_region_epoch().get_conf_ver() + changes.len() as u64; + new_region.mut_region_epoch().set_conf_ver(conf_ver); + } + }; + + info!( + self.logger, + "conf change successfully"; + "changes" => ?changes, + "legacy" => legacy, + "original region" => ?region, + "current region" => ?new_region, + ); + let my_id = self.peer().get_id(); + let state = self.region_state_mut(); + let mut removed_records: Vec<_> = state.take_removed_records().into(); + for p0 in state.get_region().get_peers() { + // No matching store ID means the peer must be removed. + if new_region + .get_peers() + .iter() + .all(|p1| p1.get_store_id() != p0.get_store_id()) + { + removed_records.push(p0.clone()); + } + } + // If a peer is replaced in the same store, the leader will keep polling the + // new peer on the same store, which implies that the old peer must be + // tombstone in the end. + removed_records.retain(|p0| { + new_region + .get_peers() + .iter() + .all(|p1| p1.get_store_id() != p0.get_store_id()) + }); + state.set_region(new_region.clone()); + state.set_removed_records(removed_records.into()); + let new_peer = new_region + .get_peers() + .iter() + .find(|p| p.get_id() == my_id) + .cloned(); + if new_peer.is_none() { + // A peer will reject any snapshot that doesn't include itself in the + // configuration. So if it disappear from the configuration, it must + // be removed by conf change. 
+ state.set_state(PeerState::Tombstone); + } + let mut resp = AdminResponse::default(); + resp.mut_change_peer().set_region(new_region); + let conf_change = ConfChangeResult { + index, + conf_change: cc, + changes: changes.to_vec(), + region_state: state.clone(), + }; + if state.get_state() == PeerState::Tombstone { + self.mark_tombstone(); + } + if let Some(peer) = new_peer { + self.set_peer(peer); + } + Ok((resp, AdminCmdResult::ConfChange(conf_change))) + } + + #[inline] + fn apply_leave_joint(&self, region: &mut metapb::Region) { + let mut change_num = 0; + for peer in region.mut_peers().iter_mut() { + match peer.get_role() { + PeerRole::IncomingVoter => peer.set_role(PeerRole::Voter), + PeerRole::DemotingVoter => peer.set_role(PeerRole::Learner), + _ => continue, + } + change_num += 1; + } + if change_num == 0 { + slog_panic!( + self.logger, + "can't leave a non-joint config"; + "region" => ?self.region_state() + ); + } + let conf_ver = region.get_region_epoch().get_conf_ver() + change_num; + region.mut_region_epoch().set_conf_ver(conf_ver); + info!(self.logger, "leave joint state successfully"; "region" => ?region); + } + + /// This is used for conf change v1. Use a standalone function to avoid + /// future refactor breaks consistency accidentally. 
+ #[inline] + fn apply_single_change_legacy( + &self, + cp: &ChangePeerRequest, + region: &mut metapb::Region, + ) -> Result<()> { + let peer = cp.get_peer(); + let store_id = peer.get_store_id(); + let change_type = cp.get_change_type(); + + match change_type { + ConfChangeType::AddNode => { + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&["add_peer", "all"]) + .inc(); + + let mut exists = false; + if let Some(p) = tikv_util::store::find_peer_mut(region, store_id) { + exists = true; + if !tikv_util::store::is_learner(p) || p.get_id() != peer.get_id() { + return Err(box_err!( + "can't add duplicated peer {:?} to region {:?}", + peer, + self.region_state() + )); + } else { + p.set_role(PeerRole::Voter); + } + } + if !exists { + // TODO: Do we allow adding peer in same node? + region.mut_peers().push(peer.clone()); + } + + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&["add_peer", "success"]) + .inc(); + } + ConfChangeType::RemoveNode => { + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&["remove_peer", "all"]) + .inc(); + + if let Some(p) = tikv_util::store::remove_peer(region, store_id) { + // Considering `is_learner` flag in `Peer` here is by design. 
+ if &p != peer { + return Err(box_err!( + "remove unmatched peer: expect: {:?}, get {:?}, ignore", + peer, + p + )); + } + } else { + return Err(box_err!( + "remove missing peer {:?} from region {:?}", + peer, + self.region_state() + )); + } + + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&["remove_peer", "success"]) + .inc(); + } + ConfChangeType::AddLearnerNode => { + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&["add_learner", "all"]) + .inc(); + + if tikv_util::store::find_peer(region, store_id).is_some() { + return Err(box_err!( + "can't add duplicated learner {:?} to region {:?}", + peer, + self.region_state() + )); + } + region.mut_peers().push(peer.clone()); + + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&["add_learner", "success"]) + .inc(); + } + } + Ok(()) + } + + #[inline] + fn apply_single_change( + &self, + kind: ConfChangeKind, + cp: &ChangePeerRequest, + region: &mut metapb::Region, + ) -> Result<()> { + let (change_type, peer) = (cp.get_change_type(), cp.get_peer()); + let store_id = peer.get_store_id(); + + let metric = match change_type { + ConfChangeType::AddNode => "add_peer", + ConfChangeType::RemoveNode => "remove_peer", + ConfChangeType::AddLearnerNode => "add_learner", + }; + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&[metric, "all"]) + .inc(); + + if let Some(exist_peer) = tikv_util::store::find_peer(region, store_id) { + let r = exist_peer.get_role(); + if r == PeerRole::IncomingVoter || r == PeerRole::DemotingVoter { + slog_panic!( + self.logger, + "can't apply confchange because configuration is still in joint state"; + "confchange" => ?cp, + "region_state" => ?self.region_state() + ); + } + } + match ( + tikv_util::store::find_peer_mut(region, store_id), + change_type, + ) { + (None, ConfChangeType::AddNode) => { + let mut peer = peer.clone(); + match kind { + ConfChangeKind::Simple => peer.set_role(PeerRole::Voter), + ConfChangeKind::EnterJoint => peer.set_role(PeerRole::IncomingVoter), + _ => unreachable!(), + } + 
region.mut_peers().push(peer); + } + (None, ConfChangeType::AddLearnerNode) => { + let mut peer = peer.clone(); + peer.set_role(PeerRole::Learner); + region.mut_peers().push(peer); + } + (None, ConfChangeType::RemoveNode) => { + return Err(box_err!( + "remove missing peer {:?} from region {:?}", + peer, + self.region_state() + )); + } + // Add node + (Some(exist_peer), ConfChangeType::AddNode) + | (Some(exist_peer), ConfChangeType::AddLearnerNode) => { + let (role, exist_id, incoming_id) = + (exist_peer.get_role(), exist_peer.get_id(), peer.get_id()); + + if exist_id != incoming_id // Add peer with different id to the same store + // The peer is already the requested role + || (role, change_type) == (PeerRole::Voter, ConfChangeType::AddNode) + || (role, change_type) == (PeerRole::Learner, ConfChangeType::AddLearnerNode) + { + return Err(box_err!( + "can't add duplicated peer {:?} to region {:?}, duplicated with exist peer {:?}", + peer, + self.region_state(), + exist_peer + )); + } + match (role, change_type) { + (PeerRole::Voter, ConfChangeType::AddLearnerNode) => match kind { + ConfChangeKind::Simple => exist_peer.set_role(PeerRole::Learner), + ConfChangeKind::EnterJoint => exist_peer.set_role(PeerRole::DemotingVoter), + _ => unreachable!(), + }, + (PeerRole::Learner, ConfChangeType::AddNode) => match kind { + ConfChangeKind::Simple => exist_peer.set_role(PeerRole::Voter), + ConfChangeKind::EnterJoint => exist_peer.set_role(PeerRole::IncomingVoter), + _ => unreachable!(), + }, + _ => unreachable!(), + } + } + // Remove node + (Some(exist_peer), ConfChangeType::RemoveNode) => { + if kind == ConfChangeKind::EnterJoint && exist_peer.get_role() == PeerRole::Voter { + return Err(box_err!( + "can not remove voter {:?} directly from region {:?}", + peer, + self.region_state() + )); + } + match tikv_util::store::remove_peer(region, store_id) { + Some(p) => { + if &p != peer { + return Err(box_err!( + "remove unmatched peer: expect: {:?}, get {:?}, ignore", + peer, + p + 
)); + } + } + None => unreachable!(), + } + } + } + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&[metric, "success"]) + .inc(); + Ok(()) + } + + pub fn apply_update_gc_peer( + &mut self, + log_index: u64, + admin_req: &AdminRequest, + ) -> (AdminResponse, AdminCmdResult) { + let mut removed_records: Vec<_> = self.region_state_mut().take_removed_records().into(); + let mut merged_records: Vec<_> = self.region_state_mut().take_merged_records().into(); + let updates = admin_req.get_update_gc_peers().get_peer_id(); + info!(self.logger, "update gc peer"; "index" => log_index, "updates" => ?updates, "gc_peers" => ?removed_records, "merged_peers" => ?merged_records); + removed_records.retain(|p| !updates.contains(&p.get_id())); + merged_records.retain_mut(|r| { + let mut sources: Vec<_> = r.take_source_peers().into(); + sources.retain(|p| !updates.contains(&p.get_id())); + r.set_source_peers(sources.into()); + !r.get_source_peers().is_empty() + }); + self.region_state_mut() + .set_removed_records(removed_records.into()); + self.region_state_mut() + .set_merged_records(merged_records.into()); + ( + AdminResponse::default(), + AdminCmdResult::UpdateGcPeers(UpdateGcPeersResult { + index: log_index, + region_state: self.region_state().clone(), + }), + ) + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs new file mode 100644 index 00000000000..2756d0174dd --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs @@ -0,0 +1,792 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains merge related processing logic. +//! +//! ## Propose +//! +//! The proposal is initiated by the source region. After `PrepareMerge` is +//! applied, the source peer will send an `AskCommitMerge` message to the target +//! peer. (For simplicity, we send this message regardless of whether the target +//! peer is leader.) 
The message will also carry some source region logs that +//! may not be committed by some source peers. +//! +//! The source region cannot serve any writes until the merge is committed or +//! rollback-ed. This is guaranteed by `MergeContext::prepare_status`. +//! +//! ## Apply (`Apply::apply_commit_merge`) +//! +//! At first, target region will not apply the `CommitMerge` command. Instead +//! the apply progress will be paused and it redirects the log entries from +//! source region, as a `CatchUpLogs` message, to the local source region peer. +//! When the source region peer has applied all logs up to the prior +//! `PrepareMerge` command, it will signal the target peer. Here we use a +//! temporary channel instead of directly sending message between apply FSMs +//! like in v1. +//! +//! Here is a complete view of the process: +//! +//! ```text +//! | Store 1 | Store 2 | +//! | Source Peer | Target Leader | Source Peer | Target Peer | +//! | +//! apply PrepareMerge +//! \ +//! +--------------+ +//! `AskCommitMerge`\ +//! \ +//! propose CommitMerge ---------------> append CommitMerge +//! apply CommitMerge apply CommitMerge +//! on apply res /| +//! /| +------------+ | +//! +---------------+ | / `CatchUpLogs` | +//! / `AckCommitMerge` | / | +//! / (complete) append logs (pause) +//! destroy self | . +//! apply PrepareMerge . +//! | . +//! +-----------> (continue) +//! | | +//! destroy self (complete) +//! 
``` + +use std::{ + any::Any, + cmp, fs, io, + path::{Path, PathBuf}, +}; + +use crossbeam::channel::SendError; +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry}; +use futures::channel::oneshot; +use kvproto::{ + metapb::Region, + raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, CommitMergeRequest, RaftCmdRequest}, + raft_serverpb::{MergedRecord, PeerState, RegionLocalState}, +}; +use protobuf::Message; +use raft::{GetEntriesContext, Storage, INVALID_ID, NO_LIMIT}; +use raftstore::{ + coprocessor::RegionChangeReason, + store::{ + fsm::new_admin_request, metrics::PEER_ADMIN_CMD_COUNTER, util, ProposalContext, Transport, + }, + Result, +}; +use slog::{debug, error, info, Logger}; +use tikv_util::{ + config::ReadableDuration, + log::SlogFormat, + slog_panic, + store::{find_peer, region_on_same_stores}, + time::Instant, +}; + +use super::merge_source_path; +use crate::{ + batch::StoreContext, + fsm::ApplyResReporter, + operation::{AdminCmdResult, SharedReadTablet}, + raft::{Apply, Peer}, + router::{CmdResChannel, PeerMsg, PeerTick, StoreMsg}, +}; + +#[derive(Debug)] +pub struct CommitMergeResult { + pub index: u64, + // Only used to respond `CatchUpLogs` to source peer. + prepare_merge_index: u64, + source_path: PathBuf, + region_state: RegionLocalState, + source: Region, + source_safe_ts: u64, + tablet: Box, +} + +#[derive(Debug)] +pub struct CatchUpLogs { + target_region_id: u64, + merge: CommitMergeRequest, + // safe_ts. 
+ tx: oneshot::Sender, +} + +pub const MERGE_IN_PROGRESS_PREFIX: &str = "merge-in-progress"; + +struct MergeInProgressGuard(PathBuf); + +impl MergeInProgressGuard { + // `index` is the commit index of `CommitMergeRequest` + fn new( + logger: &Logger, + registry: &TabletRegistry, + target_region_id: u64, + index: u64, + tablet_path: &Path, + ) -> io::Result> { + let name = registry.tablet_name(MERGE_IN_PROGRESS_PREFIX, target_region_id, index); + let marker_path = registry.tablet_root().join(name); + if !marker_path.exists() { + if tablet_path.exists() { + return Ok(None); + } else { + fs::create_dir(&marker_path)?; + file_system::sync_dir(marker_path.parent().unwrap())?; + } + } else if tablet_path.exists() { + info!(logger, "remove incomplete merged tablet"; "path" => %tablet_path.display()); + fs::remove_dir_all(tablet_path)?; + } + Ok(Some(Self(marker_path))) + } + + fn defuse(self) -> io::Result<()> { + fs::remove_dir(&self.0)?; + file_system::sync_dir(self.0.parent().unwrap()) + } +} + +fn commit_of_merge(r: &CommitMergeRequest) -> u64 { + r.get_source_state().get_merge_state().get_commit() +} + +// Source peer initiates commit merge on target peer. +impl Peer { + // Called after applying `PrepareMerge`. + pub fn start_commit_merge(&mut self, store_ctx: &mut StoreContext) { + assert!(self.applied_merge_state().is_some()); + // Target already committed `CommitMerge`. + if let Some(c) = &self.merge_context().unwrap().catch_up_logs { + if self.catch_up_logs_ready(c) { + let c = self.merge_context_mut().catch_up_logs.take().unwrap(); + self.finish_catch_up_logs(store_ctx, c); + } + } else { + self.on_check_merge(store_ctx); + } + } + + // Match v1::on_check_merge. + pub fn on_check_merge(&mut self, store_ctx: &mut StoreContext) { + if !self.serving() || self.applied_merge_state().is_none() { + return; + } + self.add_pending_tick(PeerTick::CheckMerge); + self.ask_target_peer_to_commit_merge(store_ctx); + } + + // Match v1::schedule_merge. 
// NOTE(review): generic parameter lists (`<...>`) appear to have been stripped
// from this text by extraction (e.g. `impl Peer`, `StoreContext`, `Option`,
// `Result`). They are reproduced here exactly as found; restore them from the
// upstream file before compiling.
    /// Asks the target region to propose `CommitMerge`.
    ///
    /// Builds a `CommitMerge` admin request addressed to the target region's
    /// peer on this store. The request carries the source region state and the
    /// source log entries in `[low, commit]`, where `low` is the larger of
    /// `min_matched + 1` and the merge state's `min_index`. The request is
    /// sent as `PeerMsg::AskCommitMerge`; if the peer mailbox rejects it, the
    /// message is handed to the store instead.
    ///
    /// Panics if the entries cannot be read from storage, or if the store
    /// fallback fails while the router is not shut down.
    fn ask_target_peer_to_commit_merge(&mut self, store_ctx: &mut StoreContext) {
        let state = self.applied_merge_state().unwrap();
        let target = state.get_target();
        let target_id = target.get_id();

        let (min_index, _) = self.calculate_min_progress().unwrap();
        let low = cmp::max(min_index + 1, state.get_min_index());
        // TODO: move this into raft module.
        // > over >= to include the PrepareMerge proposal.
        let entries = if low > state.get_commit() {
            Vec::new()
        } else {
            // TODO: fetch entries in async way
            match self.storage().entries(
                low,
                state.get_commit() + 1,
                NO_LIMIT,
                GetEntriesContext::empty(false),
            ) {
                Ok(ents) => ents,
                Err(e) => slog_panic!(
                    self.logger,
                    "failed to get merge entires";
                    "err" => ?e,
                    "low" => low,
                    "commit" => state.get_commit()
                ),
            }
        };

        let target_peer = find_peer(target, store_ctx.store_id).unwrap();
        let mut request = new_admin_request(target.get_id(), target_peer.clone());
        request
            .mut_header()
            .set_region_epoch(target.get_region_epoch().clone());
        let mut admin = AdminRequest::default();
        admin.set_cmd_type(AdminCmdType::CommitMerge);
        admin.mut_commit_merge().set_entries(entries.into());
        admin
            .mut_commit_merge()
            .set_source_state(self.storage().region_state().clone());
        request.set_admin_request(admin);
        // Please note that, here assumes that the unit of network isolation is store
        // rather than peer. So a quorum stores of source region should also be the
        // quorum stores of target region. Otherwise we need to enable proposal
        // forwarding.
        let msg = PeerMsg::AskCommitMerge(request);
        // If target peer is destroyed, life.rs is responsible for telling us to
        // rollback.
        match store_ctx.router.force_send(target_id, msg) {
            Ok(_) => (),
            // The peer mailbox is unavailable: hand the request to the store.
            Err(SendError(PeerMsg::AskCommitMerge(msg))) => {
                if let Err(e) = store_ctx
                    .router
                    .force_send_control(StoreMsg::AskCommitMerge(msg))
                {
                    if store_ctx.router.is_shutdown() {
                        return;
                    }
                    slog_panic!(
                        self.logger,
                        "fails to send `AskCommitMerge` msg to store";
                        "error" => ?e,
                    );
                }
            }
            _ => unreachable!(),
        }
    }
}

// Target peer handles the commit merge request.
impl Peer {
    /// Handles an `AskCommitMerge` request coming from the source region.
    ///
    /// * valid request and this peer is leader: the request is proposed as an
    ///   admin command (the receiving half of the response channel is dropped);
    /// * request rejected by validation: `RejectCommitMerge` is sent back to
    ///   the source region, best effort;
    /// * anything else (ignored request, or valid but not leader): no-op.
    pub fn on_ask_commit_merge(
        &mut self,
        store_ctx: &mut StoreContext,
        req: RaftCmdRequest,
    ) {
        match self.validate_commit_merge(&req) {
            Some(true) if self.is_leader() => {
                let (ch, _) = CmdResChannel::pair();
                self.on_admin_command(store_ctx, req, ch);
            }
            Some(false) => {
                let commit_merge = req.get_admin_request().get_commit_merge();
                let source_id = commit_merge.get_source_state().get_region().get_id();
                let _ = store_ctx.router.force_send(
                    source_id,
                    PeerMsg::RejectCommitMerge {
                        index: commit_of_merge(commit_merge),
                    },
                );
            }
            _ => (),
        }
    }

    /// Decides what to do with a `CommitMerge` request.
    ///
    /// Returns `Some(true)` to accept, `Some(false)` to reject (the request
    /// epoch is stale compared to this region), and `None` to silently ignore
    /// (source already recorded in `merged_records`, dirty data present, or
    /// this peer's own epoch is behind the request).
    ///
    /// Panics if the request lacks source/merge state, or if the epochs match
    /// but the regions are not siblings located on the same stores.
    fn validate_commit_merge(&self, req: &RaftCmdRequest) -> Option {
        let expected_epoch = req.get_header().get_region_epoch();
        let merge = req.get_admin_request().get_commit_merge();
        assert!(merge.has_source_state() && merge.get_source_state().has_merge_state());
        let source_region = merge.get_source_state().get_region();
        let region = self.region();
        if self
            .storage()
            .region_state()
            .get_merged_records()
            .iter()
            .any(|p| p.get_source_region_id() == source_region.get_id())
        {
            info!(
                self.logger,
                "ignore commit merge because peer is already in merged_records";
                "source" => ?source_region,
            );
            None
        } else if util::is_epoch_stale(expected_epoch, region.get_region_epoch()) {
            info!(
                self.logger,
                "reject commit merge because of stale";
                "current_epoch" => ?region.get_region_epoch(),
                "expected_epoch" => ?expected_epoch,
            );
            Some(false)
        } else if expected_epoch == region.get_region_epoch() {
            assert!(
                util::is_sibling_regions(source_region, region),
                "{}: {:?}, {:?}",
                SlogFormat(&self.logger),
                source_region,
                region
            );
            assert!(
                region_on_same_stores(source_region, region),
                "{:?}, {:?}",
                source_region,
                region
            );
            // Best effort. Remove when trim check is implemented.
            if self.storage().has_dirty_data() {
                info!(self.logger, "ignore commit merge because of dirty data");
                None
            } else {
                Some(true)
            }
        } else {
            info!(
                self.logger,
                "ignore commit merge because self epoch is stale";
                "source" => ?source_region,
            );
            None
        }
    }

    /// Serializes `req` and proposes it with the `COMMIT_MERGE` flag set in
    /// the proposal context. Panics if the request cannot be serialized.
    pub fn propose_commit_merge(
        &mut self,
        store_ctx: &mut StoreContext,
        req: RaftCmdRequest,
    ) -> Result {
        let mut proposal_ctx = ProposalContext::empty();
        proposal_ctx.insert(ProposalContext::COMMIT_MERGE);
        let data = req.write_to_bytes().unwrap();
        self.propose_with_ctx(store_ctx, data, proposal_ctx.to_vec())
    }
}

impl Apply {
    /// Applies `CommitMerge` on the target region.
    ///
    /// Steps, in order:
    /// 1. If the source checkpoint directory is missing (or the
    ///    `force_send_catch_up_logs` failpoint is on), redirect a
    ///    `CatchUpLogs` request to the source peer and wait for its safe ts.
    /// 2. Open the source checkpoint as a tablet.
    /// 3. Compute the merged region: version is `max(source, target) + 1` and
    ///    the key range is extended towards the source region.
    /// 4. Flush the current tablet, then open the tablet at the new index and
    ///    merge source + current tablets into it, unless an already merged
    ///    tablet can be reused (see `MergeInProgressGuard`).
    /// 5. Update the region state (`Normal`, new tablet index, removed and
    ///    merged records) and return a `CommitMergeResult`.
    ///
    /// Panics on tablet open/merge/guard failures, and when the source peer
    /// drops the catch-up channel while the process is not shutting down.
    // Match v1::exec_commit_merge.
    pub async fn apply_commit_merge(
        &mut self,
        req: &AdminRequest,
        index: u64,
    ) -> Result<(AdminResponse, AdminCmdResult)> {
        PEER_ADMIN_CMD_COUNTER.commit_merge.all.inc();

        self.flush();

        // Note: compared to v1, doesn't validate region state from kvdb any more.
        let reg = self.tablet_registry();
        let merge = req.get_commit_merge();
        let merge_commit = commit_of_merge(merge);
        let source_state = merge.get_source_state();
        let source_region = source_state.get_region();
        let source_path = merge_source_path(reg, source_region.get_id(), merge_commit);
        let mut source_safe_ts = 0;

        let mut start_time = Instant::now_coarse();
        let mut wait_duration = None;
        let force_send = (|| {
            fail::fail_point!("force_send_catch_up_logs", |_| true);
            false
        })();
        if !source_path.exists() || force_send {
            let (tx, rx) = oneshot::channel();
            self.res_reporter().redirect_catch_up_logs(CatchUpLogs {
                target_region_id: self.region_id(),
                merge: merge.clone(),
                tx,
            });
            match rx.await {
                Ok(ts) => {
                    source_safe_ts = ts;
                }
                Err(_) => {
                    if tikv_util::thread_group::is_shutdown(!cfg!(test)) {
                        // Shutting down: never resolve instead of panicking.
                        return futures::future::pending().await;
                    } else {
                        slog_panic!(
                            self.logger,
                            "source peer is missing when getting checkpoint for merge"
                        );
                    }
                }
            }
            // Restart the clock so that "open" does not include the wait.
            let now = Instant::now_coarse();
            wait_duration = Some(now.saturating_duration_since(start_time));
            start_time = now;
        };
        fail::fail_point!("after_acquire_source_checkpoint", |_| Err(
            tikv_util::box_err!("fp")
        ));

        info!(
            self.logger,
            "execute CommitMerge";
            "commit" => merge_commit,
            "entries" => merge.get_entries().len(),
            "index" => index,
            "source_region" => ?source_region,
        );

        let ctx = TabletContext::new(source_region, None);
        let source_tablet = reg
            .tablet_factory()
            .open_tablet(ctx, &source_path)
            .unwrap_or_else(|e| {
                slog_panic!(self.logger, "failed to open source checkpoint"; "err" => ?e);
            });
        let open_time = Instant::now_coarse();

        let mut region = self.region().clone();
        // Use a max value so that pd can ensure overlapped region has a priority.
        let version = cmp::max(
            source_region.get_region_epoch().get_version(),
            region.get_region_epoch().get_version(),
        ) + 1;
        region.mut_region_epoch().set_version(version);
        if keys::enc_end_key(&region) == keys::enc_start_key(source_region) {
            region.set_end_key(source_region.get_end_key().to_vec());
        } else {
            region.set_start_key(source_region.get_start_key().to_vec());
        }

        let path = reg.tablet_path(self.region_id(), index);

        // Avoid seqno jump back between self.tablet and the newly created tablet.
        // If we are recovering, this flush would just be a noop.
        self.tablet().flush_cfs(&[], true).unwrap();
        let flush_time = Instant::now_coarse();

        let mut ctx = TabletContext::new(&region, Some(index));
        ctx.flush_state = Some(self.flush_state().clone());
        let guard = MergeInProgressGuard::new(&self.logger, reg, self.region_id(), index, &path)
            .unwrap_or_else(|e| {
                slog_panic!(
                    self.logger,
                    "fails to create MergeInProgressGuard";
                    "path" => %path.display(),
                    "error" => ?e
                )
            });
        let tablet = reg.tablet_factory().open_tablet(ctx, &path).unwrap();
        if let Some(guard) = guard {
            tablet
                .merge(&[&source_tablet, self.tablet()])
                .unwrap_or_else(|e| {
                    slog_panic!(
                        self.logger,
                        "fails to merge tablet";
                        "path" => %path.display(),
                        "error" => ?e
                    )
                });
            guard.defuse().unwrap_or_else(|e| {
                slog_panic!(
                    self.logger,
                    "fails to defuse MergeInProgressGuard";
                    "path" => %path.display(),
                    "error" => ?e
                )
            });
        } else {
            info!(self.logger, "reuse merged tablet");
        }
        let merge_time = Instant::now_coarse();
        fail::fail_point!("after_merge_source_checkpoint", |_| Err(
            tikv_util::box_err!("fp")
        ));

        // Each duration spans two consecutive checkpoints taken above:
        // start -> open_time -> flush_time -> merge_time. The label names the
        // phase that ends at the later checkpoint.
        info!(
            self.logger,
            "applied CommitMerge";
            "source_region" => ?source_region,
            "wait" => ?wait_duration.map(|d| format!("{}", ReadableDuration(d))),
            "open" => %ReadableDuration(open_time.saturating_duration_since(start_time)),
            "flush" => %ReadableDuration(flush_time.saturating_duration_since(open_time)),
            "merge" => %ReadableDuration(merge_time.saturating_duration_since(flush_time)),
        );

        self.set_tablet(tablet.clone());

        let state = self.region_state_mut();
        state.set_region(region.clone());
        state.set_state(PeerState::Normal);
        assert!(!state.has_merge_state());
        state.set_tablet_index(index);
        // Inherit the source region's pending GC bookkeeping.
        let mut removed_records: Vec<_> = state.take_removed_records().into();
        removed_records.append(&mut source_state.get_removed_records().into());
        state.set_removed_records(removed_records.into());
        let mut merged_records: Vec<_> = state.take_merged_records().into();
        merged_records.append(&mut source_state.get_merged_records().into());
        state.set_merged_records(merged_records.into());
        // Record this merge itself.
        let mut merged_record = MergedRecord::default();
        merged_record.set_source_region_id(source_region.get_id());
        merged_record.set_source_epoch(source_region.get_region_epoch().clone());
        merged_record.set_source_peers(source_region.get_peers().into());
        merged_record.set_target_region_id(region.get_id());
        merged_record.set_target_epoch(region.get_region_epoch().clone());
        merged_record.set_target_peers(region.get_peers().into());
        merged_record.set_index(index);
        state.mut_merged_records().push(merged_record);

        PEER_ADMIN_CMD_COUNTER.commit_merge.success.inc();

        Ok((
            AdminResponse::default(),
            AdminCmdResult::CommitMerge(CommitMergeResult {
                index,
                prepare_merge_index: merge_commit,
                source_path,
                region_state: self.region_state().clone(),
                source: source_region.to_owned(),
                source_safe_ts,
                tablet: Box::new(tablet),
            }),
        ))
    }
}

// Source peer catches up logs (optionally), and destroy itself.
impl Peer {
    // Target peer.
+ #[inline] + pub fn on_redirect_catch_up_logs( + &mut self, + store_ctx: &mut StoreContext, + catch_up_logs: CatchUpLogs, + ) { + let source_id = catch_up_logs.merge.get_source_state().get_region().get_id(); + assert_eq!(catch_up_logs.target_region_id, self.region_id()); + let _ = store_ctx + .router + .force_send(source_id, PeerMsg::CatchUpLogs(catch_up_logs)); + } + + // Match v1::on_catch_up_logs_for_merge. + pub fn on_catch_up_logs( + &mut self, + store_ctx: &mut StoreContext, + mut catch_up_logs: CatchUpLogs, + ) { + let source_id = catch_up_logs.merge.get_source_state().get_region().get_id(); + if source_id != self.region_id() { + slog_panic!( + self.logger, + "get unexpected catch_up_logs"; + "merge" => ?catch_up_logs.merge, + ); + } + + // Context would be empty if this peer hasn't applied PrepareMerge. + if let Some(cul) = self.merge_context().and_then(|c| c.catch_up_logs.as_ref()) { + slog_panic!( + self.logger, + "get conflicting catch_up_logs"; + "new" => ?catch_up_logs.merge, + "current" => ?cul.merge, + ); + } + if !self.catch_up_logs_ready(&catch_up_logs) { + // Directly append these logs to raft log and then commit them. 
+ match self.maybe_append_merge_entries(&catch_up_logs.merge) { + Some(last_index) => { + info!( + self.logger, + "append and commit entries to source region"; + "last_index" => last_index, + ); + self.set_has_ready(); + } + None => { + info!(self.logger, "no need to catch up logs"); + } + } + catch_up_logs.merge.clear_entries(); + self.merge_context_mut().catch_up_logs = Some(catch_up_logs); + } else { + self.finish_catch_up_logs(store_ctx, catch_up_logs); + } + } + + #[inline] + fn catch_up_logs_ready(&self, catch_up_logs: &CatchUpLogs) -> bool { + if let Some(state) = self.applied_merge_state() + && state.get_commit() == commit_of_merge(&catch_up_logs.merge) + { + assert_eq!( + state.get_target().get_id(), + catch_up_logs.target_region_id + ); + true + } else { + false + } + } + + fn maybe_append_merge_entries(&mut self, merge: &CommitMergeRequest) -> Option { + let mut entries = merge.get_entries(); + let merge_commit = commit_of_merge(merge); + if entries.is_empty() { + // Though the entries is empty, it is possible that one source peer has caught + // up the logs but commit index is not updated. If other source peers are + // already destroyed, so the raft group will not make any progress, namely the + // source peer can not get the latest commit index anymore. + // Here update the commit index to let source apply rest uncommitted entries. 
+ return if merge_commit > self.raft_group().raft.raft_log.committed { + self.raft_group_mut().raft.raft_log.commit_to(merge_commit); + Some(merge_commit) + } else { + None + }; + } + let first = entries.first().unwrap(); + // make sure message should be with index not smaller than committed + let mut log_idx = first.get_index() - 1; + debug!( + self.logger, + "append merge entries"; + "log_index" => log_idx, + "merge_commit" => merge_commit, + "commit_index" => self.raft_group().raft.raft_log.committed, + ); + if log_idx < self.raft_group().raft.raft_log.committed { + // There may be some logs not included in CommitMergeRequest's entries, like + // CompactLog, so the commit index may exceed the last index of the entires from + // CommitMergeRequest. If that, no need to append + if self.raft_group().raft.raft_log.committed - log_idx >= entries.len() as u64 { + return None; + } + entries = &entries[(self.raft_group().raft.raft_log.committed - log_idx) as usize..]; + log_idx = self.raft_group().raft.raft_log.committed; + } + let log_term = self.index_term(log_idx); + + let last_log = entries.last().unwrap(); + if last_log.term > self.term() { + // Hack: In normal flow, when leader sends the entries, it will use a term + // that's not less than the last log term. And follower will update its states + // correctly. For merge, we append the log without raft, so we have to take care + // of term explicitly to get correct metadata. 
+ info!( + self.logger, + "become follower for new logs"; + "new_log_term" => last_log.term, + "new_log_index" => last_log.index, + "term" => self.term(), + ); + self.raft_group_mut() + .raft + .become_follower(last_log.term, INVALID_ID); + } + + self.raft_group_mut() + .raft + .raft_log + .maybe_append(log_idx, log_term, merge_commit, entries) + .map(|(_, last_index)| last_index) + } + + #[inline] + fn finish_catch_up_logs(&mut self, store_ctx: &mut StoreContext, c: CatchUpLogs) { + let safe_ts = store_ctx + .store_meta + .lock() + .unwrap() + .region_read_progress + .get(&self.region_id()) + .unwrap() + .safe_ts(); + if c.tx.send(safe_ts).is_err() { + error!( + self.logger, + "failed to respond to merge target, are we shutting down?" + ); + } + self.take_merge_context(); + self.mark_for_destroy(None); + } +} + +impl Peer { + // Match v1::on_ready_commit_merge. + pub fn on_apply_res_commit_merge( + &mut self, + store_ctx: &mut StoreContext, + mut res: CommitMergeResult, + ) { + let region = res.region_state.get_region(); + assert!( + res.source.get_end_key() == region.get_end_key() + || res.source.get_start_key() == region.get_start_key() + ); + let tablet: EK = match res.tablet.downcast() { + Ok(t) => *t, + Err(t) => unreachable!("tablet type should be the same: {:?}", t), + }; + let acquired_source_safe_ts_before = res.source_safe_ts > 0; + + { + let mut meta = store_ctx.store_meta.lock().unwrap(); + if let Some(p) = meta.region_read_progress.get(&res.source.get_id()) { + res.source_safe_ts = p.safe_ts(); + } + meta.set_region(region, true, &self.logger); + let (reader, read_tablet) = meta.readers.get_mut(®ion.get_id()).unwrap(); + self.set_region( + &store_ctx.coprocessor_host, + reader, + region.clone(), + RegionChangeReason::CommitMerge, + res.index, + ); + + // Tablet should be updated in lock to match the epoch. 
+ *read_tablet = SharedReadTablet::new(tablet.clone()); + + // After the region commit merged, the region's key range is extended and the + // region's `safe_ts` should reset to `min(source_safe_ts, target_safe_ts)` + self.read_progress_mut().merge_safe_ts( + res.source_safe_ts, + res.index, + &store_ctx.coprocessor_host, + ); + self.txn_context() + .after_commit_merge(store_ctx, self.term(), region, &self.logger); + } + + // We could only have gotten safe ts by sending `CatchUpLogs` earlier. If we + // haven't, need to acknowledge that we have committed the merge, so that the + // source peer can destroy itself. Note that the timing is deliberately + // delayed after reading `store_ctx.meta` to get the source safe ts + // before its meta gets cleaned up. + if !acquired_source_safe_ts_before { + let _ = store_ctx.router.force_send( + res.source.get_id(), + PeerMsg::AckCommitMerge { + index: res.prepare_merge_index, + target_id: self.region_id(), + }, + ); + } + + if let Some(tablet) = self.set_tablet(tablet) { + self.record_tombstone_tablet(store_ctx, tablet, res.index); + } + self.record_tombstone_tablet_path(store_ctx, res.source_path, res.index); + + // make approximate size and keys updated in time. + // the reason why follower need to update is that there is a issue that after + // merge and then transfer leader, the new leader may have stale size and keys. 
+ self.force_split_check(store_ctx); + self.region_buckets_info_mut().set_bucket_stat(None); + + let region_id = self.region_id(); + self.state_changes_mut() + .put_region_state(region_id, res.index, &res.region_state) + .unwrap(); + self.storage_mut().set_region_state(res.region_state); + self.storage_mut() + .apply_trace_mut() + .on_admin_flush(res.index); + self.set_has_extra_write(); + + if self.is_leader() { + self.region_heartbeat_pd(store_ctx); + info!( + self.logger, + "notify pd with merge"; + "source_region" => ?res.source, + "target_region" => ?self.region(), + ); + self.add_pending_tick(PeerTick::SplitRegionCheck); + } + } + + // Called on source peer. + pub fn on_ack_commit_merge(&mut self, index: u64, target_id: u64) { + // We don't check it against merge state because source peer might just restart + // and haven't replayed `PrepareMerge` yet. + info!(self.logger, "destroy self on AckCommitMerge"; "index" => index, "target_id" => target_id); + self.take_merge_context(); + self.mark_for_destroy(None); + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/merge/mod.rs b/components/raftstore-v2/src/operation/command/admin/merge/mod.rs new file mode 100644 index 00000000000..0b198eec2a6 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/merge/mod.rs @@ -0,0 +1,143 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +pub mod commit; +pub mod prepare; +pub mod rollback; + +use std::path::PathBuf; + +use commit::CatchUpLogs; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; +use kvproto::{ + raft_cmdpb::RaftCmdRequest, + raft_serverpb::{MergeState, PeerState, RegionLocalState}, +}; +use prepare::PrepareStatus; +use raft::{ProgressState, INVALID_INDEX}; +use raftstore::Result; +use slog::{info, warn, Logger}; +use tikv_util::box_err; + +use crate::raft::Peer; + +pub const MERGE_SOURCE_PREFIX: &str = "merge-source"; + +// `index` is the commit index of `PrepareMergeRequest`, `commit` field of +// `CommitMergeRequest`. +fn merge_source_path( + registry: &TabletRegistry, + source_region_id: u64, + index: u64, +) -> PathBuf { + let tablet_name = registry.tablet_name(MERGE_SOURCE_PREFIX, source_region_id, index); + registry.tablet_root().join(tablet_name) +} + +/// This context is only used at source region. +#[derive(Default)] +pub struct MergeContext { + prepare_status: Option, + catch_up_logs: Option, +} + +impl MergeContext { + #[inline] + pub fn from_region_state(logger: &Logger, state: &RegionLocalState) -> Option { + if state.get_state() == PeerState::Merging { + info!(logger, "region is merging"; "region_state" => ?state); + let mut ctx = Self::default(); + ctx.prepare_status = Some(PrepareStatus::Applied(state.get_merge_state().clone())); + Some(ctx) + } else { + None + } + } + + #[inline] + pub fn maybe_take_pending_prepare(&mut self, applied: u64) -> Option { + if let Some(PrepareStatus::WaitForFence { + fence, + req, + .. + }) = self.prepare_status.as_mut() + && applied >= *fence + { + // The status will be updated during processing the proposal. + return req.take(); + } + None + } + + #[inline] + pub fn max_compact_log_index(&self) -> Option { + if let Some(PrepareStatus::WaitForFence { ctx, .. 
}) = self.prepare_status.as_ref() { + Some(ctx.min_matched) + } else { + None + } + } +} + +impl Peer { + #[inline] + pub fn update_merge_progress_on_became_follower(&mut self) { + if let Some(ctx) = self.merge_context() + && matches!(ctx.prepare_status, Some(PrepareStatus::WaitForFence { .. })) + { + self.take_merge_context(); + self.proposal_control_mut().set_pending_prepare_merge(false); + } + } + + /// Returns (minimal matched, minimal committed) + fn calculate_min_progress(&self) -> Result<(u64, u64)> { + let (mut min_m, mut min_c) = (None, None); + if let Some(progress) = self.raft_group().status().progress { + for (id, pr) in progress.iter() { + // Reject merge if there is any pending request snapshot, + // because a target region may merge a source region which is in + // an invalid state. + if pr.state == ProgressState::Snapshot + || pr.pending_request_snapshot != INVALID_INDEX + { + return Err(box_err!( + "there is a pending snapshot peer {} [{:?}], skip merge", + id, + pr + )); + } + if min_m.unwrap_or(u64::MAX) > pr.matched { + min_m = Some(pr.matched); + } + if min_c.unwrap_or(u64::MAX) > pr.committed_index { + min_c = Some(pr.committed_index); + } + } + } + let (mut min_m, min_c) = (min_m.unwrap_or(0), min_c.unwrap_or(0)); + if min_m < min_c { + warn!( + self.logger, + "min_matched < min_committed, raft progress is inaccurate"; + "min_matched" => min_m, + "min_committed" => min_c, + ); + // Reset `min_matched` to `min_committed`, since the raft log at `min_committed` + // is known to be committed in all peers, all of the peers should also have + // replicated it + min_m = min_c; + } + Ok((min_m, min_c)) + } + + #[inline] + fn applied_merge_state(&self) -> Option<&MergeState> { + self.merge_context().and_then(|ctx| { + if let Some(PrepareStatus::Applied(state)) = ctx.prepare_status.as_ref() { + Some(state) + } else { + None + } + }) + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs 
b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs new file mode 100644 index 00000000000..601b4568866 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs @@ -0,0 +1,674 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! The handling of `PrepareMerge` command. +//! +//! ## Propose (`Peer::propose_prepare_merge`) +//! +//! Checks for these requirements: +//! +//! - Validate the request. (`Peer::validate_prepare_merge_command`) +//! - Log gap between source region leader and peers is not too large. This is +//! because these logs need to be embedded in the later `CommitMerge` command. +//! - Logs that aren't fully committed (to all peers) do not contain +//! `CompactLog` or certain admin commands. +//! +//! Then, transfer all in-memory pessimistic locks to the target region as a +//! Raft proposal. To guarantee the consistency of lock serialization, we might +//! need to wait for some in-flight logs to be applied. During the wait, all +//! incoming write proposals will be rejected. Read the comments of +//! `PrepareStatus::WaitForFence` for more details. +//! +//! ## Apply (`Apply::apply_prepare_merge`) +//! +//! Increase region epoch and write the merge state. +//! +//! ## On Apply Result (`Peer::on_apply_res_prepare_merge`) +//! +//! Start the tick (`Peer::on_check_merge`) to periodically check the +//! eligibility of merge. 
+ +use std::{mem, time::Duration}; + +use collections::HashMap; +use engine_traits::{Checkpointer, KvEngine, RaftEngine, RaftLogBatch, CF_LOCK}; +use kvproto::{ + metapb::RegionEpoch, + raft_cmdpb::{ + AdminCmdType, AdminRequest, AdminResponse, CmdType, PrepareMergeRequest, PutRequest, + RaftCmdRequest, Request, + }, + raft_serverpb::{ + ExtraMessage, ExtraMessageType, MergeState, PeerState, RaftMessage, RegionLocalState, + }, +}; +use parking_lot::RwLockUpgradableReadGuard; +use protobuf::Message; +use raft::{eraftpb::EntryType, GetEntriesContext, NO_LIMIT}; +use raftstore::{ + coprocessor::RegionChangeReason, + store::{metrics::PEER_ADMIN_CMD_COUNTER, util, LocksStatus, ProposalContext, Transport}, + Error, Result, +}; +use slog::{debug, info}; +use tikv_util::{ + box_err, log::SlogFormat, slog_panic, store::region_on_same_stores, time::Instant, +}; + +use super::merge_source_path; +use crate::{ + batch::StoreContext, + fsm::ApplyResReporter, + operation::{command::parse_at, AdminCmdResult, SimpleWriteReqDecoder}, + raft::{Apply, Peer}, + router::CmdResChannel, +}; + +const TRIM_CHECK_TIMEOUT: Duration = Duration::from_secs(10); + +#[derive(Clone)] +pub struct PreProposeContext { + pub min_matched: u64, + lock_size_limit: usize, +} + +pub enum PrepareStatus { + WaitForTrimStatus { + start_time: Instant, + // Peers that we are not sure if trimmed. + pending_peers: HashMap, + req: Option, + }, + /// When a fence is present, we (1) delay the PrepareMerge + /// command `cmd` until all writes before `idx` are applied (2) reject all + /// in-coming write proposals. + /// Before proposing `PrepareMerge`, we first serialize and propose the lock + /// table. Locks marked as deleted (but not removed yet) will be + /// serialized as normal locks. + /// Thanks to the fence, we can ensure at the time of lock transfer, locks + /// are either removed (when applying logs) or won't be removed before + /// merge (the proposals to remove them are rejected). 
+ /// + /// The request can be `None` because we needs to take it out to redo the + /// propose. In the meantime the fence is needed to bypass the check. + WaitForFence { + fence: u64, + ctx: PreProposeContext, + req: Option, + }, + /// In this state, all write proposals except for `RollbackMerge` will be + /// rejected. + Applied(MergeState), +} + +#[derive(Debug)] +pub struct PrepareMergeResult { + region_state: RegionLocalState, + state: MergeState, +} + +impl Peer { + pub fn propose_prepare_merge( + &mut self, + store_ctx: &mut StoreContext, + mut req: RaftCmdRequest, + ) -> Result { + self.validate_prepare_merge_command( + store_ctx, + req.get_admin_request().get_prepare_merge(), + )?; + // We need to check three things in order: + // (1) `start_check_trim_status` + // (2) `check_logs_before_prepare_merge` + // (3) `check_pessimistic_locks` + // Check 1 and 3 are async, they yield by returning + // `Error::PendingPrepareMerge`. + let pre_propose = if let Some(r) = self.already_checked_pessimistic_locks()? { + r + } else if self.already_checked_trim_status()? { + let r = self.check_logs_before_prepare_merge(store_ctx)?; + self.check_pessimistic_locks(r, &mut req)? + } else { + return self.start_check_trim_status(store_ctx, &mut req); + }; + req.mut_admin_request() + .mut_prepare_merge() + .set_min_index(pre_propose.min_matched + 1); + let r = self + .propose_locks_before_prepare_merge(store_ctx, pre_propose.lock_size_limit) + .and_then(|_| { + let mut proposal_ctx = ProposalContext::empty(); + proposal_ctx.insert(ProposalContext::PREPARE_MERGE); + let data = req.write_to_bytes().unwrap(); + self.propose_with_ctx(store_ctx, data, proposal_ctx.to_vec()) + }); + if r.is_ok() { + self.proposal_control_mut().set_pending_prepare_merge(false); + } else { + // Match v1::post_propose_fail. + // If we just failed to propose PrepareMerge, the pessimistic locks status + // may become MergingRegion incorrectly. So, we have to revert it here. 
+ // Note: The `is_merging` check from v1 is removed because proposed + // `PrepareMerge` rejects all writes (in `ProposalControl::check_conflict`). + assert!( + !self.proposal_control().is_merging(), + "{}", + SlogFormat(&self.logger) + ); + self.take_merge_context(); + self.proposal_control_mut().set_pending_prepare_merge(false); + let mut pessimistic_locks = self.txn_context().ext().pessimistic_locks.write(); + if pessimistic_locks.status == LocksStatus::MergingRegion { + pessimistic_locks.status = LocksStatus::Normal; + } + } + r + } + + /// Match v1::check_merge_proposal. + /// - Target region epoch as requested is identical with the local version. + /// - Target region is a sibling to the source region. + /// - Peers of both source and target region are aligned, i.e. located on + /// the same set of stores. + fn validate_prepare_merge_command( + &mut self, + store_ctx: &mut StoreContext, + req: &PrepareMergeRequest, + ) -> Result<()> { + // Just for simplicity, do not start region merge while in joint state + if self.in_joint_state() { + return Err(box_err!( + "{} region in joint state, can not propose merge command, command: {:?}", + SlogFormat(&self.logger), + req + )); + } + let region = self.region(); + let target_region = req.get_target(); + { + let store_meta = store_ctx.store_meta.lock().unwrap(); + match store_meta.regions.get(&target_region.get_id()) { + Some((region, _)) if *region != *target_region => { + return Err(box_err!( + "target region not matched, skip proposing: {:?} != {:?}", + region, + target_region + )); + } + None => { + return Err(box_err!( + "target region {} doesn't exist.", + target_region.get_id() + )); + } + _ => {} + } + } + + if !util::is_sibling_regions(target_region, region) { + return Err(box_err!( + "{:?} and {:?} are not sibling, skip proposing.", + target_region, + region + )); + } + if !region_on_same_stores(target_region, region) { + return Err(box_err!( + "peers doesn't match {:?} != {:?}, reject merge", + 
region.get_peers(), + target_region.get_peers() + )); + } + Ok(()) + } + + // Match v1::pre_propose_prepare_merge. + fn check_logs_before_prepare_merge( + &mut self, + store_ctx: &mut StoreContext, + ) -> Result { + let last_index = self.raft_group().raft.raft_log.last_index(); + let (min_matched, min_committed) = self.calculate_min_progress()?; + if min_matched == 0 + || min_committed == 0 + || last_index - min_matched > store_ctx.cfg.merge_max_log_gap + || last_index - min_committed > store_ctx.cfg.merge_max_log_gap * 2 + || min_matched < self.last_sent_snapshot_index() + { + return Err(box_err!( + "log gap too large, skip merge: matched: {}, committed: {}, last index: {}", + min_matched, + min_committed, + last_index + )); + } + let mut entry_size = 0; + for entry in self.raft_group().raft.raft_log.entries( + min_committed + 1, + NO_LIMIT, + GetEntriesContext::empty(false), + )? { + // commit merge only contains entries start from min_matched + 1 + if entry.index > min_matched { + entry_size += entry.get_data().len(); + } + if entry.get_entry_type() == EntryType::EntryConfChange + || entry.get_entry_type() == EntryType::EntryConfChangeV2 + { + return Err(box_err!( + "{} log gap contains conf change, skip merging.", + "tag" + )); + } + if entry.get_data().is_empty() { + continue; + } + let Err(cmd) = SimpleWriteReqDecoder::new( + |buf, index, term| parse_at(&self.logger, buf, index, term), + &self.logger, + entry.get_data(), + entry.get_index(), + entry.get_term(), + ) else { continue }; + let cmd_type = cmd.get_admin_request().get_cmd_type(); + match cmd_type { + AdminCmdType::TransferLeader + | AdminCmdType::ComputeHash + | AdminCmdType::VerifyHash + | AdminCmdType::InvalidAdmin => continue, + _ => {} + } + // Any command that can change epoch or log gap should be rejected. 
+ return Err(box_err!( + "log gap contains admin request {:?}, skip merging.", + cmd_type + )); + } + let entry_size_limit = store_ctx.cfg.raft_entry_max_size.0 as usize * 9 / 10; + if entry_size > entry_size_limit { + return Err(box_err!( + "log gap size exceed entry size limit, skip merging." + )); + }; + Ok(PreProposeContext { + min_matched, + lock_size_limit: entry_size_limit - entry_size, + }) + } + + fn start_check_trim_status( + &mut self, + store_ctx: &mut StoreContext, + req: &mut RaftCmdRequest, + ) -> Result { + if self.storage().has_dirty_data() { + return Err(box_err!( + "source peer {} not trimmed, skip merging.", + self.peer_id() + )); + } + let target = req.get_admin_request().get_prepare_merge().get_target(); + let mut pending_peers = HashMap::default(); + for region in [self.region(), target] { + for p in region.get_peers() { + if p.get_id() == self.peer_id() { + continue; + } + let mut msg = RaftMessage::default(); + msg.set_region_id(region.get_id()); + msg.set_from_peer(self.peer().clone()); + msg.set_to_peer(p.clone()); + msg.set_region_epoch(region.get_region_epoch().clone()); + msg.mut_extra_msg() + .set_type(ExtraMessageType::MsgAvailabilityRequest); + msg.mut_extra_msg() + .mut_availability_context() + .set_from_region_id(self.region_id()); + store_ctx.trans.send(msg)?; + pending_peers.insert(p.get_id(), region.get_region_epoch().clone()); + } + } + + let status = &mut self.merge_context_mut().prepare_status; + // Shouldn't enter this call if trim check is already underway. + assert!(status.is_none()); + *status = Some(PrepareStatus::WaitForTrimStatus { + start_time: Instant::now_coarse(), + pending_peers, + req: Some(mem::take(req)), + }); + Err(Error::PendingPrepareMerge) + } + + pub fn merge_on_availability_response( + &mut self, + store_ctx: &mut StoreContext, + from_peer: u64, + resp: &ExtraMessage, + ) { + if self.merge_context().is_some() + && let Some(PrepareStatus::WaitForTrimStatus { pending_peers, req, .. 
}) = self + .merge_context_mut() + .prepare_status + .as_mut() + && req.is_some() + { + assert!(resp.has_availability_context()); + let from_region = resp.get_availability_context().get_from_region_id(); + let from_epoch = resp.get_availability_context().get_from_region_epoch(); + let trimmed = resp.get_availability_context().get_trimmed(); + if let Some(epoch) = pending_peers.get(&from_peer) + && util::is_region_epoch_equal(from_epoch, epoch) + { + if !trimmed { + info!( + self.logger, + "cancel merge because source peer is not trimmed"; + "region_id" => from_region, + "peer_id" => from_peer, + ); + self.take_merge_context(); + return; + } else { + pending_peers.remove(&from_peer); + } + } + if pending_peers.is_empty() { + let (ch, _) = CmdResChannel::pair(); + let req = req.take().unwrap(); + self.on_admin_command(store_ctx, req, ch); + } + } + } + + fn already_checked_trim_status(&mut self) -> Result { + match self + .merge_context() + .as_ref() + .and_then(|c| c.prepare_status.as_ref()) + { + Some(PrepareStatus::WaitForTrimStatus { pending_peers, .. }) => { + if pending_peers.is_empty() { + Ok(true) + } else { + Err(Error::PendingPrepareMerge) + } + } + None => Ok(false), + // Shouldn't reach here after calling `already_checked_pessimistic_locks` first. + _ => unreachable!(), + } + } + + fn check_pessimistic_locks( + &mut self, + ctx: PreProposeContext, + req: &mut RaftCmdRequest, + ) -> Result { + let has_locks = { + let pessimistic_locks = self.txn_context().ext().pessimistic_locks.read(); + if pessimistic_locks.status != LocksStatus::Normal { + // If `status` is not `Normal`, it means the in-memory pessimistic locks are + // being transferred, probably triggered by transferring leader. In this case, + // we abort merging to simplify the situation. 
+ return Err(box_err!( + "pessimistic locks status is {:?}, skip merging.", + pessimistic_locks.status + )); + } + !pessimistic_locks.is_empty() + }; + let last_index = self.raft_group().raft.raft_log.last_index(); + if has_locks && self.entry_storage().applied_index() < last_index { + self.merge_context_mut().prepare_status = Some(PrepareStatus::WaitForFence { + fence: last_index, + ctx, + req: Some(mem::take(req)), + }); + self.proposal_control_mut().set_pending_prepare_merge(true); + info!( + self.logger, + "start rejecting new proposals before prepare merge"; + "prepare_merge_fence" => last_index + ); + return Err(Error::PendingPrepareMerge); + } + Ok(ctx) + } + + fn already_checked_pessimistic_locks(&mut self) -> Result> { + let applied_index = self.entry_storage().applied_index(); + match self + .merge_context() + .as_ref() + .and_then(|c| c.prepare_status.as_ref()) + { + Some(PrepareStatus::WaitForFence { fence, ctx, .. }) => { + if applied_index < *fence { + info!( + self.logger, + "suspend PrepareMerge because applied_index has not reached prepare_merge_fence"; + "applied_index" => applied_index, + "prepare_merge_fence" => fence, + ); + Err(Error::PendingPrepareMerge) + } else { + Ok(Some(ctx.clone())) + } + } + Some(PrepareStatus::Applied(state)) => Err(box_err!( + "another merge is in-progress, merge_state: {:?}.", + state + )), + _ => Ok(None), + } + } + + #[inline] + pub fn maybe_clean_up_stale_merge_context(&mut self) { + // Check if there's a stale trim check. Ideally this should be implemented as a + // tick. But this is simpler. + if let Some(PrepareStatus::WaitForTrimStatus { + start_time, req, .. 
+ }) = self + .merge_context() + .as_ref() + .and_then(|c| c.prepare_status.as_ref()) + && req.is_some() + && start_time.saturating_elapsed() > TRIM_CHECK_TIMEOUT + { + info!(self.logger, "cancel merge because trim check timed out"); + self.take_merge_context(); + } + } + + /// Called after some new entries have been applied and the fence can + /// probably be lifted. + pub fn retry_pending_prepare_merge( + &mut self, + store_ctx: &mut StoreContext, + applied_index: u64, + ) { + if self.merge_context().is_none() { + return; + } + // Check the fence. + if let Some(req) = self + .merge_context_mut() + .maybe_take_pending_prepare(applied_index) + { + let (ch, _) = CmdResChannel::pair(); + self.on_admin_command(store_ctx, req, ch); + } + } + + fn propose_locks_before_prepare_merge( + &mut self, + store_ctx: &mut StoreContext, + size_limit: usize, + ) -> Result<()> { + let pessimistic_locks = self.txn_context().ext().pessimistic_locks.upgradable_read(); + if pessimistic_locks.is_empty() { + let mut pessimistic_locks = RwLockUpgradableReadGuard::upgrade(pessimistic_locks); + pessimistic_locks.status = LocksStatus::MergingRegion; + return Ok(()); + } + + // The proposed pessimistic locks here will also be carried in CommitMerge. + // Check the size to avoid CommitMerge exceeding the size limit of a raft entry. + // This check is a inaccurate check. We will check the size again accurately + // later using the protobuf encoding. 
+ if pessimistic_locks.memory_size > size_limit { + return Err(box_err!( + "pessimistic locks size {} exceed size limit {}, skip merging.", + pessimistic_locks.memory_size, + size_limit + )); + } + + let mut cmd = RaftCmdRequest::default(); + for (key, (lock, _deleted)) in &*pessimistic_locks { + let mut put = PutRequest::default(); + put.set_cf(CF_LOCK.to_string()); + put.set_key(key.as_encoded().to_owned()); + put.set_value(lock.to_lock().to_bytes()); + let mut req = Request::default(); + req.set_cmd_type(CmdType::Put); + req.set_put(put); + cmd.mut_requests().push(req); + } + cmd.mut_header().set_region_id(self.region_id()); + cmd.mut_header() + .set_region_epoch(self.region().get_region_epoch().clone()); + cmd.mut_header().set_peer(self.peer().clone()); + let proposal_size = cmd.compute_size(); + if proposal_size as usize > size_limit { + return Err(box_err!( + "pessimistic locks size {} exceed size limit {}, skip merging.", + proposal_size, + size_limit + )); + } + + { + let mut pessimistic_locks = RwLockUpgradableReadGuard::upgrade(pessimistic_locks); + pessimistic_locks.status = LocksStatus::MergingRegion; + } + debug!( + self.logger, + "propose {} pessimistic locks before prepare merge", + cmd.get_requests().len(); + ); + self.propose(store_ctx, cmd.write_to_bytes().unwrap())?; + Ok(()) + } +} + +impl Apply { + // Match v1::exec_prepare_merge. + pub fn apply_prepare_merge( + &mut self, + req: &AdminRequest, + log_index: u64, + ) -> Result<(AdminResponse, AdminCmdResult)> { + PEER_ADMIN_CMD_COUNTER.prepare_merge.all.inc(); + + let prepare_merge = req.get_prepare_merge(); + let index = prepare_merge.get_min_index(); + // Note: the check against first_index is removed in v2. + let mut region = self.region().clone(); + let region_version = region.get_region_epoch().get_version() + 1; + region.mut_region_epoch().set_version(region_version); + // In theory conf version should not be increased when executing prepare_merge. 
+ // However, we don't want to do conf change after prepare_merge is committed. + // This can also be done by iterating all proposal to find if prepare_merge is + // proposed before proposing conf change, but it make things complicated. + // Another way is make conf change also check region version, but this is not + // backward compatible. + let conf_version = region.get_region_epoch().get_conf_ver() + 1; + region.mut_region_epoch().set_conf_ver(conf_version); + let mut merging_state = MergeState::default(); + merging_state.set_min_index(index); + merging_state.set_target(prepare_merge.get_target().to_owned()); + merging_state.set_commit(log_index); + + self.region_state_mut().set_region(region.clone()); + self.region_state_mut().set_state(PeerState::Merging); + assert!( + !self.region_state().has_merge_state(), + "{:?}", + self.region_state() + ); + self.region_state_mut() + .set_merge_state(merging_state.clone()); + + PEER_ADMIN_CMD_COUNTER.prepare_merge.success.inc(); + + let _ = self.flush(); + let tablet = self.tablet().clone(); + let mut checkpointer = tablet.new_checkpointer().unwrap_or_else(|e| { + slog_panic!( + self.logger, + "fails to create checkpoint object"; + "error" => ?e + ) + }); + let reg = self.tablet_registry(); + let path = merge_source_path(reg, self.region_id(), log_index); + // We might be replaying this command. + if !path.exists() { + checkpointer.create_at(&path, None, 0).unwrap_or_else(|e| { + slog_panic!( + self.logger, + "fails to create checkpoint"; + "path" => %path.display(), + "error" => ?e + ) + }); + } + + Ok(( + AdminResponse::default(), + AdminCmdResult::PrepareMerge(PrepareMergeResult { + region_state: self.region_state().clone(), + state: merging_state, + }), + )) + } +} + +impl Peer { + // Match v1::on_ready_prepare_merge. 
+ pub fn on_apply_res_prepare_merge( + &mut self, + store_ctx: &mut StoreContext, + res: PrepareMergeResult, + ) { + let region = res.region_state.get_region().clone(); + { + let mut meta = store_ctx.store_meta.lock().unwrap(); + meta.set_region(&region, true, &self.logger); + let (reader, _) = meta.readers.get_mut(&region.get_id()).unwrap(); + self.set_region( + &store_ctx.coprocessor_host, + reader, + region, + RegionChangeReason::PrepareMerge, + res.state.get_commit(), + ); + } + + self.storage_mut() + .set_region_state(res.region_state.clone()); + let region_id = self.region_id(); + self.state_changes_mut() + .put_region_state(region_id, res.state.get_commit(), &res.region_state) + .unwrap(); + self.set_has_extra_write(); + + self.proposal_control_mut() + .enter_prepare_merge(res.state.get_commit()); + self.merge_context_mut().prepare_status = Some(PrepareStatus::Applied(res.state)); + + self.start_commit_merge(store_ctx); + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs b/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs new file mode 100644 index 00000000000..ab571298bb0 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs @@ -0,0 +1,12 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{KvEngine, RaftEngine}; +use slog::warn; + +use crate::raft::Peer; + +impl Peer { + pub fn on_reject_commit_merge(&mut self, index: u64) { + warn!(self.logger, "target peer rejected commit merge"; "index" => index); + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs new file mode 100644 index 00000000000..69c9b39aaa2 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -0,0 +1,289 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +mod compact_log; +mod conf_change; +mod merge; +mod split; +mod transfer_leader; + +pub use compact_log::CompactLogContext; +use compact_log::CompactLogResult; +use conf_change::{ConfChangeResult, UpdateGcPeersResult}; +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::{ + metapb::PeerRole, + raft_cmdpb::{AdminCmdType, RaftCmdRequest}, + raft_serverpb::{ExtraMessageType, FlushMemtable, RaftMessage}, +}; +use merge::{commit::CommitMergeResult, prepare::PrepareMergeResult}; +pub use merge::{ + commit::{CatchUpLogs, MERGE_IN_PROGRESS_PREFIX}, + MergeContext, MERGE_SOURCE_PREFIX, +}; +use protobuf::Message; +use raftstore::{ + store::{ + cmd_resp, + fsm::{apply, apply::validate_batch_split}, + msg::ErrorCallback, + Transport, + }, + Error, +}; +use slog::{error, info}; +use split::SplitResult; +pub use split::{ + report_split_init_finish, temp_split_path, RequestHalfSplit, RequestSplit, SplitFlowControl, + SplitInit, SPLIT_PREFIX, +}; +use tikv_util::{box_err, log::SlogFormat}; +use txn_types::WriteBatchFlags; + +use crate::{ + batch::StoreContext, + raft::Peer, + router::{CmdResChannel, PeerMsg, RaftRequest}, +}; + +#[derive(Debug)] +pub enum AdminCmdResult { + // No side effect produced by the command + None, + SplitRegion(SplitResult), + ConfChange(ConfChangeResult), + TransferLeader(u64), + CompactLog(CompactLogResult), + UpdateGcPeers(UpdateGcPeersResult), + PrepareMerge(PrepareMergeResult), + CommitMerge(CommitMergeResult), +} + +impl Peer { + #[inline] + pub fn on_admin_command( + &mut self, + ctx: &mut StoreContext, + mut req: RaftCmdRequest, + ch: CmdResChannel, + ) { + if !self.serving() { + apply::notify_req_region_removed(self.region_id(), ch); + return; + } + if !req.has_admin_request() { + let e = box_err!( + "{} expect only execute admin command", + SlogFormat(&self.logger) + ); + let resp = cmd_resp::new_error(e); + ch.report_error(resp); + return; + } + if let Err(e) = ctx.coprocessor_host.pre_propose(self.region(), &mut req) { + let resp = 
cmd_resp::new_error(e.into()); + ch.report_error(resp); + return; + } + let cmd_type = req.get_admin_request().get_cmd_type(); + if let Err(e) = + self.validate_command(req.get_header(), Some(cmd_type), &mut ctx.raft_metrics) + { + let resp = cmd_resp::new_error(e); + ch.report_error(resp); + return; + } + + let pre_transfer_leader = cmd_type == AdminCmdType::TransferLeader + && !WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) + .contains(WriteBatchFlags::TRANSFER_LEADER_PROPOSAL); + + // The admin request is rejected because it may need to update epoch checker + // which introduces an uncertainty and may breaks the correctness of epoch + // checker. + // As pre transfer leader is just a warmup phase, applying to the current term + // is not required. + if !self.applied_to_current_term() && !pre_transfer_leader { + let e = box_err!( + "{} peer has not applied to current term, applied_term {}, current_term {}", + SlogFormat(&self.logger), + self.storage().entry_storage().applied_term(), + self.term() + ); + let resp = cmd_resp::new_error(e); + ch.report_error(resp); + return; + } + if let Some(conflict) = self.proposal_control_mut().check_conflict(Some(cmd_type)) { + conflict.delay_channel(ch); + return; + } + if self.proposal_control().has_pending_prepare_merge() + && cmd_type != AdminCmdType::PrepareMerge + || self.proposal_control().is_merging() && cmd_type != AdminCmdType::RollbackMerge + { + let resp = cmd_resp::new_error(Error::ProposalInMergingMode(self.region_id())); + ch.report_error(resp); + return; + } + // To maintain propose order, we need to make pending proposal first. + self.propose_pending_writes(ctx); + let res = if apply::is_conf_change_cmd(&req) { + self.propose_conf_change(ctx, req) + } else { + // propose other admin command. + match cmd_type { + AdminCmdType::Split => Err(box_err!( + "Split is deprecated. Please use BatchSplit instead." 
+ )), + AdminCmdType::BatchSplit => { + #[allow(clippy::question_mark)] + if let Err(err) = validate_batch_split(req.get_admin_request(), self.region()) { + Err(err) + } else { + // To reduce the impact of the expensive operation of `checkpoint` (it will + // flush memtables of the rocksdb) in applying batch split, we split the + // BatchSplit cmd into two phases: + // + // 1. Schedule flush memtable task so that the memtables of the rocksdb can + // be flushed in advance in a way that will not block the normal raft + // operations (`checkpoint` will still cause flush but it will be + // significantly lightweight). At the same time, send flush memtable msgs to + // the follower so that they can flush memtalbes in advance too. + // + // 2. When the task finishes, it will propose a batch split with + // `PRE_FLUSH_FINISHED` flag. + if !WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) + .contains(WriteBatchFlags::PRE_FLUSH_FINISHED) + { + if self.tablet_being_flushed() { + return; + } + + let region_id = self.region().get_id(); + self.set_tablet_being_flushed(true); + info!( + self.logger, + "Schedule flush tablet"; + ); + + let mailbox = match ctx.router.mailbox(region_id) { + Some(mailbox) => mailbox, + None => { + // None means the node is shutdown concurrently and thus the + // mailboxes in router have been cleared + assert!( + ctx.router.is_shutdown(), + "{} router should have been closed", + SlogFormat(&self.logger) + ); + return; + } + }; + + let logger = self.logger.clone(); + let on_flush_finish = move || { + req.mut_header() + .set_flags(WriteBatchFlags::PRE_FLUSH_FINISHED.bits()); + if let Err(e) = mailbox + .try_send(PeerMsg::AdminCommand(RaftRequest::new(req, ch))) + { + error!( + logger, + "send split request fail after pre-flush finished"; + "err" => ?e, + ); + } + }; + + if let Err(e) = + ctx.schedulers.tablet.schedule(crate::TabletTask::Flush { + region_id, + cb: Some(Box::new(on_flush_finish)), + }) + { + error!( + self.logger, + 
"Fail to schedule flush task"; + "err" => ?e, + ) + } + + // Notify followers to flush their relevant memtables + let peers = self.region().get_peers().to_vec(); + for p in peers { + if p == *self.peer() + || p.get_role() != PeerRole::Voter + || p.is_witness + { + continue; + } + let mut msg = RaftMessage::default(); + msg.set_region_id(region_id); + msg.set_from_peer(self.peer().clone()); + msg.set_to_peer(p.clone()); + msg.set_region_epoch(self.region().get_region_epoch().clone()); + let extra_msg = msg.mut_extra_msg(); + extra_msg.set_type(ExtraMessageType::MsgFlushMemtable); + let mut flush_memtable = FlushMemtable::new(); + flush_memtable.set_region_id(region_id); + extra_msg.set_flush_memtable(flush_memtable); + + self.send_raft_message(ctx, msg); + } + + return; + } + + info!( + self.logger, + "Propose split"; + ); + self.set_tablet_being_flushed(false); + self.propose_split(ctx, req) + } + } + AdminCmdType::TransferLeader => { + // Containing TRANSFER_LEADER_PROPOSAL flag means the this transfer leader + // request should be proposed to the raft group + if WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) + .contains(WriteBatchFlags::TRANSFER_LEADER_PROPOSAL) + { + let data = req.write_to_bytes().unwrap(); + self.propose(ctx, data) + } else { + if self.propose_transfer_leader(ctx, req, ch) { + self.set_has_ready(); + } + return; + } + } + AdminCmdType::CompactLog => self.propose_compact_log(ctx, req), + AdminCmdType::UpdateGcPeer => { + let data = req.write_to_bytes().unwrap(); + self.propose(ctx, data) + } + AdminCmdType::PrepareMerge => self.propose_prepare_merge(ctx, req), + AdminCmdType::CommitMerge => self.propose_commit_merge(ctx, req), + _ => unimplemented!(), + } + }; + match &res { + Ok(index) => { + self.proposal_control_mut() + .record_proposed_admin(cmd_type, *index); + if self.proposal_control_mut().has_uncommitted_admin() { + self.raft_group_mut().skip_bcast_commit(false); + } + } + Err(e) => { + info!( + self.logger, + "failed 
to propose admin command"; + "cmd_type" => ?cmd_type, + "error" => ?e, + ); + } + } + self.post_propose_command(ctx, res, vec![ch], true); + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs new file mode 100644 index 00000000000..4c6fdad3aa2 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -0,0 +1,1237 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains batch split related processing logic. +//! +//! Process Overview +//! +//! Propose: +//! - Nothing special except for validating batch split requests (e.g. split keys +//! are in ascending order). +//! +//! Apply: +//! - apply_batch_split: Create and initialize metapb::region for split regions +//! and the derived region. Then, create checkpoints of the current tablet for +//! split regions and derived region to make the tablets physically isolated. Update +//! the parent region's region state without persisting it. Send the new regions +//! (including derived region) back to raftstore. +//! +//! On Apply Result: +//! - on_apply_res_split: Update the relevant in-memory meta info of the +//! parent peer, then send to the store the relevant info needed to create and +//! initialize the split regions. +//! +//! Split peer creation and initialization: +//! - on_split_init: In normal cases, the uninitialized split region will be +//! created by the store, and here init it using the data sent from the parent +//! peer.
+ +use std::{any::Any, borrow::Cow, cmp, path::PathBuf}; + +use collections::HashSet; +use crossbeam::channel::SendError; +use engine_traits::{ + Checkpointer, KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, +}; +use fail::fail_point; +use kvproto::{ + metapb::{self, Region, RegionEpoch}, + pdpb::CheckPolicy, + raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, SplitRequest}, + raft_serverpb::RaftSnapshotData, +}; +use protobuf::Message; +use raft::{prelude::Snapshot, INVALID_ID}; +use raftstore::{ + coprocessor::RegionChangeReason, + store::{ + cmd_resp, + fsm::{apply::validate_batch_split, ApplyMetrics}, + metrics::PEER_ADMIN_CMD_COUNTER, + snap::TABLET_SNAPSHOT_VERSION, + util::{self, KeysInfoFormatter}, + PeerPessimisticLocks, SplitCheckTask, Transport, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, + }, + Result, +}; +use slog::{error, info, warn}; +use tikv_util::{log::SlogFormat, slog_panic, time::Instant}; + +use crate::{ + batch::StoreContext, + fsm::{ApplyResReporter, PeerFsmDelegate}, + operation::{AdminCmdResult, SharedReadTablet}, + raft::{Apply, Peer}, + router::{CmdResChannel, PeerMsg, PeerTick, StoreMsg}, + worker::tablet, + Error, +}; + +pub const SPLIT_PREFIX: &str = "split"; + +#[derive(Debug)] +pub struct SplitResult { + pub regions: Vec, + // The index of the derived region in `regions` + pub derived_index: usize, + pub tablet_index: u64, + // Hack: in common case we should use generic, but split is an infrequent + // event that performance is not critical. And using `Any` can avoid polluting + // all existing code. 
+ tablet: Box, +} + +#[derive(Debug)] +pub struct SplitInit { + /// Split region + pub region: metapb::Region, + pub check_split: bool, + pub scheduled: bool, + pub derived_leader: bool, + pub derived_region_id: u64, + + /// In-memory pessimistic locks that should be inherited from parent region + pub locks: PeerPessimisticLocks, + approximate_size: Option, + approximate_keys: Option, +} + +impl SplitInit { + fn to_snapshot(&self) -> Snapshot { + let mut snapshot = Snapshot::default(); + // Set snapshot metadata. + snapshot.mut_metadata().set_term(RAFT_INIT_LOG_TERM); + snapshot.mut_metadata().set_index(RAFT_INIT_LOG_INDEX); + let conf_state = util::conf_state_from_region(&self.region); + snapshot.mut_metadata().set_conf_state(conf_state); + // Set snapshot data. + let mut snap_data = RaftSnapshotData::default(); + snap_data.set_region(self.region.clone()); + snap_data.set_version(TABLET_SNAPSHOT_VERSION); + snap_data.mut_meta().set_for_balance(false); + snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); + snapshot + } +} + +pub fn report_split_init_finish( + ctx: &mut StoreContext, + derived_region_id: u64, + finish_region_id: u64, + cleanup: bool, +) where + EK: KvEngine, + ER: RaftEngine, +{ + let _ = ctx.router.force_send( + derived_region_id, + PeerMsg::SplitInitFinish(finish_region_id), + ); + if !cleanup { + return; + } + + if let Err(e) = ctx + .schedulers + .tablet + .schedule(tablet::Task::direct_destroy_path(temp_split_path( + &ctx.tablet_registry, + finish_region_id, + ))) + { + error!(ctx.logger, "failed to destroy split init temp"; "error" => ?e); + } +} + +#[derive(Debug)] +pub struct RequestSplit { + pub epoch: RegionEpoch, + pub split_keys: Vec>, + pub source: Cow<'static, str>, +} + +#[derive(Debug)] +pub struct RequestHalfSplit { + pub epoch: RegionEpoch, + pub start_key: Option>, + pub end_key: Option>, + pub policy: CheckPolicy, + pub source: Cow<'static, str>, +} + +#[derive(Default, Debug)] +pub struct SplitFlowControl { + 
size_diff_hint: i64, + skip_split_count: u64, + may_skip_split_check: bool, + approximate_size: Option, + approximate_keys: Option, +} + +impl SplitFlowControl { + #[inline] + pub fn approximate_size(&self) -> Option { + self.approximate_size + } + + #[inline] + pub fn approximate_keys(&self) -> Option { + self.approximate_keys + } +} + +pub fn temp_split_path(registry: &TabletRegistry, region_id: u64) -> PathBuf { + let tablet_name = registry.tablet_name(SPLIT_PREFIX, region_id, RAFT_INIT_LOG_INDEX); + registry.tablet_root().join(tablet_name) +} + +impl PeerFsmDelegate<'_, EK, ER, T> { + pub fn on_split_region_check(&mut self) { + if !self.fsm.peer_mut().on_split_region_check(self.store_ctx) { + self.schedule_tick(PeerTick::SplitRegionCheck); + } + } +} + +impl Peer { + /// Handle split check. + /// + /// Returns true means the check tick is consumed, no need to schedule + /// another tick. + pub fn on_split_region_check(&mut self, ctx: &mut StoreContext) -> bool { + if !self.is_leader() { + return true; + } + let is_generating_snapshot = self.storage().is_generating_snapshot(); + let control = self.split_flow_control_mut(); + if control.may_skip_split_check + && control.size_diff_hint < ctx.cfg.region_split_check_diff().0 as i64 + { + return true; + } + if ctx.schedulers.split_check.is_busy() { + return false; + } + if is_generating_snapshot && control.skip_split_count < 3 { + control.skip_split_count += 1; + return false; + } + // todo: the suspected buckets range should generated by the diff write bytes. + // it will be done in next pr. 
+ let task = SplitCheckTask::split_check( + self.region().clone(), + true, + CheckPolicy::Scan, + self.gen_bucket_range_for_update(ctx), + ); + if let Err(e) = ctx.schedulers.split_check.schedule(task) { + info!(self.logger, "failed to schedule split check"; "err" => ?e); + } + let control = self.split_flow_control_mut(); + control.may_skip_split_check = true; + control.size_diff_hint = 0; + control.skip_split_count = 0; + false + } + + pub fn on_update_region_size(&mut self, size: u64) { + self.split_flow_control_mut().approximate_size = Some(size); + self.add_pending_tick(PeerTick::SplitRegionCheck); + self.add_pending_tick(PeerTick::PdHeartbeat); + } + + pub fn on_update_region_keys(&mut self, keys: u64) { + self.split_flow_control_mut().approximate_keys = Some(keys); + self.add_pending_tick(PeerTick::SplitRegionCheck); + self.add_pending_tick(PeerTick::PdHeartbeat); + } + + pub fn on_clear_region_size(&mut self) { + let control = self.split_flow_control_mut(); + control.approximate_size.take(); + control.approximate_keys.take(); + self.add_pending_tick(PeerTick::SplitRegionCheck); + } + + pub fn update_split_flow_control(&mut self, metrics: &ApplyMetrics) { + let control = self.split_flow_control_mut(); + control.size_diff_hint += metrics.size_diff_hint; + if self.is_leader() { + self.add_pending_tick(PeerTick::SplitRegionCheck); + } + } + + pub fn force_split_check(&mut self, ctx: &mut StoreContext) { + let control = self.split_flow_control_mut(); + control.size_diff_hint = ctx.cfg.region_split_check_diff().0 as i64; + self.add_pending_tick(PeerTick::SplitRegionCheck); + } + + pub fn on_request_split( + &mut self, + ctx: &mut StoreContext, + rs: RequestSplit, + ch: CmdResChannel, + ) { + info!( + self.logger, + "on split"; + "split_keys" => %KeysInfoFormatter(rs.split_keys.iter()), + "source" => %&rs.source, + ); + if !self.is_leader() { + // region on this store is no longer leader, skipped. 
+ info!(self.logger, "not leader, skip."); + ch.set_result(cmd_resp::new_error(Error::NotLeader( + self.region_id(), + self.leader(), + ))); + return; + } + if let Err(e) = util::validate_split_region( + self.region_id(), + self.peer_id(), + self.region(), + &rs.epoch, + &rs.split_keys, + ) { + info!(self.logger, "invalid split request"; "err" => ?e, "source" => %&rs.source); + ch.set_result(cmd_resp::new_error(e)); + return; + } + self.ask_batch_split_pd(ctx, rs.split_keys, ch); + } + + pub fn on_request_half_split( + &mut self, + ctx: &mut StoreContext, + rhs: RequestHalfSplit, + _ch: CmdResChannel, + ) { + let is_key_range = rhs.start_key.is_some() && rhs.end_key.is_some(); + info!( + self.logger, + "on half split"; + "is_key_range" => is_key_range, + "policy" => ?rhs.policy, + "source" => ?rhs.source, + ); + if !self.is_leader() { + // region on this store is no longer leader, skipped. + info!(self.logger, "not leader, skip."); + return; + } + + let region = self.region(); + if util::is_epoch_stale(&rhs.epoch, region.get_region_epoch()) { + warn!( + self.logger, + "receive a stale halfsplit message"; + "is_key_range" => is_key_range, + ); + return; + } + + // Do not check the bucket ranges if we want to split the region with a given + // key range, this is to avoid compatibility issues. + let split_check_bucket_ranges = if !is_key_range { + self.gen_bucket_range_for_update(ctx) + } else { + None + }; + + let task = SplitCheckTask::split_check_key_range( + region.clone(), + rhs.start_key, + rhs.end_key, + false, + rhs.policy, + split_check_bucket_ranges, + ); + if let Err(e) = ctx.schedulers.split_check.schedule(task) { + error!( + self.logger, + "failed to schedule split check"; + "is_key_range" => is_key_range, + "err" => %e, + ); + } + } + + pub fn propose_split( + &mut self, + store_ctx: &mut StoreContext, + req: RaftCmdRequest, + ) -> Result { + // We rely on ConflictChecker to detect conflicts, so no need to set proposal + // context. 
+ let data = req.write_to_bytes().unwrap(); + self.propose(store_ctx, data) + } +} + +impl Apply { + /// Applies a legacy single-key `Split` admin command. + /// + /// The request is converted into an equivalent batch split carrying one + /// split request (keeping its `right_derive` flag) and then handed to + /// `apply_batch_split`. + pub fn apply_split( + &mut self, + req: &AdminRequest, + log_index: u64, + ) -> Result<(AdminResponse, AdminCmdResult)> { + info!( + self.logger, + "split is deprecated, redirect to use batch split"; + ); + let split = req.get_split().to_owned(); + let mut admin_req = AdminRequest::default(); + admin_req + .mut_splits() + .set_right_derive(split.get_right_derive()); + admin_req.mut_splits().mut_requests().push(split); + // This method is executed only when there are unapplied entries after being + // restarted. So there will be no callback, it's OK to return a response + // that does not match its request. + // + // Forward the converted request: `apply_batch_split` reads `get_splits()`, + // which is only populated on `admin_req`, not on the original `req`. + self.apply_batch_split(&admin_req, log_index) + } + + pub fn apply_batch_split( + &mut self, + req: &AdminRequest, + log_index: u64, + ) -> Result<(AdminResponse, AdminCmdResult)> { + fail_point!( + "on_apply_batch_split", + self.peer().get_store_id() == 3, + |_| { unreachable!() } + ); + PEER_ADMIN_CMD_COUNTER.batch_split.all.inc(); + + let region = self.region(); + let region_id = region.get_id(); + validate_batch_split(req, self.region())?; + + let mut boundaries: Vec<&[u8]> = Vec::default(); + boundaries.push(self.region().get_start_key()); + for req in req.get_splits().get_requests() { + boundaries.push(req.get_split_key()); + } + boundaries.push(self.region().get_end_key()); + + info!( + self.logger, + "split region"; + "region" => ?region, + "index" => log_index, + "boundaries" => %KeysInfoFormatter(boundaries.iter()), + ); + + let split_reqs = req.get_splits(); + let new_region_cnt = split_reqs.get_requests().len(); + let new_version = region.get_region_epoch().get_version() + new_region_cnt as u64; + + let mut derived_req = SplitRequest::default(); + derived_req.new_region_id = region.id; + let derived_req = &[derived_req]; + + let right_derive = split_reqs.get_right_derive(); + let reqs = if right_derive { +
split_reqs.get_requests().iter().chain(derived_req) + } else { + derived_req.iter().chain(split_reqs.get_requests()) + }; + + let regions: Vec<_> = boundaries + .array_windows::<2>() + .zip(reqs) + .map(|([start_key, end_key], req)| { + let mut new_region = Region::default(); + new_region.set_id(req.get_new_region_id()); + new_region.set_region_epoch(region.get_region_epoch().to_owned()); + new_region.mut_region_epoch().set_version(new_version); + new_region.set_start_key(start_key.to_vec()); + new_region.set_end_key(end_key.to_vec()); + new_region.set_peers(region.get_peers().to_vec().into()); + // If the `req` is the `derived_req`, the peers are already set correctly and + // the following loop will not be executed due to the empty `new_peer_ids` in + // the `derived_req` + for (peer, peer_id) in new_region + .mut_peers() + .iter_mut() + .zip(req.get_new_peer_ids()) + { + peer.set_id(*peer_id); + } + new_region + }) + .collect(); + + let derived_index = if right_derive { regions.len() - 1 } else { 0 }; + + // We will create checkpoint of the current tablet for both derived region and + // split regions. Before the creation, we should flush the writes and remove the + // write batch + self.flush(); + + // todo(SpadeA): Here: we use a temporary solution that we use checkpoint API to + // clone new tablets. It may cause large jitter as we need to flush the + // memtable. More importantly, after removing WAL, the API + // will never flush. + // We will freeze the memtable rather than flush it in the following PR.
+ let tablet = self.tablet().clone(); + let mut checkpointer = tablet.new_checkpointer().unwrap_or_else(|e| { + slog_panic!( + self.logger, + "fails to create checkpoint object"; + "error" => ?e + ) + }); + + let now = Instant::now(); + let reg = self.tablet_registry(); + for new_region in &regions { + let new_region_id = new_region.id; + if new_region_id == region_id { + continue; + } + + let split_temp_path = temp_split_path(reg, new_region_id); + checkpointer + .create_at(&split_temp_path, None, 0) + .unwrap_or_else(|e| { + slog_panic!( + self.logger, + "fails to create checkpoint"; + "path" => %split_temp_path.display(), + "error" => ?e + ) + }); + } + + let derived_path = self.tablet_registry().tablet_path(region_id, log_index); + // If it's recovered from restart, it's possible the target path exists already. + // Because the checkpoint is atomic, we don't need to worry about corruption. + // And it's also wrong to delete it and remake as it may have applied and flushed + // some data to the new checkpoint before being restarted. + if !derived_path.exists() { + checkpointer + .create_at(&derived_path, None, 0) + .unwrap_or_else(|e| { + slog_panic!( + self.logger, + "fails to create checkpoint"; + "path" => %derived_path.display(), + "error" => ?e + ) + }); + } + let elapsed = now.saturating_elapsed(); + // to be removed once it's stable + info!( + self.logger, + "create checkpoint time consumes"; + "region" => ?self.region(), + "duration" => ?elapsed + ); + + let reg = self.tablet_registry(); + let path = reg.tablet_path(region_id, log_index); + let mut ctx = TabletContext::new(&regions[derived_index], Some(log_index)); + // Now the tablet is flushed, so all previous states should be persisted. + // Reusing the tablet should not be a problem. + // TODO: Should we avoid flushing for the old tablet?
+ ctx.flush_state = Some(self.flush_state().clone()); + let tablet = reg.tablet_factory().open_tablet(ctx, &path).unwrap(); + self.set_tablet(tablet.clone()); + + self.region_state_mut() + .set_region(regions[derived_index].clone()); + self.region_state_mut().set_tablet_index(log_index); + + let mut resp = AdminResponse::default(); + resp.mut_splits().set_regions(regions.clone().into()); + PEER_ADMIN_CMD_COUNTER.batch_split.success.inc(); + + Ok(( + resp, + AdminCmdResult::SplitRegion(SplitResult { + regions, + derived_index, + tablet_index: log_index, + tablet: Box::new(tablet), + }), + )) + } +} + +impl Peer { + pub fn on_apply_res_split( + &mut self, + store_ctx: &mut StoreContext, + res: SplitResult, + ) { + fail_point!("on_split", self.peer().get_store_id() == 3, |_| {}); + + let derived = &res.regions[res.derived_index]; + let region_id = derived.get_id(); + + let region_locks = self.txn_context().split(&res.regions, derived); + fail_point!("on_split_invalidate_locks"); + + let tablet: EK = match res.tablet.downcast() { + Ok(t) => *t, + Err(t) => unreachable!("tablet type should be the same: {:?}", t), + }; + { + let mut meta = store_ctx.store_meta.lock().unwrap(); + meta.set_region(derived, true, &self.logger); + let (reader, read_tablet) = meta.readers.get_mut(&derived.get_id()).unwrap(); + self.set_region( + &store_ctx.coprocessor_host, + reader, + derived.clone(), + RegionChangeReason::Split, + res.tablet_index, + ); + + // Tablet should be updated in lock to match the epoch. 
+ *read_tablet = SharedReadTablet::new(tablet.clone()); + } + if let Some(tablet) = self.set_tablet(tablet) { + self.record_tombstone_tablet(store_ctx, tablet, res.tablet_index); + } + + let new_region_count = res.regions.len() as u64; + let control = self.split_flow_control_mut(); + let estimated_size = control.approximate_size.map(|v| v / new_region_count); + let estimated_keys = control.approximate_keys.map(|v| v / new_region_count); + + self.post_split(); + + if self.is_leader() { + self.region_heartbeat_pd(store_ctx); + // Notify pd immediately to let it update the region meta. + info!( + self.logger, + "notify pd with split"; + "split_count" => res.regions.len(), + ); + // Now pd only uses ReportBatchSplit for history operation show, + // so we send it independently here. + self.report_batch_split_pd(store_ctx, res.regions.to_vec()); + // After split, the peer may need to update its metrics. + let control = self.split_flow_control_mut(); + control.may_skip_split_check = false; + control.approximate_size = estimated_size; + control.approximate_keys = estimated_keys; + self.add_pending_tick(PeerTick::SplitRegionCheck); + } + self.storage_mut().set_has_dirty_data(true); + + fail_point!("before_cluster_shutdown1"); + let mailbox = { + match store_ctx.router.mailbox(self.region_id()) { + Some(mailbox) => mailbox, + None => { + // None means the node is shutdown concurrently and thus the + // mailboxes in router have been cleared + assert!( + store_ctx.router.is_shutdown(), + "{} router should have been closed", + SlogFormat(&self.logger) + ); + return; + } + } + }; + let tablet_index = res.tablet_index; + let _ = store_ctx.schedulers.tablet.schedule(tablet::Task::trim( + self.tablet().unwrap().clone(), + derived, + move || { + let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); + }, + )); + + let last_region_id = res.regions.last().unwrap().get_id(); + let mut new_ids = HashSet::default(); + for (new_region, locks) in 
res.regions.into_iter().zip(region_locks) { + let new_region_id = new_region.get_id(); + if new_region_id == region_id { + continue; + } + + new_ids.insert(new_region_id); + let split_init = PeerMsg::SplitInit(Box::new(SplitInit { + region: new_region, + derived_leader: self.is_leader(), + derived_region_id: region_id, + check_split: last_region_id == new_region_id, + scheduled: false, + approximate_size: estimated_size, + approximate_keys: estimated_keys, + locks, + })); + + // First, send init msg to peer directly. Returning error means the peer does + // not exist, in which case we should redirect it to the store. + match store_ctx.router.force_send(new_region_id, split_init) { + Ok(_) => {} + Err(SendError(PeerMsg::SplitInit(msg))) => { + fail_point!("before_cluster_shutdown2", |_| {}); + if let Err(e) = store_ctx + .router + .force_send_control(StoreMsg::SplitInit(msg)) + { + if store_ctx.router.is_shutdown() { + return; + } + slog_panic!( + self.logger, + "fails to send split peer intialization msg to store"; + "error" => ?e, + ); + } + } + _ => unreachable!(), + } + } + self.split_trace_mut().push((res.tablet_index, new_ids)); + let region_state = self.storage().region_state().clone(); + self.state_changes_mut() + .put_region_state(region_id, res.tablet_index, &region_state) + .unwrap(); + self.state_changes_mut() + .put_dirty_mark(region_id, res.tablet_index, true) + .unwrap(); + self.set_has_extra_write(); + } + + /// Handles the split initialization message for a newly split region. + /// + /// If this peer is already initialized with a persisted index of at least + /// `RAFT_INIT_LOG_INDEX`, or the peer id recorded in `split_init` is smaller + /// than the current one, finish is reported to the derived region (with + /// cleanup) and nothing else happens. If the peer is initialized or already + /// holds a pending snapshot, `split_init` is only stored. Otherwise a snapshot + /// built from `split_init` is stepped into the raft group. + pub fn on_split_init( + &mut self, + store_ctx: &mut StoreContext, + mut split_init: Box, + ) { + let region_id = split_init.region.id; + let peer_id = split_init + .region + .get_peers() + .iter() + .find(|p| p.get_store_id() == self.peer().get_store_id()) + .unwrap() + .get_id(); + + // If peer_id in `split_init` is less than the current peer_id, the conf change + // for the peer should have occurred and we should just report finish to + // the source region of this outdated peer initialization.
+ if self.storage().is_initialized() && self.persisted_index() >= RAFT_INIT_LOG_INDEX + || peer_id < self.peer().get_id() + { + // Race with split operation. The tablet created by split will eventually be + // deleted. We don't trim it. + report_split_init_finish(store_ctx, split_init.derived_region_id, region_id, true); + return; + } + + if self.storage().is_initialized() || self.raft_group().snap().is_some() { + // It has already accepted a snapshot but has not finished applying it yet. + let prev = self.storage_mut().split_init_mut().replace(split_init); + assert!(prev.is_none(), "{:?}", prev); + return; + } + + split_init.scheduled = true; + let snap = split_init.to_snapshot(); + let mut msg = raft::eraftpb::Message::default(); + msg.set_to(self.peer_id()); + msg.set_from(self.leader_id()); + msg.set_msg_type(raft::eraftpb::MessageType::MsgSnapshot); + msg.set_snapshot(snap); + msg.set_term(cmp::max(self.term(), RAFT_INIT_LOG_TERM)); + let res = self.raft_group_mut().step(msg); + let accept_snap = self.raft_group().snap().is_some(); + if res.is_err() || !accept_snap { + slog_panic!( + self.logger, + "failed to accept snapshot"; + "accept_snapshot" => accept_snap, + "res" => ?res, + ); + } + let prev = self.storage_mut().split_init_mut().replace(split_init); + assert!(prev.is_none(), "{:?}", prev); + self.set_has_ready(); + } + + /// Finishes split initialization on this peer. + /// + /// Schedules a tablet trim when the storage has dirty data, campaigns when the + /// derived peer was leader and this peer has no leader at the initial term, + /// optionally requests a split check, and finally reports finish to the + /// derived region. + pub fn post_split_init( + &mut self, + store_ctx: &mut StoreContext, + split_init: Box, + ) { + let region_id = self.region_id(); + if self.storage().has_dirty_data() { + let tablet_index = self.storage().tablet_index(); + if let Some(mailbox) = store_ctx.router.mailbox(region_id) { + let _ = store_ctx.schedulers.tablet.schedule(tablet::Task::trim( + self.tablet().unwrap().clone(), + self.region(), + move || { + let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); + }, + )); + } else { + // None means the node is shutdown concurrently and thus the + // mailboxes in router have been cleared + assert!( + store_ctx.router.is_shutdown(), +
"{} router should have been closed", + SlogFormat(&self.logger) + ); + return; + } + } + if split_init.derived_leader + && self.leader_id() == INVALID_ID + && self.term() == RAFT_INIT_LOG_TERM + { + let _ = self.raft_group_mut().campaign(); + self.set_has_ready(); + + self.txn_context().init_with_lock(split_init.locks); + let control = self.split_flow_control_mut(); + control.approximate_size = split_init.approximate_size; + control.approximate_keys = split_init.approximate_keys; + // The new peer is likely to become leader, send a heartbeat immediately to + // reduce client query miss. + self.region_heartbeat_pd(store_ctx); + } + + if split_init.check_split { + self.add_pending_tick(PeerTick::SplitRegionCheck); + } + report_split_init_finish(store_ctx, split_init.derived_region_id, region_id, false); + } + + /// Removes `region_id` from the pending split trace; panics if it is not + /// tracked. + pub fn on_split_init_finish(&mut self, region_id: u64) { + let mut found = false; + for (_, ids) in self.split_trace_mut() { + if ids.remove(&region_id) { + found = true; + break; + } + } + assert!(found, "{} {}", SlogFormat(&self.logger), region_id); + let split_trace = self.split_trace_mut(); + let mut off = 0; + let mut admin_flushed = 0; + for (tablet_index, ids) in split_trace.iter() { + if !ids.is_empty() { + break; + } + admin_flushed = *tablet_index; + off += 1; + } + if off > 0 { + // There should be very few elements in the vector. + split_trace.drain(..off); + assert_ne!(admin_flushed, 0); + self.storage_mut() + .apply_trace_mut() + .on_admin_flush(admin_flushed); + // Persist admin flushed.
+ self.set_has_extra_write(); + } + } + + pub fn on_tablet_trimmed(&mut self, tablet_index: u64) { + info!(self.logger, "tablet is trimmed"; "tablet_index" => tablet_index); + let region_id = self.region_id(); + let changes = self.state_changes_mut(); + changes + .put_dirty_mark(region_id, tablet_index, false) + .unwrap(); + self.set_has_extra_write(); + if self.storage().tablet_index() == tablet_index { + self.storage_mut().set_has_dirty_data(false); + } + } +} + +#[cfg(test)] +mod test { + use std::sync::{ + mpsc::{channel, Receiver, Sender}, + Arc, + }; + + use engine_test::{ + ctor::{CfOptions, DbOptions}, + kv::{KvTestEngine, TestTabletFactory}, + }; + use engine_traits::{ + FlushState, Peekable, TabletContext, TabletRegistry, WriteBatch, CF_DEFAULT, DATA_CFS, + }; + use kvproto::{ + metapb::RegionEpoch, + raft_cmdpb::{BatchSplitRequest, SplitRequest}, + raft_serverpb::{PeerState, RegionLocalState}, + }; + use raftstore::{ + coprocessor::CoprocessorHost, + store::{cmd_resp::new_error, Config}, + }; + use slog::o; + use tempfile::TempDir; + use tikv_util::{ + store::{new_learner_peer, new_peer}, + worker::dummy_scheduler, + }; + + use super::*; + use crate::{ + fsm::ApplyResReporter, + operation::{test_util::create_tmp_importer, CatchUpLogs}, + raft::Apply, + router::ApplyRes, + }; + + struct MockReporter { + sender: Sender, + } + + impl MockReporter { + fn new() -> (Self, Receiver) { + let (tx, rx) = channel(); + (MockReporter { sender: tx }, rx) + } + } + + impl ApplyResReporter for MockReporter { + fn report(&self, apply_res: ApplyRes) { + let _ = self.sender.send(apply_res); + } + + fn redirect_catch_up_logs(&self, _c: CatchUpLogs) {} + } + + fn new_split_req(key: &[u8], id: u64, children: Vec) -> SplitRequest { + let mut req = SplitRequest::default(); + req.set_split_key(key.to_vec()); + req.set_new_region_id(id); + req.set_new_peer_ids(children); + req + } + + fn assert_split( + apply: &mut Apply, + parent_id: u64, + right_derived: bool, + new_region_ids: 
Vec, + split_keys: Vec>, + children_peers: Vec>, + log_index: u64, + region_boundries: Vec<(Vec, Vec)>, + expected_region_epoch: RegionEpoch, + expected_derived_index: usize, + ) { + let mut splits = BatchSplitRequest::default(); + splits.set_right_derive(right_derived); + + for ((new_region_id, children), split_key) in new_region_ids + .into_iter() + .zip(children_peers.clone()) + .zip(split_keys) + { + splits + .mut_requests() + .push(new_split_req(&split_key, new_region_id, children)); + } + + let mut req = AdminRequest::default(); + req.set_splits(splits); + + // Exec batch split + let (resp, apply_res) = apply.apply_batch_split(&req, log_index).unwrap(); + + let regions = resp.get_splits().get_regions(); + assert!(regions.len() == region_boundries.len()); + + let mut child_idx = 0; + for (i, region) in regions.iter().enumerate() { + assert_eq!(region.get_start_key().to_vec(), region_boundries[i].0); + assert_eq!(region.get_end_key().to_vec(), region_boundries[i].1); + assert_eq!(*region.get_region_epoch(), expected_region_epoch); + + if region.id == parent_id { + let state = apply.region_state(); + assert_eq!(state.tablet_index, log_index); + assert_eq!(state.get_region(), region); + let reg = apply.tablet_registry(); + let tablet_path = reg.tablet_path(region.id, log_index); + assert!(reg.tablet_factory().exists(&tablet_path)); + + match apply_res { + AdminCmdResult::SplitRegion(SplitResult { + derived_index, + tablet_index, + .. + }) => { + assert_eq!(expected_derived_index, derived_index); + assert_eq!(tablet_index, log_index); + } + _ => panic!(), + } + } else { + assert_eq! 
{ + region.get_peers().iter().map(|peer| peer.id).collect::>(), + children_peers[child_idx] + } + child_idx += 1; + + let reg = apply.tablet_registry(); + let tablet_name = reg.tablet_name(SPLIT_PREFIX, region.id, RAFT_INIT_LOG_INDEX); + let path = reg.tablet_root().join(tablet_name); + assert!(reg.tablet_factory().exists(&path)); + } + } + } + + #[test] + fn test_split() { + let store_id = 2; + + let mut region = Region::default(); + region.set_id(1); + region.set_end_key(b"k10".to_vec()); + region.mut_region_epoch().set_version(3); + let peers = vec![new_peer(2, 3), new_peer(4, 5), new_learner_peer(6, 7)]; + region.set_peers(peers.into()); + + let logger = slog_global::borrow_global().new(o!()); + let path = TempDir::new().unwrap(); + let cf_opts = DATA_CFS + .iter() + .copied() + .map(|cf| (cf, CfOptions::default())) + .collect(); + let factory = Box::new(TestTabletFactory::new(DbOptions::default(), cf_opts)); + let reg = TabletRegistry::new(factory, path.path()).unwrap(); + let ctx = TabletContext::new(®ion, Some(5)); + reg.load(ctx, true).unwrap(); + + let mut region_state = RegionLocalState::default(); + region_state.set_state(PeerState::Normal); + region_state.set_region(region.clone()); + region_state.set_tablet_index(5); + + let (read_scheduler, _rx) = dummy_scheduler(); + let (reporter, _) = MockReporter::new(); + let (_tmp_dir, importer) = create_tmp_importer(); + let host = CoprocessorHost::::default(); + let mut apply = Apply::new( + &Config::default(), + region + .get_peers() + .iter() + .find(|p| p.store_id == store_id) + .unwrap() + .clone(), + region_state, + reporter, + reg, + read_scheduler, + Arc::new(FlushState::new(5)), + None, + 5, + None, + importer, + host, + logger.clone(), + ); + + let mut splits = BatchSplitRequest::default(); + splits.set_right_derive(true); + splits.mut_requests().push(new_split_req(b"k1", 1, vec![])); + let mut req = AdminRequest::default(); + req.set_splits(splits.clone()); + let err = apply.apply_batch_split(&req, 
0).unwrap_err(); + // 3 followers are required. + assert!(err.to_string().contains("invalid new peer id count")); + + splits.mut_requests().clear(); + req.set_splits(splits.clone()); + let err = apply.apply_batch_split(&req, 6).unwrap_err(); + // Empty requests should be rejected. + assert!(err.to_string().contains("missing split requests")); + + splits + .mut_requests() + .push(new_split_req(b"k11", 1, vec![11, 12, 13])); + req.set_splits(splits.clone()); + let resp = new_error(apply.apply_batch_split(&req, 0).unwrap_err()); + // Out of range keys should be rejected. + assert!( + resp.get_header().get_error().has_key_not_in_region(), + "{:?}", + resp + ); + + splits.mut_requests().clear(); + splits + .mut_requests() + .push(new_split_req(b"", 1, vec![11, 12, 13])); + req.set_splits(splits.clone()); + let err = apply.apply_batch_split(&req, 7).unwrap_err(); + // Empty key will not in any region exclusively. + assert!(err.to_string().contains("missing split key"), "{:?}", err); + + splits.mut_requests().clear(); + splits + .mut_requests() + .push(new_split_req(b"k2", 1, vec![11, 12, 13])); + splits + .mut_requests() + .push(new_split_req(b"k1", 1, vec![11, 12, 13])); + req.set_splits(splits.clone()); + let err = apply.apply_batch_split(&req, 8).unwrap_err(); + // keys should be in ascend order. + assert!( + err.to_string().contains("invalid split request"), + "{:?}", + err + ); + + splits.mut_requests().clear(); + splits + .mut_requests() + .push(new_split_req(b"k1", 1, vec![11, 12, 13])); + splits + .mut_requests() + .push(new_split_req(b"k2", 1, vec![11, 12])); + req.set_splits(splits.clone()); + let err = apply.apply_batch_split(&req, 9).unwrap_err(); + // All requests should be checked. 
+ assert!(err.to_string().contains("id count"), "{:?}", err); + + let cases = vec![ + // region 1["", "k10"] + // After split: region 1 ["", "k09"], + // region 10 ["k09", "k10"] + ( + 1, + false, + vec![10], + vec![b"k09".to_vec()], + vec![vec![11, 12, 13]], + 10, + vec![ + (b"".to_vec(), b"k09".to_vec()), + (b"k09".to_vec(), b"k10".to_vec()), + ], + 4, + 0, + ), + // region 1 ["", "k09"] + // After split: region 20 ["", "k01"], + // region 1 ["k01", "k09"] + ( + 1, + true, + vec![20], + vec![b"k01".to_vec()], + vec![vec![21, 22, 23]], + 20, + vec![ + (b"".to_vec(), b"k01".to_vec()), + (b"k01".to_vec(), b"k09".to_vec()), + ], + 5, + 1, + ), + // region 1 ["k01", "k09"] + // After split: region 30 ["k01", "k02"], + // region 40 ["k02", "k03"], + // region 1 ["k03", "k09"] + ( + 1, + true, + vec![30, 40], + vec![b"k02".to_vec(), b"k03".to_vec()], + vec![vec![31, 32, 33], vec![41, 42, 43]], + 30, + vec![ + (b"k01".to_vec(), b"k02".to_vec()), + (b"k02".to_vec(), b"k03".to_vec()), + (b"k03".to_vec(), b"k09".to_vec()), + ], + 7, + 2, + ), + // region 1 ["k03", "k09"] + // After split: region 1 ["k03", "k07"], + // region 50 ["k07", "k08"], + // region 60 ["k08", "k09"] + ( + 1, + false, + vec![50, 60], + vec![b"k07".to_vec(), b"k08".to_vec()], + vec![vec![51, 52, 53], vec![61, 62, 63]], + 40, + vec![ + (b"k03".to_vec(), b"k07".to_vec()), + (b"k07".to_vec(), b"k08".to_vec()), + (b"k08".to_vec(), b"k09".to_vec()), + ], + 9, + 0, + ), + ]; + + for ( + parent_id, + right_derive, + new_region_ids, + split_keys, + children_peers, + log_index, + region_boundries, + version, + expected_derived_index, + ) in cases + { + let mut expected_epoch = RegionEpoch::new(); + expected_epoch.set_version(version); + + assert_split( + &mut apply, + parent_id, + right_derive, + new_region_ids, + split_keys, + children_peers, + log_index, + region_boundries, + expected_epoch, + expected_derived_index, + ); + } + + // Split will create checkpoint tablet, so if there are some writes before + // 
split, they should be flushed immediately. + apply.apply_put(CF_DEFAULT, 50, b"k04", b"v4").unwrap(); + apply.apply_flow_control_mut().set_need_flush(true); + assert!(!WriteBatch::is_empty(apply.write_batch.as_ref().unwrap())); + splits.mut_requests().clear(); + splits + .mut_requests() + .push(new_split_req(b"k05", 70, vec![71, 72, 73])); + req.set_splits(splits); + apply.apply_batch_split(&req, 51).unwrap(); + assert!(apply.write_batch.is_none()); + assert_eq!( + apply + .tablet() + .get_value(&keys::data_key(b"k04")) + .unwrap() + .unwrap(), + b"v4" + ); + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs new file mode 100644 index 00000000000..e7bd84c973c --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs @@ -0,0 +1,329 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::cmp::Ordering; + +use bytes::Bytes; +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::{ + disk_usage::DiskUsage, + metapb, + raft_cmdpb::{ + AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest, TransferLeaderRequest, + }, +}; +use raft::{eraftpb, ProgressState, Storage}; +use raftstore::{ + store::{ + fsm::new_admin_request, make_transfer_leader_response, metrics::PEER_ADMIN_CMD_COUNTER, + Transport, TRANSFER_LEADER_COMMAND_REPLY_CTX, + }, + Result, +}; +use rand::prelude::SliceRandom; +use slog::info; +use txn_types::WriteBatchFlags; + +use super::AdminCmdResult; +use crate::{ + batch::StoreContext, + fsm::ApplyResReporter, + raft::{Apply, Peer}, + router::{CmdResChannel, PeerMsg}, +}; + +fn transfer_leader_cmd(msg: &RaftCmdRequest) -> Option<&TransferLeaderRequest> { + if !msg.has_admin_request() { + return None; + } + let req = msg.get_admin_request(); + if !req.has_transfer_leader() { + return None; + } + + Some(req.get_transfer_leader()) +} + +impl Peer { + /// Return true if the transfer leader 
request is accepted. + /// + /// When transferring leadership begins, leader sends a pre-transfer + /// to target follower first to ensure it's ready to become leader. + /// After that the real transfer leader process begins. + /// + /// 1. pre_transfer_leader on leader: + /// Leader will send a MsgTransferLeader to follower. + /// 2. execute_transfer_leader on follower + /// If follower passes all necessary checks, it will reply with an + /// ACK with type MsgTransferLeader and its promised applied index. + /// 3. ready_to_transfer_leader on leader: + /// Leader checks if it's appropriate to transfer leadership. If it + /// is, it calls raft transfer_leader API to do the remaining work. + /// + /// Additional steps when there are remaining pessimistic + /// locks to propose (detected in function on_transfer_leader_msg). + /// 1. Leader first proposes pessimistic locks and then proposes a + /// TransferLeader command. + /// 2. The follower applies the TransferLeader command and replies with an + /// ACK with special context TRANSFER_LEADER_COMMAND_REPLY_CTX. + /// + /// See also: tikv/rfcs#37.
+ pub fn propose_transfer_leader( + &mut self, + ctx: &mut StoreContext, + req: RaftCmdRequest, + ch: CmdResChannel, + ) -> bool { + ctx.raft_metrics.propose.transfer_leader.inc(); + + let transfer_leader = transfer_leader_cmd(&req).unwrap(); + let prs = self.raft_group().raft.prs(); + + // Find the target with the largest matched index among the candidate + // transferee peers + let (_, peers) = transfer_leader + .get_peers() + .iter() + .filter(|peer| peer.id != self.peer().id) + .fold((0, vec![]), |(max_matched, mut chosen), p| { + if let Some(pr) = prs.get(p.id) { + match pr.matched.cmp(&max_matched) { + Ordering::Greater => (pr.matched, vec![p]), + Ordering::Equal => { + chosen.push(p); + (max_matched, chosen) + } + Ordering::Less => (max_matched, chosen), + } + } else { + (max_matched, chosen) + } + }); + let peer = match peers.len() { + 0 => transfer_leader.get_peer(), + 1 => peers.get(0).unwrap(), + _ => peers.choose(&mut rand::thread_rng()).unwrap(), + }; + + let transferee = if peer.id == self.peer_id() { + false + } else { + self.pre_transfer_leader(peer) + }; + + // transfer leader command doesn't need to replicate log and apply, so we + // return immediately. Note that this command may fail, we can view it just as + // an advice + ch.set_result(make_transfer_leader_response()); + + transferee + } + + fn pre_transfer_leader(&mut self, peer: &metapb::Peer) -> bool { + if self.raft_group().raft.has_pending_conf() { + info!( + self.logger, + "reject transfer leader due to pending conf change"; + "peer" => ?peer, + ); + return false; + } + + // Broadcast heartbeat to make sure followers commit the entries immediately. + // It's only necessary to ping the target peer, but ping all for simplicity. 
+ self.raft_group_mut().ping(); + + // todo: entry cache warmup + + let mut msg = eraftpb::Message::new(); + msg.set_to(peer.get_id()); + msg.set_msg_type(eraftpb::MessageType::MsgTransferLeader); + msg.set_from(self.peer_id()); + // log term here represents the term of last log. For leader, the term of last + // log is always its current term. Not just set term because raft library + // forbids setting it for MsgTransferLeader messages. + msg.set_log_term(self.term()); + self.raft_group_mut().raft.msgs.push(msg); + true + } + + pub fn on_transfer_leader_msg( + &mut self, + ctx: &mut StoreContext, + msg: &eraftpb::Message, + peer_disk_usage: DiskUsage, + ) { + // log_term is set by original leader, represents the term last log is written + // in, which should be equal to the original leader's term. + if msg.get_log_term() != self.term() { + return; + } + + if !self.is_leader() { + self.execute_transfer_leader(ctx, msg.get_from(), peer_disk_usage, false); + } else { + let from = match self.peer_from_cache(msg.get_from()) { + Some(p) => p, + None => return, + }; + match self.ready_to_transfer_leader(ctx, msg.get_index(), &from) { + Some(reason) => { + info!( + self.logger, + "reject to transfer leader"; + "to" => ?from, + "reason" => reason, + "index" => msg.get_index(), + "last_index" => self.storage().last_index().unwrap_or_default(), + ); + } + None => { + self.propose_pending_writes(ctx); + if self.propose_locks_before_transfer_leader(ctx, msg) { + // If some pessimistic locks are just proposed, we propose another + // TransferLeader command instead of transferring leader immediately. + info!( + self.logger, + "propose transfer leader command"; + "to" => ?from, + ); + let mut cmd = + new_admin_request(self.region().get_id(), self.peer().clone()); + cmd.mut_header() + .set_region_epoch(self.region().get_region_epoch().clone()); + // Set this flag to propose this command like a normal proposal. 
+ cmd.mut_header() + .set_flags(WriteBatchFlags::TRANSFER_LEADER_PROPOSAL.bits()); + cmd.mut_admin_request() + .set_cmd_type(AdminCmdType::TransferLeader); + cmd.mut_admin_request().mut_transfer_leader().set_peer(from); + if let PeerMsg::AdminCommand(req) = PeerMsg::admin_command(cmd).0 { + self.on_admin_command(ctx, req.request, req.ch); + } else { + unreachable!(); + } + } else { + info!( + self.logger, + "transfer leader"; + "peer" => ?from, + ); + self.raft_group_mut().transfer_leader(from.get_id()); + self.refresh_leader_transferee(); + } + } + } + } + } + + pub fn execute_transfer_leader( + &mut self, + ctx: &mut StoreContext, + from: u64, + peer_disk_usage: DiskUsage, + reply_cmd: bool, // whether it is a reply to a TransferLeader command + ) { + let pending_snapshot = self.is_handling_snapshot() || self.has_pending_snapshot(); + if pending_snapshot + || from != self.leader_id() + // Transferring leader to a node whose disk is full will degrade write availability. + // But if the current leader's disk is full and it sends such a request, we should allow it, + // because it may be a read leader balance request.
+ || (!matches!(ctx.self_disk_usage, DiskUsage::Normal) && + matches!(peer_disk_usage,DiskUsage::Normal)) + { + info!( + self.logger, + "reject transferring leader"; + "from" => from, + "pending_snapshot" => pending_snapshot, + "disk_usage" => ?ctx.self_disk_usage, + ); + return; + } + + let mut msg = eraftpb::Message::new(); + msg.set_from(self.peer_id()); + msg.set_to(self.leader_id()); + msg.set_msg_type(eraftpb::MessageType::MsgTransferLeader); + msg.set_index(self.storage().apply_state().applied_index); + msg.set_log_term(self.term()); + if reply_cmd { + msg.set_context(Bytes::from_static(TRANSFER_LEADER_COMMAND_REPLY_CTX)); + } + self.raft_group_mut().raft.msgs.push(msg); + } + + fn ready_to_transfer_leader( + &self, + ctx: &mut StoreContext, + mut index: u64, + peer: &metapb::Peer, + ) -> Option<&'static str> { + let status = self.raft_group().status(); + let progress = status.progress.unwrap(); + + if !progress.conf().voters().contains(peer.id) { + return Some("non voter"); + } + + for (id, pr) in progress.iter() { + if pr.state == ProgressState::Snapshot { + return Some("pending snapshot"); + } + if *id == peer.id && index == 0 { + // index will be zero if it's sent from an instance without + // pre-transfer-leader feature. Set it to matched to make it + // possible to transfer leader to an older version. It may be + // useful during rolling restart. 
+ index = pr.matched; + } + } + + if self.raft_group().raft.has_pending_conf() + || self.raft_group().raft.pending_conf_index > index + { + return Some("pending conf change"); + } + + if self.storage().last_index().unwrap_or_default() + >= index + ctx.cfg.leader_transfer_max_log_lag + { + return Some("log gap"); + } + None + } +} + +impl Apply { + pub fn apply_transfer_leader( + &mut self, + req: &AdminRequest, + term: u64, + ) -> Result<(AdminResponse, AdminCmdResult)> { + PEER_ADMIN_CMD_COUNTER.transfer_leader.all.inc(); + let resp = AdminResponse::default(); + + let peer = req.get_transfer_leader().get_peer(); + // Only execute TransferLeader if the expected new leader is self. + if peer.get_id() == self.peer().get_id() { + Ok((resp, AdminCmdResult::TransferLeader(term))) + } else { + Ok((resp, AdminCmdResult::None)) + } + } +} + +impl Peer { + pub fn on_transfer_leader(&mut self, ctx: &mut StoreContext, term: u64) { + // If the term has changed between proposing and executing the TransferLeader + // request, ignore it because this request may be stale. + if term != self.term() { + return; + } + + // Reply to leader that it is ready to transfer leader now. + self.execute_transfer_leader(ctx, self.leader_id(), DiskUsage::Normal, true); + + self.set_has_ready(); + } +} diff --git a/components/raftstore-v2/src/operation/command/control.rs b/components/raftstore-v2/src/operation/command/control.rs new file mode 100644 index 00000000000..586d9f5c019 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/control.rs @@ -0,0 +1,443 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{collections::LinkedList, mem}; + +use kvproto::{metapb, raft_cmdpb::AdminCmdType}; +use raftstore::{ + store::{ + cmd_resp, + fsm::apply, + msg::ErrorCallback, + util::{ + admin_cmd_epoch_lookup, AdminCmdEpochState, NORMAL_REQ_CHECK_CONF_VER, + NORMAL_REQ_CHECK_VER, + }, + }, + Error, +}; + +use crate::router::CmdResChannel; + +#[derive(Debug)] +pub struct ProposedAdminCmd { + cmd_type: AdminCmdType, + committed: bool, + epoch_state: AdminCmdEpochState, + index: u64, + /// Callbacks of commands that conflict with the ongoing admin command. + /// + /// Callbacks are delayed to avoid making the client retry with arbitrary + /// backoff. + delayed_chs: Vec, +} + +impl ProposedAdminCmd { + fn new( + cmd_type: AdminCmdType, + epoch_state: AdminCmdEpochState, + index: u64, + ) -> ProposedAdminCmd { + ProposedAdminCmd { + cmd_type, + committed: false, + epoch_state, + index, + delayed_chs: Vec::new(), + } + } + + pub fn cmd_type(&self) -> AdminCmdType { + self.cmd_type + } + + /// Delay responding to the channel until the command is applied so the client + /// won't retry with arbitrary timeout. + pub fn delay_channel(&mut self, ch: CmdResChannel) { + self.delayed_chs.push(ch); + } + + /// Same as `delay_channel`, but accepts a batch. + pub fn delay_channels(&mut self, chs: Vec) { + if self.delayed_chs.is_empty() { + self.delayed_chs = chs; + } else { + self.delayed_chs.extend(chs); + } + } +} + +/// `ProposalControl` is a rewrite of `CmdEpochChecker` from v1. +/// +/// An admin command may change the epoch of a region. If a proposal is proposed +/// after the admin command is proposed but before the command is applied, the +/// proposal is likely to fail because of an epoch mismatch. `ProposalControl` +/// aims to detect the failure early. With `ProposalControl`, users can assume +/// once a command is proposed, it's likely to succeed in the end. +/// +/// Compared to `CmdEpochChecker`, `ProposalControl` also traces the whole +/// lifetime of prepare merge.
+pub struct ProposalControl { + // Admin commands that are proposed but not applied. + // Use `LinkedList` to reduce memory footprint. In most cases, the list + // should be empty or hold 1 element. And access speed is not a concern. + proposed_admin_cmd: LinkedList, + has_pending_prepare_merge: bool, + applied_prepare_merge_index: u64, + term: u64, +} + +impl ProposalControl { + pub fn new(term: u64) -> ProposalControl { + ProposalControl { + proposed_admin_cmd: LinkedList::new(), + has_pending_prepare_merge: false, + applied_prepare_merge_index: 0, + term, + } + } + + /// Clears all queued conflict callbacks if the term changed. + /// + /// If the term is changed, the leader is probably changed. Clear all callbacks to + /// notify clients to retry with the new leader. + #[inline] + pub fn maybe_update_term(&mut self, term: u64) { + match term.cmp(&self.term) { + std::cmp::Ordering::Equal => (), + std::cmp::Ordering::Greater => { + for cmd in mem::take(&mut self.proposed_admin_cmd) { + for cb in cmd.delayed_chs { + apply::notify_stale_req(term, cb); + } + } + self.term = term; + } + std::cmp::Ordering::Less => { + panic!("term should not decrease, old {}, new {}", self.term, term) + } + } + } + + /// Check if a proposal conflicts with proposed admin commands in current + /// term. If the proposal is an admin command, then its type should be + /// passed, otherwise just provide `None`. + /// + /// Returns None if passing the epoch check, otherwise returns the last + /// conflicting proposal meta.
+ pub fn check_conflict( + &mut self, + cmd_type: Option, + ) -> Option<&mut ProposedAdminCmd> { + let (check_ver, check_conf_ver) = match cmd_type { + None => (NORMAL_REQ_CHECK_VER, NORMAL_REQ_CHECK_CONF_VER), + Some(ty) => { + let epoch_state = admin_cmd_epoch_lookup(ty); + (epoch_state.check_ver, epoch_state.check_conf_ver) + } + }; + self.proposed_admin_cmd.iter_mut().rev().find(|cmd| { + (check_ver && cmd.epoch_state.change_ver) + || (check_conf_ver && cmd.epoch_state.change_conf_ver) + || cmd.cmd_type == AdminCmdType::PrepareMerge + }) + } + + /// Record an admin proposal. + /// + /// Further requests that conflict with the admin proposal will be + /// rejected in `check_proposal_conflict`. + pub fn record_proposed_admin(&mut self, cmd_type: AdminCmdType, index: u64) { + let epoch_state = admin_cmd_epoch_lookup(cmd_type); + if !epoch_state.change_conf_ver && !epoch_state.change_ver { + return; + } + + let conflict_cmd = self.proposed_admin_cmd.iter_mut().rev().find(|cmd| { + (epoch_state.check_ver && cmd.epoch_state.change_ver) + || (epoch_state.check_conf_ver && cmd.epoch_state.change_conf_ver) + }); + assert!(conflict_cmd.is_none(), "{:?}", conflict_cmd); + + if let Some(cmd) = self.proposed_admin_cmd.back() { + assert!(cmd.index < index, "{:?} {}", cmd, index); + } + self.proposed_admin_cmd + .push_back(ProposedAdminCmd::new(cmd_type, epoch_state, index)); + } + + /// Commit the admin commands.
+ #[inline] + pub fn commit_to(&mut self, index: u64, mut on_commit: impl FnMut(&ProposedAdminCmd)) { + if self.proposed_admin_cmd.is_empty() { + return; + } + + for cmd in &mut self.proposed_admin_cmd { + if cmd.committed { + continue; + } + if cmd.index <= index { + cmd.committed = true; + on_commit(cmd); + continue; + } + return; + } + } + + #[inline] + pub fn has_uncommitted_admin(&self) -> bool { + !self.proposed_admin_cmd.is_empty() && !self.proposed_admin_cmd.back().unwrap().committed + } + + pub fn advance_apply(&mut self, index: u64, term: u64, region: &metapb::Region) { + while !self.proposed_admin_cmd.is_empty() { + let cmd = self.proposed_admin_cmd.front_mut().unwrap(); + if cmd.index <= index { + for ch in cmd.delayed_chs.drain(..) { + let mut resp = cmd_resp::new_error(Error::EpochNotMatch( + format!( + "current epoch of region {} is {:?}", + region.get_id(), + region.get_region_epoch(), + ), + vec![region.to_owned()], + )); + cmd_resp::bind_term(&mut resp, term); + ch.report_error(resp); + } + } else { + break; + } + self.proposed_admin_cmd.pop_front(); + } + } + + #[inline] + pub fn set_pending_prepare_merge(&mut self, v: bool) { + self.has_pending_prepare_merge = v; + } + + #[inline] + pub fn has_pending_prepare_merge(&self) -> bool { + self.has_pending_prepare_merge + } + + #[inline] + pub fn enter_prepare_merge(&mut self, prepare_merge_index: u64) { + self.applied_prepare_merge_index = prepare_merge_index; + } + + #[inline] + pub fn leave_prepare_merge(&mut self, prepare_merge_index: u64) { + if self.applied_prepare_merge_index != 0 { + assert_eq!(self.applied_prepare_merge_index, prepare_merge_index); + self.applied_prepare_merge_index = 0; + } + } + + #[inline] + pub fn has_applied_prepare_merge(&self) -> bool { + self.applied_prepare_merge_index != 0 + } + + /// Check if there is an on-going split command on current term. + /// + /// The answer is reliable only when the peer is leader. 
+ #[inline] + pub fn is_splitting(&self) -> bool { + if self.proposed_admin_cmd.is_empty() { + return false; + } + // `Split` is deprecated in v2, so only `BatchSplit` needs to be checked. + self.proposed_admin_cmd + .iter() + .any(|c| c.cmd_type == AdminCmdType::BatchSplit && c.committed) + } + + /// Check if the current peer is waiting to be merged. + /// + /// The answer is reliable only when the peer is leader or `PrepareMerge` is + /// applied. + #[inline] + pub fn is_merging(&self) -> bool { + if self.applied_prepare_merge_index != 0 { + return true; + } + self.proposed_admin_cmd + .iter() + .any(|c| c.cmd_type == AdminCmdType::PrepareMerge && c.committed) + } +} + +impl Drop for ProposalControl { + fn drop(&mut self) { + for state in mem::take(&mut self.proposed_admin_cmd) { + for ch in state.delayed_chs { + apply::notify_stale_req(self.term, ch); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_proposal_control() { + let region = metapb::Region::default(); + + let mut control = ProposalControl::new(10); + assert_eq!(control.term, 10); + assert!( + control + .check_conflict(Some(AdminCmdType::BatchSplit)) + .is_none() + ); + control.record_proposed_admin(AdminCmdType::BatchSplit, 5); + assert_eq!(control.proposed_admin_cmd.len(), 1); + + // Both conflict with the split admin cmd + let conflict = control.check_conflict(None).unwrap(); + assert_eq!(conflict.index, 5); + assert_eq!(conflict.cmd_type, AdminCmdType::BatchSplit); + let conflict = control + .check_conflict(Some(AdminCmdType::PrepareMerge)) + .unwrap(); + assert_eq!(conflict.index, 5); + + assert!( + control + .check_conflict(Some(AdminCmdType::ChangePeerV2)) + .is_none() + ); + control.record_proposed_admin(AdminCmdType::ChangePeerV2, 6); + assert_eq!(control.proposed_admin_cmd.len(), 2); + + assert!(!control.is_splitting()); + assert!(!control.is_merging()); + + // Conflict with the change peer admin cmd + let conflict = control +
.check_conflict(Some(AdminCmdType::ChangePeerV2)) + .unwrap(); + assert_eq!(conflict.index, 6); + // Conflict with the split admin cmd + let conflict = control.check_conflict(None).unwrap(); + assert_eq!(conflict.index, 5); + // Conflict with the change peer admin cmd + let conflict = control + .check_conflict(Some(AdminCmdType::PrepareMerge)) + .unwrap(); + assert_eq!(conflict.index, 6); + + let mut commit_split = false; + control.commit_to(4, |c| commit_split = c.cmd_type == AdminCmdType::BatchSplit); + assert!(!commit_split); + assert!(!control.is_splitting()); + control.commit_to(5, |c| commit_split = c.cmd_type == AdminCmdType::BatchSplit); + assert!(commit_split); + assert!(control.is_splitting()); + + control.advance_apply(4, 10, &region); + // Has no effect on `proposed_admin_cmd` + assert_eq!(control.proposed_admin_cmd.len(), 2); + assert!(control.is_splitting()); + + control.advance_apply(5, 10, &region); + // Only the change peer admin cmd is left + assert_eq!(control.proposed_admin_cmd.len(), 1); + assert!(!control.is_splitting()); + + assert!(control.check_conflict(None).is_none()); + let conflict = control + .check_conflict(Some(AdminCmdType::BatchSplit)) + .unwrap(); + assert_eq!(conflict.index, 6); + + // Change term to 11 + control.maybe_update_term(11); + assert!( + control + .check_conflict(Some(AdminCmdType::BatchSplit)) + .is_none() + ); + assert_eq!(control.term, 11); + // Should be empty + assert_eq!(control.proposed_admin_cmd.len(), 0); + + // Test attaching multiple callbacks. + control.record_proposed_admin(AdminCmdType::BatchSplit, 7); + let mut subs = vec![]; + for _ in 0..3 { + let conflict = control.check_conflict(None).unwrap(); + let (ch, sub) = CmdResChannel::pair(); + conflict.delay_channel(ch); + subs.push(sub); + } + // Delayed channel should not be notified immediately.
+ for sub in &subs { + assert!(!sub.has_result()); + } + control.advance_apply(7, 12, &region); + for sub in subs { + assert!(sub.has_result()); + let res = futures::executor::block_on(sub.result()).unwrap(); + assert!( + res.get_header().get_error().has_epoch_not_match(), + "{:?}", + res + ); + } + + // Should invoke callbacks when term is increased. + control.record_proposed_admin(AdminCmdType::BatchSplit, 8); + let (ch, sub) = CmdResChannel::pair(); + control.check_conflict(None).unwrap().delay_channel(ch); + control.maybe_update_term(13); + assert!(control.check_conflict(None).is_none()); + let res = futures::executor::block_on(sub.result()).unwrap(); + assert!( + res.get_header().get_error().has_stale_command(), + "{:?}", + res + ); + + // Should invoke callbacks when it's dropped. + control.record_proposed_admin(AdminCmdType::BatchSplit, 9); + let (ch, sub) = CmdResChannel::pair(); + control.check_conflict(None).unwrap().delay_channel(ch); + drop(control); + let res = futures::executor::block_on(sub.result()).unwrap(); + assert!( + res.get_header().get_error().has_stale_command(), + "{:?}", + res + ); + } + + #[test] + fn test_proposal_control_merge() { + let region = metapb::Region::default(); + + let mut control = ProposalControl::new(5); + assert!(!control.is_merging()); + control.record_proposed_admin(AdminCmdType::PrepareMerge, 5); + assert!(!control.is_merging()); + control.commit_to(5, |_| ()); + assert!(control.is_merging()); + control.advance_apply(5, 5, &region); + assert!(!control.is_merging()); + + control.record_proposed_admin(AdminCmdType::PrepareMerge, 6); + assert!(!control.is_merging()); + control.commit_to(6, |_| ()); + assert!(control.is_merging()); + control.enter_prepare_merge(6); + control.advance_apply(6, 5, &region); + assert!(control.is_merging()); + control.leave_prepare_merge(6); + assert!(!control.is_merging()); + } +} diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs
new file mode 100644 index 00000000000..b9256f031fe --- /dev/null +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -0,0 +1,810 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains implementations of commands that will be replicated to +//! all replicas and executed in the same order. Typical commands include: +//! - normal writes like put, delete, etc. +//! - admin commands like split, compact, etc. +//! +//! General processing is: +//! - Propose a command to the leader via PeerMsg::Command, +//! - The leader batches up commands and replicates them to followers, +//! - Once they are replicated to a majority, the leader considers them committed and +//! sends them to another thread for execution via +//! `schedule_apply_committed_entries`, +//! - The apply thread executes the commands in buffer, and writes to the LSM tree +//! via `flush`, +//! - Applied results are sent back to peer fsm, and memory state is updated in +//! `on_apply_res`. + +use std::{ + mem, + sync::{atomic::Ordering, Arc}, + time::Duration, +}; + +use engine_traits::{KvEngine, PerfContext, RaftEngine, WriteBatch, WriteOptions}; +use kvproto::raft_cmdpb::{ + AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader, +}; +use raft::eraftpb::{ConfChange, ConfChangeV2, Entry, EntryType}; +use raft_proto::ConfChangeI; +use raftstore::{ + coprocessor::ObserveLevel, + store::{ + cmd_resp, + fsm::{ + apply::{self, APPLY_WB_SHRINK_SIZE, SHRINK_PENDING_CMD_QUEUE_CAP}, + Proposal, + }, + local_metrics::RaftMetrics, + metrics::{ + APPLY_TASK_WAIT_TIME_HISTOGRAM, APPLY_TIME_HISTOGRAM, STORE_APPLY_LOG_HISTOGRAM, + }, + msg::ErrorCallback, + util, Config, Transport, WriteCallback, + }, + Error, Result, +}; +use slog::{debug, error, warn}; +use tikv_util::{ + box_err, + log::SlogFormat, + slog_panic, + time::{duration_to_sec, monotonic_raw_now, Instant}, +}; + +use crate::{ + batch::StoreContext, + fsm::{ApplyFsm, ApplyResReporter}, + raft::{Apply, Peer}, +
router::{ApplyRes, ApplyTask, CmdResChannel}, +}; + +mod admin; +mod control; +mod write; + +pub use admin::{ + report_split_init_finish, temp_split_path, AdminCmdResult, CatchUpLogs, CompactLogContext, + MergeContext, RequestHalfSplit, RequestSplit, SplitFlowControl, SplitInit, + MERGE_IN_PROGRESS_PREFIX, MERGE_SOURCE_PREFIX, SPLIT_PREFIX, +}; +pub use control::ProposalControl; +use pd_client::{BucketMeta, BucketStat}; +use protobuf::Message; +pub use write::{SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder}; +pub type SimpleWriteReqEncoder = + raftstore::store::simple_write::SimpleWriteReqEncoder; + +use self::write::SimpleWrite; + +pub(crate) fn parse_at( + logger: &slog::Logger, + buf: &[u8], + index: u64, + term: u64, +) -> M { + let mut m = M::default(); + match m.merge_from_bytes(buf) { + Ok(()) => m, + Err(e) => slog_panic!( + logger, + "data is corrupted"; + "term" => term, + "index" => index, + "error" => ?e, + ), + } +} + +#[derive(Debug)] +pub struct CommittedEntries { + /// Entries that need to be applied. Note some entries may not be included for + /// flow control. + pub entry_and_proposals: Vec<(Entry, Vec)>, +} + +fn new_response(header: &RaftRequestHeader) -> RaftCmdResponse { + let mut resp = RaftCmdResponse::default(); + if !header.get_uuid().is_empty() { + let uuid = header.get_uuid().to_vec(); + resp.mut_header().set_uuid(uuid); + } + resp +} + +impl Peer { + /// Schedule an apply fsm to apply logs in the background. + /// + /// Every time a snapshot is applied or the peer is just started, it will + /// schedule a new apply fsm. The old fsm will be stopped automatically + /// when the old apply scheduler is dropped.
+ #[inline] + pub fn schedule_apply_fsm(&mut self, store_ctx: &mut StoreContext) { + let region_state = self.storage().region_state().clone(); + let mailbox = match store_ctx.router.mailbox(self.region_id()) { + Some(m) => m, + None => { + assert!( + store_ctx.shutdown.load(Ordering::Relaxed), + "failed to load mailbox: {}", + SlogFormat(&self.logger) + ); + return; + } + }; + let logger = self.logger.clone(); + let read_scheduler = self.storage().read_scheduler(); + let buckets = self.region_buckets_info().bucket_stat().clone(); + let (apply_scheduler, mut apply_fsm) = ApplyFsm::new( + &store_ctx.cfg, + self.peer().clone(), + region_state, + mailbox, + store_ctx.tablet_registry.clone(), + read_scheduler, + self.flush_state().clone(), + self.storage().apply_trace().log_recovery(), + self.entry_storage().applied_term(), + buckets, + store_ctx.sst_importer.clone(), + store_ctx.coprocessor_host.clone(), + logger, + ); + + store_ctx + .apply_pool + .spawn(async move { apply_fsm.handle_all_tasks().await }) + .unwrap(); + fail::fail_point!("delay_set_apply_scheduler", |_| {}); + self.set_apply_scheduler(apply_scheduler); + } + + #[inline] + fn validate_command( + &self, + header: &RaftRequestHeader, + admin_type: Option, + metrics: &mut RaftMetrics, + ) -> Result<()> { + if let Err(e) = util::check_store_id(header, self.peer().get_store_id()) { + metrics.invalid_proposal.mismatch_store_id.inc(); + return Err(e); + } + if let Err(e) = util::check_peer_id(header, self.peer().get_id()) { + metrics.invalid_proposal.mismatch_peer_id.inc(); + return Err(e); + } + if !self.is_leader() { + metrics.invalid_proposal.not_leader.inc(); + return Err(Error::NotLeader(self.region_id(), self.leader())); + } + if let Err(e) = util::check_term(header, self.term()) { + metrics.invalid_proposal.stale_command.inc(); + return Err(e); + } + if let Err(mut e) = util::check_region_epoch(header, admin_type, self.region(), true) { + if let Error::EpochNotMatch(_, _new_regions) = &mut e { + // 
TODO: query sibling regions. + metrics.invalid_proposal.epoch_not_match.inc(); + } + return Err(e); + } + Ok(()) + } + + #[inline] + fn propose( + &mut self, + store_ctx: &mut StoreContext, + data: Vec, + ) -> Result { + self.propose_with_ctx(store_ctx, data, vec![]) + } + + #[inline] + fn propose_with_ctx( + &mut self, + store_ctx: &mut StoreContext, + data: Vec, + proposal_ctx: Vec, + ) -> Result { + store_ctx.raft_metrics.propose.normal.inc(); + store_ctx + .raft_metrics + .propose_log_size + .observe(data.len() as f64); + if data.len() as u64 > store_ctx.cfg.raft_entry_max_size.0 { + return Err(Error::RaftEntryTooLarge { + region_id: self.region_id(), + entry_size: data.len() as u64, + }); + } + let last_index = self.raft_group().raft.raft_log.last_index(); + self.raft_group_mut().propose(proposal_ctx, data)?; + if self.raft_group().raft.raft_log.last_index() == last_index { + // The message is dropped silently, this is usually due to leader absence + // or transferring leader. Both cases can be considered as NotLeader error.
+ return Err(Error::NotLeader(self.region_id(), None)); + } + Ok(last_index + 1) + } + + #[inline] + pub fn post_propose_command( + &mut self, + ctx: &mut StoreContext, + res: Result, + ch: Vec, + call_proposed_on_success: bool, + ) { + let idx = match res { + Ok(i) => i, + Err(e) => { + ch.report_error(cmd_resp::err_resp(e, self.term())); + return; + } + }; + let mut proposal = Proposal::new(idx, self.term(), ch); + if call_proposed_on_success { + proposal.cb.notify_proposed(); + } + proposal.must_pass_epoch_check = self.applied_to_current_term(); + proposal.propose_time = Some(*ctx.current_time.get_or_insert_with(monotonic_raw_now)); + self.report_batch_wait_duration(ctx, &proposal.cb); + self.proposals_mut().push(proposal); + self.set_has_ready(); + } + + fn report_batch_wait_duration( + &self, + ctx: &mut StoreContext, + ch: &Vec, + ) { + if !ctx.raft_metrics.waterfall_metrics || ch.is_empty() { + return; + } + let now = std::time::Instant::now(); + for c in ch { + for tracker in c.write_trackers() { + tracker.observe(now, &ctx.raft_metrics.wf_batch_wait, |t| { + &mut t.metrics.wf_batch_wait_nanos + }); + } + } + } + + #[inline] + pub fn schedule_apply_committed_entries( + &mut self, + ctx: &mut StoreContext, + committed_entries: Vec, + ) { + if committed_entries.is_empty() { + return; + } + let current_term = self.term(); + let mut entry_and_proposals = vec![]; + let queue = self.proposals_mut(); + if !queue.is_empty() { + for e in committed_entries { + let mut proposal = queue.find_proposal(e.term, e.index, current_term); + if let Some(p) = &mut proposal && p.must_pass_epoch_check { + // In this case the apply can be guaranteed to be successful. Invoke the + // on_committed callback if necessary. 
+ p.cb.notify_committed(); + } + entry_and_proposals.push((e, proposal.map_or_else(Vec::new, |p| p.cb))); + } + } else { + entry_and_proposals = committed_entries.into_iter().map(|e| (e, vec![])).collect(); + } + self.report_store_time_duration(ctx, &mut entry_and_proposals); + // Unlike v1, v2 doesn't need to persist commit index and commit term. The + // point of persist commit index/term of raft apply state is to recover commit + // index when the writes to raft engine is lost but writes to kv engine is + // persisted. But in v2, writes to raft engine must be persisted before + // memtables in kv engine is flushed. + let apply = CommittedEntries { + entry_and_proposals, + }; + assert!( + self.apply_scheduler().is_some() || ctx.router.is_shutdown(), + "{} apply_scheduler should not be None", + SlogFormat(&self.logger) + ); + if let Some(scheduler) = self.apply_scheduler() { + scheduler.send(ApplyTask::CommittedEntries(apply)); + } + } + + #[inline] + fn report_store_time_duration( + &mut self, + ctx: &mut StoreContext, + entry_and_proposals: &mut [(Entry, Vec)], + ) { + let now = std::time::Instant::now(); + for (_, chs) in entry_and_proposals { + for tracker in chs.write_trackers_mut() { + tracker.observe(now, &ctx.raft_metrics.store_time, |t| { + t.metrics.write_instant = Some(now); + &mut t.metrics.store_time_nanos + }); + tracker.reset(now); + } + } + } + + pub fn on_apply_res( + &mut self, + ctx: &mut StoreContext, + apply_res: ApplyRes, + ) { + if !self.serving() || !apply_res.admin_result.is_empty() { + // TODO: remove following log once stable. + debug!(self.logger, "on_apply_res"; "apply_res" => ?apply_res, "apply_trace" => ?self.storage().apply_trace()); + } + // It must just applied a snapshot. + if apply_res.applied_index < self.entry_storage().first_index() { + // Ignore admin command side effects, otherwise it may split incomplete + // region. 
+ return; + } + + for admin_res in Vec::from(apply_res.admin_result) { + match admin_res { + AdminCmdResult::None => unreachable!(), + AdminCmdResult::ConfChange(conf_change) => { + self.on_apply_res_conf_change(ctx, conf_change) + } + AdminCmdResult::SplitRegion(res) => { + self.storage_mut() + .apply_trace_mut() + .on_admin_modify(res.tablet_index); + self.on_apply_res_split(ctx, res) + } + AdminCmdResult::TransferLeader(term) => self.on_transfer_leader(ctx, term), + AdminCmdResult::CompactLog(res) => self.on_apply_res_compact_log(ctx, res), + AdminCmdResult::UpdateGcPeers(state) => self.on_apply_res_update_gc_peers(state), + AdminCmdResult::PrepareMerge(res) => self.on_apply_res_prepare_merge(ctx, res), + AdminCmdResult::CommitMerge(res) => self.on_apply_res_commit_merge(ctx, res), + } + } + self.region_buckets_info_mut() + .add_bucket_flow(&apply_res.bucket_stat); + self.update_split_flow_control(&apply_res.metrics); + self.update_stat(&apply_res.metrics); + ctx.store_stat.engine_total_bytes_written += apply_res.metrics.written_bytes; + ctx.store_stat.engine_total_keys_written += apply_res.metrics.written_keys; + + self.raft_group_mut() + .advance_apply_to(apply_res.applied_index); + self.proposal_control_advance_apply(apply_res.applied_index); + let is_leader = self.is_leader(); + let progress_to_be_updated = self.entry_storage().applied_term() != apply_res.applied_term; + let entry_storage = self.entry_storage_mut(); + entry_storage + .apply_state_mut() + .set_applied_index(apply_res.applied_index); + entry_storage.set_applied_term(apply_res.applied_term); + if !is_leader { + entry_storage.compact_entry_cache(apply_res.applied_index + 1); + } + if is_leader { + self.retry_pending_prepare_merge(ctx, apply_res.applied_index); + } + self.on_data_modified(apply_res.modifications); + self.handle_read_on_apply( + ctx, + apply_res.applied_term, + apply_res.applied_index, + progress_to_be_updated, + ); + self.try_compelete_recovery(); + if !self.pause_for_recovery() 
&& self.storage_mut().apply_trace_mut().should_flush() { + if let Some(scheduler) = self.apply_scheduler() { + scheduler.send(ApplyTask::ManualFlush); + } + } + let last_applying_index = self.compact_log_context().last_applying_index(); + let committed_index = self.entry_storage().commit_index(); + if last_applying_index < committed_index || !self.serving() { + // We need to continue to apply after previous page is finished. + self.set_has_ready(); + } + } +} + +#[derive(Debug)] +pub struct ApplyFlowControl { + timer: Instant, + last_check_keys: u64, + need_flush: bool, + yield_time: Duration, + yield_written_bytes: u64, +} + +impl ApplyFlowControl { + pub fn new(cfg: &Config) -> Self { + ApplyFlowControl { + timer: Instant::now_coarse(), + last_check_keys: 0, + need_flush: false, + yield_time: cfg.apply_yield_duration.0, + yield_written_bytes: cfg.apply_yield_write_size.0, + } + } + + #[cfg(test)] + pub fn set_need_flush(&mut self, need_flush: bool) { + self.need_flush = need_flush; + } +} + +impl Apply { + #[inline] + pub fn on_start_apply(&mut self) { + self.apply_flow_control_mut().timer = Instant::now_coarse(); + } + + #[inline] + fn should_skip(&self, off: usize, index: u64) -> bool { + let log_recovery = self.log_recovery(); + if log_recovery.is_none() { + return false; + } + log_recovery.as_ref().unwrap()[off] >= index + } +} + +impl Apply { + pub fn apply_unsafe_write(&mut self, data: Box<[u8]>) { + let decoder = match SimpleWriteReqDecoder::new( + |buf, index, term| parse_at(&self.logger, buf, index, term), + &self.logger, + &data, + u64::MAX, + u64::MAX, + ) { + Ok(decoder) => decoder, + Err(req) => unreachable!("unexpected request: {:?}", req), + }; + for req in decoder { + match req { + SimpleWrite::Put(put) => { + let _ = self.apply_put(put.cf, u64::MAX, put.key, put.value); + } + SimpleWrite::Delete(delete) => { + let _ = self.apply_delete(delete.cf, u64::MAX, delete.key); + } + SimpleWrite::DeleteRange(dr) => { + let _ = self.apply_delete_range( + 
dr.cf, + u64::MAX, + dr.start_key, + dr.end_key, + dr.notify_only, + ); + } + SimpleWrite::Ingest(_) => { + error!( + self.logger, + "IngestSST is not supposed to be called on local engine" + ); + } + } + } + self.apply_flow_control_mut().need_flush = true; + } + + pub async fn on_manual_flush(&mut self) { + let written_bytes = self.flush(); + if let Err(e) = self.tablet().flush_cfs(&[], false) { + warn!(self.logger, "failed to flush: {:?}", e); + } + self.maybe_reschedule(written_bytes).await + } + + pub fn on_refresh_buckets(&mut self, meta: Arc) { + let mut new = BucketStat::from_meta(meta); + if let Some(origin) = self.buckets.as_ref() { + new.merge(origin); + } + self.buckets.replace(new); + } + + #[inline] + pub async fn apply_committed_entries(&mut self, ce: CommittedEntries) { + fail::fail_point!("APPLY_COMMITTED_ENTRIES"); + let now = std::time::Instant::now(); + let apply_wait_time = APPLY_TASK_WAIT_TIME_HISTOGRAM.local(); + for (e, ch) in ce.entry_and_proposals { + if self.tombstone() { + apply::notify_req_region_removed(self.region_id(), ch); + continue; + } + if !e.get_data().is_empty() { + for tracker in ch.write_trackers() { + tracker.observe(now, &apply_wait_time, |t| &mut t.metrics.apply_wait_nanos); + } + let mut set_save_point = false; + if let Some(wb) = &mut self.write_batch { + wb.set_save_point(); + set_save_point = true; + } + let (req, resp) = match self.apply_entry(&e).await { + Ok(req_resp) => req_resp, + Err(e) => { + if let Some(wb) = &mut self.write_batch { + if set_save_point { + wb.rollback_to_save_point().unwrap(); + } else { + wb.clear(); + } + } + (RaftCmdRequest::default(), cmd_resp::new_error(e)) + } + }; + self.observe_apply(e.get_index(), e.get_term(), req, &resp); + self.callbacks_mut().push((ch, resp)); + } else { + assert!(ch.is_empty()); + } + // Flush may be triggered in the middle, so always update the index and term. 
+ self.set_apply_progress(e.index, e.term); + self.apply_flow_control_mut().need_flush = true; + } + } + + #[inline] + async fn apply_entry(&mut self, entry: &Entry) -> Result<(RaftCmdRequest, RaftCmdResponse)> { + let mut conf_change = None; + let log_index = entry.get_index(); + let req = match entry.get_entry_type() { + EntryType::EntryNormal => match SimpleWriteReqDecoder::new( + |buf, index, term| parse_at(&self.logger, buf, index, term), + &self.logger, + entry.get_data(), + log_index, + entry.get_term(), + ) { + Ok(decoder) => { + util::compare_region_epoch( + decoder.header().get_region_epoch(), + self.region(), + false, + true, + true, + )?; + let mut req = RaftCmdRequest::default(); + if self.observe().level != ObserveLevel::None { + req = decoder.to_raft_cmd_request(); + } + let resp = new_response(decoder.header()); + for req in decoder { + match req { + SimpleWrite::Put(put) => { + self.apply_put(put.cf, log_index, put.key, put.value)?; + } + SimpleWrite::Delete(delete) => { + self.apply_delete(delete.cf, log_index, delete.key)?; + } + SimpleWrite::DeleteRange(dr) => { + self.apply_delete_range( + dr.cf, + log_index, + dr.start_key, + dr.end_key, + dr.notify_only, + )?; + } + SimpleWrite::Ingest(ssts) => { + self.apply_ingest(log_index, ssts)?; + } + } + } + return Ok((req, resp)); + } + Err(req) => req, + }, + EntryType::EntryConfChange => { + let cc: ConfChange = + parse_at(&self.logger, entry.get_data(), log_index, entry.get_term()); + let req: RaftCmdRequest = + parse_at(&self.logger, cc.get_context(), log_index, entry.get_term()); + conf_change = Some(cc.into_v2()); + req + } + EntryType::EntryConfChangeV2 => { + let cc: ConfChangeV2 = + parse_at(&self.logger, entry.get_data(), log_index, entry.get_term()); + let req: RaftCmdRequest = + parse_at(&self.logger, cc.get_context(), log_index, entry.get_term()); + conf_change = Some(cc); + req + } + }; + + util::check_req_region_epoch(&req, self.region(), true)?; + if req.has_admin_request() { + let 
admin_req = req.get_admin_request(); + let (admin_resp, admin_result) = match req.get_admin_request().get_cmd_type() { + AdminCmdType::CompactLog => self.apply_compact_log(admin_req, log_index)?, + AdminCmdType::Split => self.apply_split(admin_req, log_index)?, + AdminCmdType::BatchSplit => self.apply_batch_split(admin_req, log_index)?, + AdminCmdType::PrepareMerge => self.apply_prepare_merge(admin_req, log_index)?, + AdminCmdType::CommitMerge => self.apply_commit_merge(admin_req, log_index).await?, + AdminCmdType::RollbackMerge => unimplemented!(), + AdminCmdType::TransferLeader => { + self.apply_transfer_leader(admin_req, entry.term)? + } + AdminCmdType::ChangePeer => { + self.apply_conf_change(log_index, admin_req, conf_change.unwrap())? + } + AdminCmdType::ChangePeerV2 => { + self.apply_conf_change_v2(log_index, admin_req, conf_change.unwrap())? + } + AdminCmdType::ComputeHash => unimplemented!(), + AdminCmdType::VerifyHash => unimplemented!(), + AdminCmdType::PrepareFlashback => unimplemented!(), + AdminCmdType::FinishFlashback => unimplemented!(), + AdminCmdType::BatchSwitchWitness => unimplemented!(), + AdminCmdType::UpdateGcPeer => self.apply_update_gc_peer(log_index, admin_req), + AdminCmdType::InvalidAdmin => { + return Err(box_err!("invalid admin command type")); + } + }; + + match admin_result { + AdminCmdResult::None => (), + _ => self.push_admin_result(admin_result), + } + let mut resp = new_response(req.get_header()); + resp.set_admin_response(admin_resp); + Ok((req, resp)) + } else { + for r in req.get_requests() { + match r.get_cmd_type() { + // These three writes should all use the new codec. Keep them here for + // backward compatibility. 
+ CmdType::Put => { + let put = r.get_put(); + self.apply_put(put.get_cf(), log_index, put.get_key(), put.get_value())?; + } + CmdType::Delete => { + let delete = r.get_delete(); + self.apply_delete(delete.get_cf(), log_index, delete.get_key())?; + } + CmdType::DeleteRange => { + let dr = r.get_delete_range(); + self.apply_delete_range( + dr.get_cf(), + log_index, + dr.get_start_key(), + dr.get_end_key(), + dr.get_notify_only(), + )?; + } + _ => unimplemented!(), + } + } + let resp = new_response(req.get_header()); + Ok((req, resp)) + } + } + + fn should_reschedule(&self, written_bytes: u64) -> bool { + let control = self.apply_flow_control(); + written_bytes >= control.yield_written_bytes + || control.timer.saturating_elapsed() >= control.yield_time + } + + pub async fn maybe_reschedule(&mut self, written_bytes: u64) { + if self.should_reschedule(written_bytes) { + yatp::task::future::reschedule().await; + self.apply_flow_control_mut().timer = Instant::now_coarse(); + } + } + + /// Check whether it needs to flush. + /// + /// We always batch as many inputs as possible, flush will only be triggered + /// when it has been processing too long. + pub async fn maybe_flush(&mut self) { + let buffer_keys = self.metrics.written_keys; + let control = self.apply_flow_control_mut(); + // Only re-evaluate once at least 128 more keys have been written since + // the last check. + if buffer_keys >= control.last_check_keys + 128 { + // Reschedule by write size was designed to keep too many deletes from + // impacting performance, so it doesn't need precise control. Checking bytes + // here may make the batch too small and hurt performance, hence 0 is passed + // as the written size. + if self.should_reschedule(0) { + let written_bytes = self.flush(); + self.maybe_reschedule(written_bytes).await; + } else { + self.apply_flow_control_mut().last_check_keys = self.metrics.written_keys; + } + } + } + + #[inline] + pub fn flush(&mut self) -> u64 { + // TODO: maybe we should check whether there is anything to flush. 
+ let (index, term) = self.apply_progress(); + let control = self.apply_flow_control_mut(); + control.last_check_keys = 0; + if !control.need_flush { + return 0; + } + control.need_flush = false; + let flush_state = self.flush_state().clone(); + if let Some(wb) = &self.write_batch && !wb.is_empty() { + self.perf_context().start_observe(); + let mut write_opt = WriteOptions::default(); + write_opt.set_disable_wal(true); + let wb = self.write_batch.as_mut().unwrap(); + if let Err(e) = wb.write_callback_opt(&write_opt, || { + flush_state.set_applied_index(index); + }) { + slog_panic!(self.logger, "failed to write data"; "error" => ?e); + } + self.metrics.written_bytes += wb.data_size() as u64; + self.metrics.written_keys += wb.count() as u64; + if wb.data_size() <= APPLY_WB_SHRINK_SIZE { + wb.clear(); + } else { + self.write_batch.take(); + } + let tokens: Vec<_> = self + .callbacks_mut() + .iter() + .flat_map(|(v, _)| { + v.write_trackers() + .flat_map(|t| t.as_tracker_token()) + }) + .collect(); + self.perf_context().report_metrics(&tokens); + } + let mut apply_res = ApplyRes::default(); + apply_res.applied_index = index; + apply_res.applied_term = term; + apply_res.admin_result = self.take_admin_result().into_boxed_slice(); + apply_res.modifications = *self.modifications_mut(); + apply_res.metrics = mem::take(&mut self.metrics); + apply_res.bucket_stat = self.buckets.clone(); + let written_bytes = apply_res.metrics.written_bytes; + self.res_reporter().report(apply_res); + if let Some(buckets) = &mut self.buckets { + buckets.clear_stats(); + } + + // Call it before invoking callback for preventing Commit is executed before + // Prewrite is observed. + self.flush_observed_apply(); + + // Report result first and then invoking callbacks. This may delays callback a + // little bit, but can make sure all following messages must see the side + // effect of admin commands. 
+ let callbacks = self.callbacks_mut(); + let now = std::time::Instant::now(); + let apply_time = APPLY_TIME_HISTOGRAM.local(); + for (ch, resp) in callbacks.drain(..) { + for tracker in ch.write_trackers() { + let mut apply_wait_nanos = 0_u64; + let apply_time_nanos = tracker.observe(now, &apply_time, |t| { + apply_wait_nanos = t.metrics.apply_wait_nanos; + &mut t.metrics.apply_time_nanos + }); + STORE_APPLY_LOG_HISTOGRAM.observe(duration_to_sec(Duration::from_nanos( + apply_time_nanos - apply_wait_nanos, + ))); + } + ch.set_result(resp); + } + apply_time.flush(); + if callbacks.capacity() > SHRINK_PENDING_CMD_QUEUE_CAP { + callbacks.shrink_to(SHRINK_PENDING_CMD_QUEUE_CAP); + } + written_bytes + } +} diff --git a/components/raftstore-v2/src/operation/command/write/ingest.rs b/components/raftstore-v2/src/operation/command/write/ingest.rs new file mode 100644 index 00000000000..bc15765437f --- /dev/null +++ b/components/raftstore-v2/src/operation/command/write/ingest.rs @@ -0,0 +1,121 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use collections::HashMap; +use crossbeam::channel::TrySendError; +use engine_traits::{data_cf_offset, KvEngine, RaftEngine}; +use kvproto::import_sstpb::SstMeta; +use raftstore::{ + store::{check_sst_for_ingestion, metrics::PEER_WRITE_CMD_COUNTER, util}, + Result, +}; +use slog::error; +use tikv_util::{box_try, slog_panic}; + +use crate::{ + batch::StoreContext, + fsm::{ApplyResReporter, Store, StoreFsmDelegate}, + raft::{Apply, Peer}, + router::{PeerMsg, StoreTick}, + worker::tablet, +}; + +impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { + #[inline] + pub fn on_cleanup_import_sst(&mut self) { + if let Err(e) = self.fsm.store.on_cleanup_import_sst(self.store_ctx) { + error!(self.fsm.store.logger(), "cleanup import sst failed"; "error" => ?e); + } + self.schedule_tick( + StoreTick::CleanupImportSst, + self.store_ctx.cfg.cleanup_import_sst_interval.0, + ); + } +} + +impl Store { + #[inline] + fn on_cleanup_import_sst( + &mut self, + ctx: &mut StoreContext, + ) -> Result<()> { + let ssts = box_try!(ctx.sst_importer.list_ssts()); + if ssts.is_empty() { + return Ok(()); + } + let mut region_ssts: HashMap<_, Vec<_>> = HashMap::default(); + for sst in ssts { + region_ssts + .entry(sst.get_region_id()) + .or_default() + .push(sst); + } + for (region_id, ssts) in region_ssts { + if let Err(TrySendError::Disconnected(msg)) = ctx.router.send(region_id, PeerMsg::CleanupImportSst(ssts.into())) + && !ctx.router.is_shutdown() { + let PeerMsg::CleanupImportSst(ssts) = msg else { unreachable!() }; + let _ = ctx.schedulers.tablet.schedule(tablet::Task::CleanupImportSst(ssts)); + } + } + + Ok(()) + } +} + +impl Peer { + pub fn on_cleanup_import_sst( + &mut self, + ctx: &mut StoreContext, + ssts: Box<[SstMeta]>, + ) { + let epoch = self.region().get_region_epoch(); + let mut stale_ssts = Vec::from(ssts); + stale_ssts.retain(|sst| util::is_epoch_stale(sst.get_region_epoch(), epoch)); + if stale_ssts.is_empty() { + return; + } + let _ = ctx + .schedulers + 
.tablet + .schedule(tablet::Task::CleanupImportSst(stale_ssts.into())); + } +} + +impl Apply { + #[inline] + pub fn apply_ingest(&mut self, index: u64, ssts: Vec) -> Result<()> { + PEER_WRITE_CMD_COUNTER.ingest_sst.inc(); + let mut infos = Vec::with_capacity(ssts.len()); + for sst in &ssts { + // This may not be enough as ingest sst may not trigger flush at all. + let off = data_cf_offset(sst.get_cf_name()); + if self.should_skip(off, index) { + continue; + } + if let Err(e) = check_sst_for_ingestion(sst, self.region()) { + error!( + self.logger, + "ingest fail"; + "sst" => ?sst, + "region" => ?self.region(), + "error" => ?e + ); + // Best-effort removal of the rejected sst; the deletion result is ignored. + let _ = self.sst_importer().delete(sst); + return Err(e); + } + match self.sst_importer().validate(sst) { + Ok(meta_info) => infos.push(meta_info), + Err(e) => { + slog_panic!(self.logger, "corrupted sst"; "sst" => ?sst, "error" => ?e); + } + } + } + if !infos.is_empty() { + // Unlike v1, we can't batch ssts across regions. + // Flush the pending writes before ingesting the validated ssts. + self.flush(); + if let Err(e) = self.sst_importer().ingest(&infos, self.tablet()) { + slog_panic!(self.logger, "ingest fail"; "ssts" => ?ssts, "error" => ?e); + } + } + Ok(()) + } +} diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs new file mode 100644 index 00000000000..9f4afec9ad6 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -0,0 +1,234 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use engine_traits::{data_cf_offset, KvEngine, Mutable, RaftEngine, CF_DEFAULT}; +use kvproto::raft_cmdpb::RaftRequestHeader; +use raftstore::{ + store::{ + cmd_resp, + fsm::{apply, MAX_PROPOSAL_SIZE_RATIO}, + metrics::PEER_WRITE_CMD_COUNTER, + msg::ErrorCallback, + util::{self, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER}, + }, + Error, Result, +}; +use tikv_util::slog_panic; + +use crate::{ + batch::StoreContext, + fsm::ApplyResReporter, + operation::SimpleWriteReqEncoder, + raft::{Apply, Peer}, + router::{ApplyTask, CmdResChannel}, +}; + +mod ingest; + +pub use raftstore::store::simple_write::{ + SimpleWrite, SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, +}; + +impl Peer { + #[inline] + pub fn on_simple_write( + &mut self, + ctx: &mut StoreContext, + header: Box, + data: SimpleWriteBinary, + ch: CmdResChannel, + ) { + if !self.serving() { + apply::notify_req_region_removed(self.region_id(), ch); + return; + } + if let Some(encoder) = self.simple_write_encoder_mut() { + if encoder.amend(&header, &data) { + encoder.add_response_channel(ch); + self.set_has_ready(); + return; + } + } + if let Err(e) = self.validate_command(&header, None, &mut ctx.raft_metrics) { + let resp = cmd_resp::new_error(e); + ch.report_error(resp); + return; + } + // To maintain propose order, we need to make pending proposal first. + self.propose_pending_writes(ctx); + if let Some(conflict) = self.proposal_control_mut().check_conflict(None) { + conflict.delay_channel(ch); + return; + } + if self.proposal_control().has_pending_prepare_merge() + || self.proposal_control().is_merging() + { + let resp = cmd_resp::new_error(Error::ProposalInMergingMode(self.region_id())); + ch.report_error(resp); + return; + } + // ProposalControl is reliable only when applied to current term. 
+ let call_proposed_on_success = self.applied_to_current_term(); + let mut encoder = SimpleWriteReqEncoder::new( + header, + data, + (ctx.cfg.raft_entry_max_size.0 as f64 * MAX_PROPOSAL_SIZE_RATIO) as usize, + call_proposed_on_success, + ); + encoder.add_response_channel(ch); + self.set_has_ready(); + self.simple_write_encoder_mut().replace(encoder); + } + + #[inline] + pub fn on_unsafe_write( + &mut self, + ctx: &mut StoreContext, + data: SimpleWriteBinary, + ) { + if !self.serving() { + return; + } + let bin = SimpleWriteReqEncoder::new( + Box::::default(), + data, + ctx.cfg.raft_entry_max_size.0 as usize, + false, + ) + .encode() + .0 + .into_boxed_slice(); + if let Some(scheduler) = self.apply_scheduler() { + scheduler.send(ApplyTask::UnsafeWrite(bin)); + } + } + + pub fn propose_pending_writes(&mut self, ctx: &mut StoreContext) { + if let Some(encoder) = self.simple_write_encoder_mut().take() { + let call_proposed_on_success = if encoder.notify_proposed() { + // The request has passed the conflict check and all proposed callbacks + // have been called. + false + } else { + // Epoch may have changed since last check. + let from_epoch = encoder.header().get_region_epoch(); + let res = util::compare_region_epoch( + from_epoch, + self.region(), + NORMAL_REQ_CHECK_CONF_VER, + NORMAL_REQ_CHECK_VER, + true, + ); + if let Err(e) = res { + // TODO: query sibling regions. + // Report the epoch error through the encoded channels and give up + // proposing this batch. + ctx.raft_metrics.invalid_proposal.epoch_not_match.inc(); + encoder.encode().1.report_error(cmd_resp::new_error(e)); + return; + } + // The epoch check is reliable only when it has applied to the current term. 
+ self.applied_to_current_term() + }; + let (data, chs) = encoder.encode(); + let res = self.propose(ctx, data); + self.post_propose_command(ctx, res, chs, call_proposed_on_success); + } + } +} + +impl Apply { + #[inline] + pub fn apply_put(&mut self, cf: &str, index: u64, key: &[u8], value: &[u8]) -> Result<()> { + PEER_WRITE_CMD_COUNTER.put.inc(); + let off = data_cf_offset(cf); + if self.should_skip(off, index) { + return Ok(()); + } + util::check_key_in_region(key, self.region())?; + if let Some(s) = self.buckets.as_mut() { + s.write_key(key, value.len() as u64); + } + // Technically it's OK to remove prefix for raftstore v2. But rocksdb doesn't + // support specifying infinite upper bound in various APIs. + keys::data_key_with_buffer(key, &mut self.key_buffer); + self.ensure_write_buffer(); + let res = if cf.is_empty() || cf == CF_DEFAULT { + // TODO: use write_vector + self.write_batch + .as_mut() + .unwrap() + .put(&self.key_buffer, value) + } else { + self.write_batch + .as_mut() + .unwrap() + .put_cf(cf, &self.key_buffer, value) + }; + res.unwrap_or_else(|e| { + slog_panic!( + self.logger, + "failed to write"; + "key" => %log_wrappers::Value::key(key), + "value" => %log_wrappers::Value::value(value), + "cf" => cf, + "error" => ?e + ); + }); + fail::fail_point!("APPLY_PUT", |_| Err(raftstore::Error::Other( + "aborted by failpoint".into() + ))); + self.metrics.size_diff_hint += (self.key_buffer.len() + value.len()) as i64; + if index != u64::MAX { + self.modifications_mut()[off] = index; + } + Ok(()) + } + + #[inline] + pub fn apply_delete(&mut self, cf: &str, index: u64, key: &[u8]) -> Result<()> { + PEER_WRITE_CMD_COUNTER.delete.inc(); + let off = data_cf_offset(cf); + if self.should_skip(off, index) { + return Ok(()); + } + util::check_key_in_region(key, self.region())?; + if let Some(s) = self.buckets.as_mut() { + s.write_key(key, 0); + } + keys::data_key_with_buffer(key, &mut self.key_buffer); + self.ensure_write_buffer(); + let res = if cf.is_empty() 
|| cf == CF_DEFAULT { + // TODO: use write_vector + self.write_batch.as_mut().unwrap().delete(&self.key_buffer) + } else { + self.write_batch + .as_mut() + .unwrap() + .delete_cf(cf, &self.key_buffer) + }; + res.unwrap_or_else(|e| { + slog_panic!( + self.logger, + "failed to delete"; + "key" => %log_wrappers::Value::key(key), + "cf" => cf, + "error" => ?e + ); + }); + self.metrics.size_diff_hint -= self.key_buffer.len() as i64; + if index != u64::MAX { + self.modifications_mut()[off] = index; + } + Ok(()) + } + + /// Placeholder for range deletion: every argument is currently ignored and + /// `Ok(())` is returned without touching the write batch. + #[inline] + pub fn apply_delete_range( + &mut self, + _cf: &str, + _index: u64, + _start_key: &[u8], + _end_key: &[u8], + _notify_only: bool, + ) -> Result<()> { + // TODO: reuse the same delete as split/merge. + Ok(()) + } +} diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs new file mode 100644 index 00000000000..8b431ad3a98 --- /dev/null +++ b/components/raftstore-v2/src/operation/life.rs @@ -0,0 +1,757 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module implements the creation and destruction of peer. +//! +//! A peer can only be created by either: +//! - bootstrapping a cluster, it's covered in crate::bootstrap; +//! - receiving a RaftMessage. +//! +//! In v1, it can also be created by split. In v2, it's required to create by +//! sending a message to store fsm first, and then using split to initialize +//! the peer. +//! +//! A peer can only be removed in a raft group by conf change or merge. When +//! applying conf change, removed peer is added to `removed_records`; when +//! applying merge, source peer is added to `merged_records`. Quorum must agree +//! on the removal, but the removed peer may not necessarily be in the quorum. +//! So the peer may not really destroy itself until either: +//! - applying conf change remove; +//! - receiving a RaftMessage with `is_tombstone` set; +//! - receiving a RaftMessage targeting larger ID. +//! +//! 
Leader is responsible to keep polling all removed peers and guarantee they +//! are really destroyed. A peer is considered destroyed only when a tombstone +//! record with the same ID or larger ID is persisted. For `removed_records`, +//! leader only needs to send a message with `is_tombstone` set. For +//! `merged_records`, to avoid race between destroy and merge, leader needs to +//! ask target peer to destroy source peer. + +use std::{cmp, mem}; + +use batch_system::BasicMailbox; +use crossbeam::channel::{SendError, TrySendError}; +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; +use kvproto::{ + metapb::{self, Region}, + raft_cmdpb::{AdminCmdType, RaftCmdRequest}, + raft_serverpb::{ExtraMessage, ExtraMessageType, PeerState, RaftMessage}, +}; +use raftstore::store::{ + fsm::life::{build_peer_destroyed_report, forward_destroy_to_source_peer}, + metrics::RAFT_PEER_PENDING_DURATION, + util, Transport, WriteTask, +}; +use slog::{debug, error, info, warn}; +use tikv_util::{ + store::find_peer, + time::{duration_to_sec, Instant}, +}; + +use super::command::SplitInit; +use crate::{ + batch::StoreContext, + fsm::{PeerFsm, Store}, + operation::command::report_split_init_finish, + raft::{Peer, Storage}, + router::{CmdResChannel, PeerMsg, PeerTick}, +}; + +/// When a peer is about to be destroyed, it becomes `WaitReady` first. If there +/// is no pending asynchronous apply, it becomes `Destroying` and then starts +/// destroying asynchronously while handling ready. After the asynchronous +/// destruction is finished, it becomes `Destroyed`. +pub enum DestroyProgress { + /// Alive means destroy is not triggered at all. It's the same as None for + /// `Option`. Not using Option to avoid unwrap everywhere. + None, + /// If the destroy is triggered by message, then the message will be used + /// for creating new peer immediately. 
+ WaitReady(Option>), + Destroying(Option>), + Destroyed, +} + +impl DestroyProgress { + #[inline] + pub fn started(&self) -> bool { + matches!( + self, + DestroyProgress::Destroying(_) | DestroyProgress::Destroyed + ) + } + + #[inline] + pub fn waiting(&self) -> bool { + matches!(self, DestroyProgress::WaitReady(_)) + } + + #[inline] + fn start(&mut self) { + match self { + DestroyProgress::WaitReady(msg) => *self = DestroyProgress::Destroying(msg.take()), + _ => panic!("must wait ready first to start destroying"), + } + } + + #[inline] + fn wait_with(&mut self, triggered_msg: Option>) { + match self { + DestroyProgress::None => *self = DestroyProgress::WaitReady(triggered_msg), + _ => panic!("must be alive to wait"), + } + } + + #[inline] + fn finish(&mut self) -> Option> { + match self { + DestroyProgress::Destroying(msg) => { + let msg = msg.take(); + *self = DestroyProgress::Destroyed; + msg + } + _ => panic!("must be destroying to finish"), + } + } +} + +#[derive(Default)] +pub struct AbnormalPeerContext { + /// Record the instants of peers being added into the configuration. + /// Remove them after they are not pending any more. + /// (u64, Instant) represents (peer id, time when peer starts pending) + pending_peers: Vec<(u64, Instant)>, + /// An inaccurate cache about which peer is marked as down. 
+    down_peers: Vec<u64>,
+}
+
+impl AbnormalPeerContext {
+    /// Returns `true` when neither pending peers nor down peers are tracked.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.pending_peers.is_empty() && self.down_peers.is_empty()
+    }
+
+    /// Forgets every tracked pending peer and down peer.
+    #[inline]
+    pub fn reset(&mut self) {
+        self.pending_peers.clear();
+        self.down_peers.clear();
+    }
+
+    /// IDs of the peers currently cached as down.
+    #[inline]
+    pub fn down_peers(&self) -> &[u64] {
+        &self.down_peers
+    }
+
+    /// Mutable access to the cached IDs of down peers.
+    #[inline]
+    pub fn down_peers_mut(&mut self) -> &mut Vec<u64> {
+        &mut self.down_peers
+    }
+
+    /// `(peer id, time when the peer started pending)` records.
+    #[inline]
+    pub fn pending_peers(&self) -> &[(u64, Instant)] {
+        &self.pending_peers
+    }
+
+    /// Mutable access to the pending peer records.
+    #[inline]
+    pub fn pending_peers_mut(&mut self) -> &mut Vec<(u64, Instant)> {
+        &mut self.pending_peers
+    }
+
+    /// Keeps only the pending peer records for which `f` returns `true`.
+    ///
+    /// Returns `true` if at least one record was removed.
+    #[inline]
+    pub fn retain_pending_peers(&mut self, f: impl FnMut(&mut (u64, Instant)) -> bool) -> bool {
+        let len = self.pending_peers.len();
+        self.pending_peers.retain_mut(f);
+        len != self.pending_peers.len()
+    }
+
+    /// Observes, for every pending peer, how long it has been pending so far.
+    #[inline]
+    pub fn flush_metrics(&self) {
+        // Iterate eagerly: iterator adaptors such as `map` are lazy, so building
+        // one and dropping it would never record a single sample.
+        for (_, pending_after) in &self.pending_peers {
+            let elapsed = duration_to_sec(pending_after.saturating_elapsed());
+            RAFT_PEER_PENDING_DURATION.observe(elapsed);
+        }
+    }
+}
+
+/// IDs of the removed or merged peers that have confirmed they are destroyed.
+#[derive(Default)]
+pub struct GcPeerContext {
+    confirmed_ids: Vec<u64>,
+}
+
+fn check_if_to_peer_destroyed<ER: RaftEngine>(
+    engine: &ER,
+    msg: &RaftMessage,
+    store_id: u64,
+) -> engine_traits::Result<bool> {
+    let region_id = msg.get_region_id();
+    let to_peer = msg.get_to_peer();
+    let local_state = match engine.get_region_state(region_id, u64::MAX)? {
+        Some(s) => s,
+        None => return Ok(false),
+    };
+    // Split will not create peer in v2, so the state must be Tombstone.
+    if local_state.get_state() != PeerState::Tombstone {
+        panic!(
+            "[region {}] {} peer doesn't exist but has valid local state {:?}",
+            region_id, to_peer.id, local_state
+        );
+    }
+    // Compared to v1, we rely on leader to confirm destroy actively, so here
+    // skip handling gc for simplicity.
+ let local_epoch = local_state.get_region().get_region_epoch(); + // The region in this peer is already destroyed + if util::is_epoch_stale(msg.get_region_epoch(), local_epoch) { + return Ok(true); + } + if let Some(local_peer) = find_peer(local_state.get_region(), store_id) && to_peer.id <= local_peer.get_id() { + return Ok(true); + } + // If the peer is destroyed by conf change, all above checks will pass. + if local_state + .get_removed_records() + .iter() + .find(|p| p.get_store_id() == store_id) + .map_or(false, |p| to_peer.id <= p.get_id()) + { + return Ok(true); + } + Ok(false) +} + +// An empty raft message for creating peer fsm. +fn empty_split_message(store_id: u64, region: &Region) -> Box { + let mut raft_msg = Box::::default(); + raft_msg.set_region_id(region.get_id()); + raft_msg.set_region_epoch(region.get_region_epoch().clone()); + raft_msg.set_to_peer( + region + .get_peers() + .iter() + .find(|p| p.get_store_id() == store_id) + .unwrap() + .clone(), + ); + raft_msg +} + +pub fn is_empty_split_message(msg: &RaftMessage) -> bool { + !msg.has_from_peer() && msg.has_to_peer() && msg.has_region_epoch() && !msg.has_message() +} + +impl Store { + /// The method is called during split. + /// The creation process is: + /// 1. create an uninitialized peer if not existed before + /// 2. 
initialize the peer by the information sent from parent peer + #[inline] + pub fn on_split_init( + &mut self, + ctx: &mut StoreContext, + msg: Box, + ) where + EK: KvEngine, + ER: RaftEngine, + T: Transport, + { + let derived_region_id = msg.derived_region_id; + let region_id = msg.region.id; + let raft_msg = empty_split_message(self.store_id(), &msg.region); + + (|| { + fail::fail_point!( + "on_store_2_split_init_race_with_initial_message", + self.store_id() == 2, + |_| { + let mut initial_msg = raft_msg.clone(); + initial_msg.set_from_peer( + msg.region + .get_peers() + .iter() + .find(|p| p.get_store_id() != self.store_id()) + .unwrap() + .clone(), + ); + let m = initial_msg.mut_message(); + m.set_msg_type(raft::prelude::MessageType::MsgRequestPreVote); + m.set_term(raftstore::store::RAFT_INIT_LOG_TERM); + m.set_index(raftstore::store::RAFT_INIT_LOG_INDEX); + assert!(util::is_initial_msg(initial_msg.get_message())); + self.on_raft_message(ctx, initial_msg); + } + ) + })(); + + // It will create the peer if it does not exist + self.on_raft_message(ctx, raft_msg); + + if let Err(SendError(m)) = ctx.router.force_send(region_id, PeerMsg::SplitInit(msg)) { + warn!( + self.logger(), + "Split peer is destroyed before sending the intialization msg"; + "split init msg" => ?m, + ); + report_split_init_finish(ctx, derived_region_id, region_id, true); + } + } + + #[inline] + pub fn on_ask_commit_merge( + &mut self, + ctx: &mut StoreContext, + req: RaftCmdRequest, + ) where + EK: KvEngine, + ER: RaftEngine, + T: Transport, + { + let region_id = req.get_header().get_region_id(); + let mut raft_msg = Box::::default(); + raft_msg.set_region_id(region_id); + raft_msg.set_region_epoch(req.get_header().get_region_epoch().clone()); + raft_msg.set_to_peer(req.get_header().get_peer().clone()); + + // It will create the peer if it does not exist + self.on_raft_message(ctx, raft_msg); + + if let Err(SendError(PeerMsg::AskCommitMerge(req))) = ctx + .router + .force_send(region_id, 
PeerMsg::AskCommitMerge(req)) + { + let commit_merge = req.get_admin_request().get_commit_merge(); + let source_id = commit_merge.get_source().get_id(); + let _ = ctx.router.force_send( + source_id, + PeerMsg::RejectCommitMerge { + index: commit_merge.get_commit(), + }, + ); + } + } + + /// When a message's recipient doesn't exist, it will be redirected to + /// store. Store is responsible for checking if it's neccessary to create + /// a peer to handle the message. + #[inline] + pub fn on_raft_message( + &mut self, + ctx: &mut StoreContext, + msg: Box, + ) where + EK: KvEngine, + ER: RaftEngine, + T: Transport, + { + let region_id = msg.get_region_id(); + // The message can be sent when the peer is being created, so try send it first. + let mut msg = if let Err(TrySendError::Disconnected(PeerMsg::RaftMessage(m))) = + ctx.router.send(region_id, PeerMsg::RaftMessage(msg)) + { + m + } else { + return; + }; + let from_peer = msg.get_from_peer(); + let to_peer = msg.get_to_peer(); + // Now the peer should not exist. + debug!( + self.logger(), + "handle raft message"; + "from_peer_id" => from_peer.id, + "to_peer_id" => to_peer.id, + "region_id" => region_id, + "msg_type" => %util::MsgType(&msg) + ); + if to_peer.store_id != self.store_id() { + ctx.raft_metrics.message_dropped.mismatch_store_id.inc(); + return; + } + if !msg.has_region_epoch() { + ctx.raft_metrics.message_dropped.mismatch_region_epoch.inc(); + return; + } + if msg.has_merge_target() { + // Target tombstone peer doesn't exist, so ignore it. 
+ ctx.raft_metrics.message_dropped.stale_msg.inc(); + return; + } + let destroyed = match check_if_to_peer_destroyed(&ctx.engine, &msg, self.store_id()) { + Ok(d) => d, + Err(e) => { + error!(self.logger(), "failed to get region state"; "region_id" => region_id, "err" => ?e); + return; + } + }; + if destroyed { + if msg.get_is_tombstone() { + if let Some(msg) = build_peer_destroyed_report(&mut msg) { + let _ = ctx.trans.send(msg); + } + return; + } + if msg.has_extra_msg() { + let extra_msg = msg.get_extra_msg(); + // Only the direct request has `is_tombstone` set to false. We are certain this + // message needs to be forwarded. + if extra_msg.get_type() == ExtraMessageType::MsgGcPeerRequest + && extra_msg.has_check_gc_peer() + { + forward_destroy_to_source_peer(&msg, |m| { + let _ = ctx.router.send_raft_message(m.into()); + }); + return; + } + } + ctx.raft_metrics.message_dropped.region_tombstone_peer.inc(); + return; + } + // If it's not destroyed, and the message is a tombstone message, create the + // peer and destroy immediately to leave a tombstone record. + + // So the peer must need to be created. We don't need to synchronous with split + // as split won't create peer in v2. And we don't check for range + // conflict as v2 depends on tablet, which allows conflict ranges. + let mut region = Region::default(); + region.set_id(region_id); + region.set_region_epoch(msg.get_region_epoch().clone()); + + // Peer list doesn't have to be complete, as it's uninitialized. + // + // If the id of the from_peer is INVALID_ID, this msg must be sent from parent + // peer in the split execution in which case we do not add it into the region. 
+ if from_peer.id != raft::INVALID_ID + // Check merge may be sent from different region + && (msg.get_extra_msg().get_type() != ExtraMessageType::MsgGcPeerRequest + || msg.get_extra_msg().get_check_gc_peer().get_from_region_id() == region_id) + { + region.mut_peers().push(from_peer.clone()); + } + region.mut_peers().push(to_peer.clone()); + // We don't set the region range here as we allow range conflict. + let (tx, fsm) = match Storage::uninit( + self.store_id(), + region, + ctx.engine.clone(), + ctx.schedulers.read.clone(), + &ctx.logger, + ) + .and_then(|s| { + PeerFsm::new( + &ctx.cfg, + &ctx.tablet_registry, + ctx.key_manager.as_deref(), + &ctx.snap_mgr, + s, + ) + }) { + Ok(p) => p, + res => { + error!(self.logger(), "failed to create peer"; "region_id" => region_id, "peer_id" => to_peer.id, "err" => ?res.err()); + return; + } + }; + ctx.store_meta + .lock() + .unwrap() + .set_region(fsm.peer().region(), false, fsm.logger()); + let mailbox = BasicMailbox::new(tx, fsm, ctx.router.state_cnt().clone()); + if ctx + .router + .send_and_register(region_id, mailbox, PeerMsg::Start) + .is_err() + { + panic!( + "[region {}] {} failed to register peer", + region_id, to_peer.id + ); + } + // Only forward valid message. Split may use a message without sender to trigger + // creating a peer. + if from_peer.id != raft::INVALID_ID { + // For now the peer only exists in memory. It will persist its states when + // handling its first readiness. 
+ let _ = ctx.router.send(region_id, PeerMsg::RaftMessage(msg)); + } + } +} + +impl Peer { + pub fn on_availability_request( + &mut self, + ctx: &mut StoreContext, + from_region_id: u64, + from_peer: &metapb::Peer, + ) { + let mut msg = RaftMessage::default(); + msg.set_region_id(from_region_id); + msg.set_from_peer(self.peer().clone()); + msg.set_to_peer(from_peer.clone()); + msg.mut_extra_msg() + .set_type(ExtraMessageType::MsgAvailabilityResponse); + let report = msg.mut_extra_msg().mut_availability_context(); + report.set_from_region_id(self.region_id()); + report.set_from_region_epoch(self.region().get_region_epoch().clone()); + report.set_trimmed(!self.storage().has_dirty_data()); + let _ = ctx.trans.send(msg); + } + + #[inline] + pub fn on_availability_response( + &mut self, + ctx: &mut StoreContext, + from_peer: u64, + resp: &ExtraMessage, + ) { + self.merge_on_availability_response(ctx, from_peer, resp); + } + + pub fn maybe_schedule_gc_peer_tick(&mut self) { + let region_state = self.storage().region_state(); + if !region_state.get_removed_records().is_empty() + || !region_state.get_merged_records().is_empty() + { + self.add_pending_tick(PeerTick::GcPeer); + } + } + + /// Returns `true` means the sender will be gced. The message is stale. + pub fn maybe_gc_sender(&mut self, msg: &RaftMessage) -> bool { + let removed_peers = self.storage().region_state().get_removed_records(); + // Only removed_records can be determined directly. 
+ if let Some(peer) = removed_peers + .iter() + .find(|p| p.id == msg.get_from_peer().get_id()) + { + let tombstone_msg = self.tombstone_message_for_same_region(peer.clone()); + self.add_message(tombstone_msg); + true + } else { + false + } + } + + fn tombstone_message_for_same_region(&self, peer: metapb::Peer) -> RaftMessage { + let region_id = self.region_id(); + let mut tombstone_message = RaftMessage::default(); + tombstone_message.set_region_id(region_id); + tombstone_message.set_from_peer(self.peer().clone()); + tombstone_message.set_to_peer(peer); + tombstone_message.set_region_epoch(self.region().get_region_epoch().clone()); + tombstone_message.set_is_tombstone(true); + tombstone_message + } + + pub fn on_tombstone_message(&mut self, msg: &mut RaftMessage) { + match msg.get_to_peer().get_id().cmp(&self.peer_id()) { + cmp::Ordering::Less => { + if let Some(msg) = build_peer_destroyed_report(msg) { + self.add_message(msg); + } + } + // No matter it's greater or equal, the current peer must be destroyed. + _ => { + self.mark_for_destroy(None); + } + } + } + + /// When leader tries to gc merged source peer, it will send a gc request to + /// target peer. If target peer makes sure the merged is finished, it + /// forward the message to source peer and let source peer send back a + /// response. + pub fn on_gc_peer_request( + &mut self, + ctx: &mut StoreContext, + msg: &RaftMessage, + ) { + let extra_msg = msg.get_extra_msg(); + if !extra_msg.has_check_gc_peer() || extra_msg.get_index() == 0 { + // Corrupted message. + return; + } + if self.storage().tablet_index() < extra_msg.get_index() { + // Merge not finish. + return; + } + + forward_destroy_to_source_peer(msg, |m| { + let _ = ctx.router.send_raft_message(m.into()); + }); + } + + /// A peer confirms it's destroyed. 
+ pub fn on_gc_peer_response(&mut self, msg: &RaftMessage) { + let gc_peer_id = msg.get_from_peer().get_id(); + let state = self.storage().region_state(); + if state + .get_removed_records() + .iter() + .all(|p| p.get_id() != gc_peer_id) + && state.get_merged_records().iter().all(|p| { + p.get_source_peers() + .iter() + .all(|p| p.get_id() != gc_peer_id) + }) + { + return; + } + let ctx = self.gc_peer_context_mut(); + if ctx.confirmed_ids.contains(&gc_peer_id) { + return; + } + ctx.confirmed_ids.push(gc_peer_id); + } + + pub fn on_gc_peer_tick(&mut self, ctx: &mut StoreContext) { + if !self.is_leader() { + return; + } + let state = self.storage().region_state(); + if state.get_removed_records().is_empty() && state.get_merged_records().is_empty() { + return; + } + let mut need_gc_ids = Vec::with_capacity(5); + let gc_context = self.gc_peer_context(); + for peer in state.get_removed_records() { + need_gc_ids.push(peer.get_id()); + if gc_context.confirmed_ids.contains(&peer.get_id()) { + continue; + } + + let msg = self.tombstone_message_for_same_region(peer.clone()); + // For leader, it's OK to send gc message immediately. + let _ = ctx.trans.send(msg); + } + for record in state.get_merged_records() { + // For merge, we ask target to check whether source should be deleted. 
+ for (source, target) in record + .get_source_peers() + .iter() + .zip(record.get_target_peers()) + { + need_gc_ids.push(source.get_id()); + if gc_context.confirmed_ids.contains(&source.get_id()) { + continue; + } + + let mut msg = RaftMessage::default(); + msg.set_region_id(record.get_target_region_id()); + msg.set_from_peer(self.peer().clone()); + msg.set_to_peer(target.clone()); + msg.set_region_epoch(record.get_target_epoch().clone()); + let extra_msg = msg.mut_extra_msg(); + extra_msg.set_type(ExtraMessageType::MsgGcPeerRequest); + extra_msg.set_index(record.get_index()); + let check_peer = extra_msg.mut_check_gc_peer(); + check_peer.set_from_region_id(self.region_id()); + check_peer.set_check_region_id(record.get_source_region_id()); + check_peer.set_check_peer(source.clone()); + check_peer.set_check_region_epoch(record.get_source_epoch().clone()); + let _ = ctx.trans.send(msg); + } + } + let gc_ctx = self.gc_peer_context_mut(); + if !gc_ctx.confirmed_ids.is_empty() { + let mut confirmed_ids = mem::take(&mut gc_ctx.confirmed_ids); + confirmed_ids.retain(|id| need_gc_ids.contains(id)); + let mut req = RaftCmdRequest::default(); + let header = req.mut_header(); + header.set_region_id(self.region_id()); + header.set_peer(self.peer().clone()); + let admin = req.mut_admin_request(); + admin.set_cmd_type(AdminCmdType::UpdateGcPeer); + let gc_peer = admin.mut_update_gc_peers(); + gc_peer.set_peer_id(confirmed_ids); + let (ch, _) = CmdResChannel::pair(); + // It's OK to fail as we will retry by tick. + self.on_admin_command(ctx, req, ch); + } + self.maybe_schedule_gc_peer_tick(); + } + + /// A peer can be destroyed in three cases: + /// 1. Received a gc message; + /// 2. Received a message whose target peer's ID is larger than this; + /// 3. Applied a conf remove self command. + /// In all cases, the peer will be destroyed asynchronousely in next + /// handle_raft_ready. + /// `triggered_msg` will be sent to store fsm after destroy is finished. 
+ /// Should set the message only when the target peer is supposed to be + /// created afterward. + pub fn mark_for_destroy(&mut self, triggered_msg: Option>) { + if self.serving() { + self.destroy_progress_mut().wait_with(triggered_msg); + self.set_has_ready(); + } + } + + /// In v2, it's possible to destroy the peer without waiting for apply. But + /// we better wait till all previous entries are applied in case there + /// are split. It's a waste to use snapshot to restore newly split + /// tablet. + #[inline] + pub fn postponed_destroy(&self) -> bool { + let last_applying_index = self.compact_log_context().last_applying_index(); + let entry_storage = self.storage().entry_storage(); + // If it's marked as tombstone, then it must be changed by conf change. In + // this case, all following entries are skipped so applied_index never equals + // to last_applying_index. + (self.storage().region_state().get_state() != PeerState::Tombstone + && entry_storage.applied_index() != last_applying_index) + // Wait for critical commands like split. + || self.has_pending_tombstone_tablets() + } + + /// Start the destroy progress. It will write `Tombstone` state + /// asynchronously. + /// + /// After destroy is finished, `finish_destroy` should be called to clean up + /// memory states. + pub fn start_destroy( + &mut self, + ctx: &mut StoreContext, + write_task: &mut WriteTask, + ) { + if self.postponed_destroy() { + return; + } + let raft_engine = self.entry_storage().raft_engine(); + let mut region_state = self.storage().region_state().clone(); + let region_id = region_state.get_region().get_id(); + // Use extra write to ensure these writes are the last writes to raft engine. + let lb = write_task + .extra_write + .ensure_v2(|| raft_engine.log_batch(2)); + // We only use raft-log-engine for v2, first index and state are not important. 
+ let raft_state = self.entry_storage().raft_state(); + raft_engine.clean(region_id, 0, raft_state, lb).unwrap(); + region_state.set_state(PeerState::Tombstone); + let applied_index = self.entry_storage().applied_index(); + lb.put_region_state(region_id, applied_index, ®ion_state) + .unwrap(); + self.record_tombstone_tablet_for_destroy(ctx, write_task); + self.destroy_progress_mut().start(); + } + + /// Do clean up for destroy. The peer is permanently destroyed when + /// Tombstone state is persisted. This method is only for cleaning up + /// memory states. + pub fn finish_destroy(&mut self, ctx: &mut StoreContext) { + info!(self.logger, "peer destroyed"); + let region_id = self.region_id(); + { + let mut meta = ctx.store_meta.lock().unwrap(); + meta.remove_region(region_id); + meta.readers.remove(®ion_id); + ctx.tablet_registry.remove(region_id); + } + // Remove tablet first, otherwise in extreme cases, a new peer can be created + // and race on tablet record removal and creation. + ctx.router.close(region_id); + if let Some(msg) = self.destroy_progress_mut().finish() { + // The message will be dispatched to store fsm, which will create a + // new peer. Ignore error as it's just a best effort. + let _ = ctx.router.send_raft_message(msg); + } + self.pending_reads_mut().clear_all(Some(region_id)); + self.clear_apply_scheduler(); + } +} diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs new file mode 100644 index 00000000000..f5eb4ebdb6f --- /dev/null +++ b/components/raftstore-v2/src/operation/mod.rs @@ -0,0 +1,44 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+
+mod bucket;
+mod command;
+mod life;
+mod pd;
+mod query;
+mod ready;
+mod txn_ext;
+
+pub use command::{
+    AdminCmdResult, ApplyFlowControl, CatchUpLogs, CommittedEntries, CompactLogContext,
+    MergeContext, ProposalControl, RequestHalfSplit, RequestSplit, SimpleWriteBinary,
+    SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, SplitFlowControl,
+    MERGE_IN_PROGRESS_PREFIX, MERGE_SOURCE_PREFIX, SPLIT_PREFIX,
+};
+pub use life::{AbnormalPeerContext, DestroyProgress, GcPeerContext};
+pub use ready::{
+    write_initial_states, ApplyTrace, AsyncWriter, DataTrace, GenSnapTask, SnapState, StateStorage,
+};
+
+pub(crate) use self::{
+    bucket::BucketStatsInfo,
+    command::SplitInit,
+    query::{LocalReader, ReadDelegatePair, SharedReadTablet},
+    txn_ext::TxnContext,
+};
+
+/// Helpers shared by the unit tests of the operation modules.
+#[cfg(test)]
+pub mod test_util {
+    use std::sync::Arc;
+
+    use kvproto::kvrpcpb::ApiVersion;
+    use sst_importer::SstImporter;
+    use tempfile::TempDir;
+
+    /// Creates an `SstImporter` rooted in a freshly created temporary directory.
+    ///
+    /// The directory handle is returned together with the importer; keep it
+    /// around for as long as the importer is in use.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the temporary directory or the importer cannot be created.
+    pub fn create_tmp_importer() -> (TempDir, Arc<SstImporter>) {
+        let dir = TempDir::new().unwrap();
+        let importer = Arc::new(
+            SstImporter::new(&Default::default(), dir.path(), None, ApiVersion::V1).unwrap(),
+        );
+        (dir, importer)
+    }
+}
diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs
new file mode 100644
index 00000000000..7ad82959fa8
--- /dev/null
+++ b/components/raftstore-v2/src/operation/pd.rs
@@ -0,0 +1,249 @@
+// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0.
+
+//! This module implements the interactions with pd.
+ +use std::sync::atomic::Ordering; + +use engine_traits::{KvEngine, RaftEngine}; +use fail::fail_point; +use kvproto::{metapb, pdpb}; +use raftstore::store::{metrics::STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC, Transport}; +use slog::{debug, error}; +use tikv_util::{slog_panic, time::Instant}; + +use crate::{ + batch::StoreContext, + fsm::{PeerFsmDelegate, Store, StoreFsmDelegate}, + raft::Peer, + router::{CmdResChannel, PeerTick, StoreTick}, + worker::pd, +}; + +impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { + #[inline] + pub fn on_pd_store_heartbeat(&mut self) { + self.fsm.store.store_heartbeat_pd(self.store_ctx); + self.schedule_tick( + StoreTick::PdStoreHeartbeat, + self.store_ctx.cfg.pd_store_heartbeat_tick_interval.0, + ); + } +} + +impl Store { + pub fn store_heartbeat_pd(&self, ctx: &StoreContext) + where + EK: KvEngine, + ER: RaftEngine, + { + let mut stats = pdpb::StoreStats::default(); + + stats.set_store_id(self.store_id()); + { + let meta = ctx.store_meta.lock().unwrap(); + stats.set_region_count(meta.readers.len() as u32); + } + + let snap_stats = ctx.snap_mgr.stats(); + // todo: imple snapshot status report + stats.set_sending_snap_count(snap_stats.sending_count as u32); + stats.set_receiving_snap_count(snap_stats.receiving_count as u32); + stats.set_snapshot_stats(snap_stats.stats.into()); + + STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC + .with_label_values(&["sending"]) + .set(stats.get_sending_snap_count() as i64); + STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC + .with_label_values(&["receiving"]) + .set(stats.get_receiving_snap_count() as i64); + + stats.set_start_time(self.start_time().unwrap() as u32); + + stats.set_bytes_written( + ctx.global_stat + .stat + .engine_total_bytes_written + .swap(0, Ordering::Relaxed), + ); + stats.set_keys_written( + ctx.global_stat + .stat + .engine_total_keys_written + .swap(0, Ordering::Relaxed), + ); + stats.set_is_busy(false); + // TODO: add query stats + let task = pd::Task::StoreHeartbeat { stats }; + if let 
Err(e) = ctx.schedulers.pd.schedule(task) { + error!(self.logger(), "notify pd failed"; + "store_id" => self.store_id(), + "err" => ?e + ); + } + } +} + +impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { + #[inline] + pub fn on_pd_heartbeat(&mut self) { + self.fsm.peer_mut().update_peer_statistics(); + if self.fsm.peer().is_leader() { + self.fsm.peer_mut().region_heartbeat_pd(self.store_ctx); + } + // TODO: hibernate region + self.schedule_tick(PeerTick::PdHeartbeat); + } +} + +impl Peer { + #[inline] + pub fn region_heartbeat_pd(&mut self, ctx: &StoreContext) { + let task = pd::Task::RegionHeartbeat(pd::RegionHeartbeatTask { + term: self.term(), + region: self.region().clone(), + down_peers: self.collect_down_peers(ctx.cfg.max_peer_down_duration.0), + peer: self.peer().clone(), + pending_peers: self.collect_pending_peers(ctx), + written_bytes: self.self_stat().written_bytes, + written_keys: self.self_stat().written_keys, + approximate_size: self.split_flow_control_mut().approximate_size(), + approximate_keys: self.split_flow_control_mut().approximate_keys(), + wait_data_peers: Vec::new(), + }); + if let Err(e) = ctx.schedulers.pd.schedule(task) { + error!( + self.logger, + "failed to notify pd"; + "err" => ?e, + ); + return; + } + fail_point!("schedule_check_split"); + } + + /// Collects all pending peers and update `peers_start_pending_time`. 
+ fn collect_pending_peers(&mut self, ctx: &StoreContext) -> Vec { + let mut pending_peers = Vec::with_capacity(self.region().get_peers().len()); + let status = self.raft_group().status(); + let truncated_idx = self + .storage() + .apply_state() + .get_truncated_state() + .get_index(); + + if status.progress.is_none() { + return pending_peers; + } + + self.abnormal_peer_context().flush_metrics(); + + let progresses = status.progress.unwrap().iter(); + let mut peers_start_pending_time = Vec::with_capacity(self.region().get_peers().len()); + for (&id, progress) in progresses { + if id == self.peer_id() { + continue; + } + // The `matched` is 0 only in these two cases: + // 1. Current leader hasn't communicated with this peer. + // 2. This peer does not exist yet(maybe it is created but not initialized) + // + // The correctness of region merge depends on the fact that all target peers + // must exist during merging. (PD rely on `pending_peers` to check whether all + // target peers exist) + // + // So if the `matched` is 0, it must be a pending peer. + // It can be ensured because `truncated_index` must be greater than + // `RAFT_INIT_LOG_INDEX`(5). 
+ if progress.matched < truncated_idx { + if let Some(p) = self.peer_from_cache(id) { + pending_peers.push(p); + if !self + .abnormal_peer_context() + .pending_peers() + .iter() + .any(|p| p.0 == id) + { + let now = Instant::now(); + peers_start_pending_time.push((id, now)); + debug!( + self.logger, + "peer start pending"; + "get_peer_id" => id, + "time" => ?now, + ); + } + } else { + if ctx.cfg.dev_assert { + slog_panic!( + self.logger, + "failed to get peer from cache"; + "get_peer_id" => id + ); + } + error!( + self.logger, + "failed to get peer from cache"; + "get_peer_id" => id, + ); + } + } + } + self.abnormal_peer_context_mut() + .pending_peers_mut() + .append(&mut peers_start_pending_time); + pending_peers + } + + #[inline] + pub fn destroy_peer_pd(&self, ctx: &StoreContext) { + let task = pd::Task::DestroyPeer { + region_id: self.region_id(), + }; + if let Err(e) = ctx.schedulers.pd.schedule(task) { + error!( + self.logger, + "failed to notify pd with DestroyPeer"; + "err" => %e, + ); + } + } + + #[inline] + pub fn ask_batch_split_pd( + &self, + ctx: &StoreContext, + split_keys: Vec>, + ch: CmdResChannel, + ) { + let task = pd::Task::AskBatchSplit { + region: self.region().clone(), + split_keys, + peer: self.peer().clone(), + right_derive: ctx.cfg.right_derive_when_split, + ch, + }; + if let Err(e) = ctx.schedulers.pd.schedule(task) { + error!( + self.logger, + "failed to notify pd with AskBatchSplit"; + "err" => %e, + ); + } + } + + #[inline] + pub fn report_batch_split_pd( + &self, + ctx: &StoreContext, + regions: Vec, + ) { + let task = pd::Task::ReportBatchSplit { regions }; + if let Err(e) = ctx.schedulers.pd.schedule(task) { + error!( + self.logger, + "failed to notify pd with ReportBatchSplit"; + "err" => %e, + ); + } + } +} diff --git a/components/raftstore-v2/src/operation/query/capture.rs b/components/raftstore-v2/src/operation/query/capture.rs new file mode 100644 index 00000000000..5393dfacc98 --- /dev/null +++ 
b/components/raftstore-v2/src/operation/query/capture.rs @@ -0,0 +1,413 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use engine_traits::{KvEngine, RaftEngine}; +use fail::fail_point; +use kvproto::raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}; +use raftstore::{ + coprocessor::{Cmd, CmdBatch, ObserveHandle, ObserveLevel}, + store::{ + cmd_resp, + fsm::{ + apply::{notify_stale_req_with_msg, ObserverType, SHRINK_PENDING_CMD_QUEUE_CAP}, + new_read_index_request, ChangeObserver, + }, + msg::ErrorCallback, + util::compare_region_epoch, + RegionSnapshot, + }, +}; +use slog::info; + +use crate::{ + fsm::{ApplyResReporter, PeerFsmDelegate}, + raft::Apply, + router::{message::CaptureChange, ApplyTask, QueryResChannel, QueryResult}, +}; + +impl<'a, EK: KvEngine, ER: RaftEngine, T: raftstore::store::Transport> + PeerFsmDelegate<'a, EK, ER, T> +{ + pub fn on_leader_callback(&mut self, ch: QueryResChannel) { + let peer = self.fsm.peer(); + let msg = new_read_index_request( + peer.region_id(), + peer.region().get_region_epoch().clone(), + peer.peer().clone(), + ); + self.on_query(msg, ch); + } + + pub fn on_capture_change(&mut self, capture_change: CaptureChange) { + fail_point!("raft_on_capture_change"); + + // TODO: Allow to capture change even is in flashback state. + // TODO: add a test case for this kind of situation. 
+ + let apply_router = self.fsm.peer().apply_scheduler().unwrap().clone(); + let (ch, _) = QueryResChannel::with_callback(Box::new(move |res| { + if let QueryResult::Response(resp) = res && resp.get_header().has_error() { + // Return error + capture_change.snap_cb.report_error(resp.clone()); + return; + } + apply_router.send(ApplyTask::CaptureApply(capture_change)) + })); + self.on_leader_callback(ch); + } +} + +impl Apply { + pub fn on_capture_apply(&mut self, capture_change: CaptureChange) { + let CaptureChange { + observer, + region_epoch, + snap_cb, + } = capture_change; + let ChangeObserver { region_id, ty } = observer; + + let is_stale_cmd = match ty { + ObserverType::Cdc(ObserveHandle { id, .. }) => self.observe().info.cdc_id.id > id, + ObserverType::Rts(ObserveHandle { id, .. }) => self.observe().info.rts_id.id > id, + ObserverType::Pitr(ObserveHandle { id, .. }) => self.observe().info.pitr_id.id > id, + }; + if is_stale_cmd { + notify_stale_req_with_msg( + self.term(), + format!( + "stale observe id {:?}, current id: {:?}", + ty.handle().id, + self.observe().info, + ), + snap_cb, + ); + return; + } + + assert_eq!(self.region_id(), region_id); + let snapshot = match compare_region_epoch( + ®ion_epoch, + self.region(), + false, // check_conf_ver + true, // check_ver + true, // include_region + ) { + Ok(()) => { + // Commit the writebatch for ensuring the following snapshot can get all + // previous writes. 
+ self.flush(); + let (applied_index, _) = self.apply_progress(); + let snap = RegionSnapshot::from_snapshot( + Arc::new(self.tablet().snapshot()), + Arc::new(self.region().clone()), + ); + snap.set_apply_index(applied_index); + snap + } + Err(e) => { + // Return error if epoch not match + snap_cb.report_error(cmd_resp::new_error(e)); + return; + } + }; + + let observe = self.observe_mut(); + match ty { + ObserverType::Cdc(id) => { + observe.info.cdc_id = id; + } + ObserverType::Rts(id) => { + observe.info.rts_id = id; + } + ObserverType::Pitr(id) => { + observe.info.pitr_id = id; + } + } + let level = observe.info.observe_level(); + observe.level = level; + info!(self.logger, "capture update observe level"; "level" => ?level); + snap_cb.set_result((RaftCmdResponse::default(), Some(Box::new(snapshot)))); + } + + pub fn observe_apply( + &mut self, + index: u64, + term: u64, + req: RaftCmdRequest, + resp: &RaftCmdResponse, + ) { + if self.observe().level == ObserveLevel::None { + return; + } + + let cmd = Cmd::new(index, term, req, resp.clone()); + self.observe_mut().cmds.push(cmd); + } + + pub fn flush_observed_apply(&mut self) { + let level = self.observe().level; + if level == ObserveLevel::None { + return; + } + + let region_id = self.region_id(); + let observe = self.observe_mut(); + let mut cmd_batch = CmdBatch::new(&observe.info, region_id); + cmd_batch.extend(&observe.info, region_id, observe.cmds.drain(..)); + if observe.cmds.capacity() > SHRINK_PENDING_CMD_QUEUE_CAP { + observe.cmds.shrink_to(SHRINK_PENDING_CMD_QUEUE_CAP); + } + self.coprocessor_host() + .on_flush_applied_cmd_batch(level, vec![cmd_batch], self.tablet()); + } +} + +#[cfg(test)] +mod test { + use std::sync::{ + mpsc::{channel, Receiver, Sender}, + Arc, Mutex, + }; + + use engine_test::{ + ctor::{CfOptions, DbOptions}, + kv::{KvTestEngine, TestTabletFactory}, + }; + use engine_traits::{ + FlushState, Peekable, TabletContext, TabletRegistry, CF_DEFAULT, DATA_CFS, + }; + use 
futures::executor::block_on; + use kvproto::{ + metapb::{Region, RegionEpoch}, + raft_cmdpb::RaftRequestHeader, + raft_serverpb::{PeerState, RegionLocalState}, + }; + use raft::{ + prelude::{Entry, EntryType}, + StateRole, + }; + use raftstore::{ + coprocessor::{BoxCmdObserver, CmdObserver, CoprocessorHost}, + store::Config, + }; + use slog::o; + use tempfile::TempDir; + use tikv_util::{store::new_peer, worker::dummy_scheduler}; + + use super::*; + use crate::{ + fsm::ApplyResReporter, + operation::{ + test_util::create_tmp_importer, CatchUpLogs, CommittedEntries, SimpleWriteReqEncoder, + }, + raft::Apply, + router::{build_any_channel, ApplyRes}, + SimpleWriteEncoder, + }; + + struct MockReporter { + sender: Sender, + } + + impl MockReporter { + fn new() -> (Self, Receiver) { + let (tx, rx) = channel(); + (MockReporter { sender: tx }, rx) + } + } + + impl ApplyResReporter for MockReporter { + fn report(&self, apply_res: ApplyRes) { + let _ = self.sender.send(apply_res); + } + + fn redirect_catch_up_logs(&self, _c: CatchUpLogs) {} + } + + #[derive(Clone)] + struct TestObserver { + sender: Sender>, + } + + impl TestObserver { + fn new() -> (Self, Receiver>) { + let (tx, rx) = channel(); + (TestObserver { sender: tx }, rx) + } + } + + impl raftstore::coprocessor::Coprocessor for TestObserver {} + impl CmdObserver for TestObserver { + fn on_flush_applied_cmd_batch( + &self, + _max_level: ObserveLevel, + cmd_batches: &mut Vec, + _engine: &E, + ) { + self.sender.send(cmd_batches.clone()).unwrap(); + } + + fn on_applied_current_term(&self, _: StateRole, _: &Region) {} + } + + fn new_put_entry( + region_id: u64, + region_epoch: RegionEpoch, + k: &[u8], + v: &[u8], + term: u64, + index: u64, + ) -> Entry { + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.put(CF_DEFAULT, k, v); + let mut header = Box::::default(); + header.set_region_id(region_id); + header.set_region_epoch(region_epoch); + let req_encoder = SimpleWriteReqEncoder::new(header, 
encoder.encode(), 512, false); + let (bin, _) = req_encoder.encode(); + let mut e = Entry::default(); + e.set_entry_type(EntryType::EntryNormal); + e.set_term(term); + e.set_index(index); + e.set_data(bin.into()); + e + } + + #[test] + fn test_capture_apply() { + let store_id = 2; + + let mut region = Region::default(); + region.set_id(1); + region.set_end_key(b"k20".to_vec()); + region.mut_region_epoch().set_version(3); + let peers = vec![new_peer(2, 3)]; + region.set_peers(peers.into()); + + let logger = slog_global::borrow_global().new(o!()); + let path = TempDir::new().unwrap(); + let cf_opts = DATA_CFS + .iter() + .copied() + .map(|cf| (cf, CfOptions::default())) + .collect(); + let factory = Box::new(TestTabletFactory::new(DbOptions::default(), cf_opts)); + let reg = TabletRegistry::new(factory, path.path()).unwrap(); + let ctx = TabletContext::new(®ion, Some(5)); + reg.load(ctx, true).unwrap(); + + let mut region_state = RegionLocalState::default(); + region_state.set_state(PeerState::Normal); + region_state.set_region(region.clone()); + region_state.set_tablet_index(5); + + let (read_scheduler, _rx) = dummy_scheduler(); + let (reporter, _) = MockReporter::new(); + let (_tmp_dir, importer) = create_tmp_importer(); + let (ob, cmds_rx) = TestObserver::new(); + let mut host = CoprocessorHost::::default(); + host.registry + .register_cmd_observer(0, BoxCmdObserver::new(ob)); + let mut apply = Apply::new( + &Config::default(), + region + .get_peers() + .iter() + .find(|p| p.store_id == store_id) + .unwrap() + .clone(), + region_state, + reporter, + reg, + read_scheduler, + Arc::new(FlushState::new(5)), + None, + 5, + None, + importer, + host, + logger.clone(), + ); + + let snap = Arc::new(Mutex::new(None)); + let snap_ = snap.clone(); + let (snap_cb, _) = build_any_channel(Box::new(move |args| { + let snap = args.1.take().unwrap(); + let snapshot: RegionSnapshot = match snap.downcast() { + Ok(s) => *s, + Err(t) => unreachable!("snapshot type should be the same: 
{:?}", t), + }; + *snap_.lock().unwrap() = Some(snapshot); + })); + + // put (k1, v1); + // capture_apply; + // put (k2, v2); + let apply_tasks = vec![ + ApplyTask::CommittedEntries(CommittedEntries { + entry_and_proposals: vec![( + new_put_entry( + region.id, + region.get_region_epoch().clone(), + b"k1", + b"v1", + 5, + 6, + ), + vec![], + )], + }), + ApplyTask::CaptureApply(CaptureChange { + observer: ChangeObserver::from_cdc(region.id, ObserveHandle::new()), + region_epoch: region.get_region_epoch().clone(), + snap_cb, + }), + ApplyTask::CommittedEntries(CommittedEntries { + entry_and_proposals: vec![( + new_put_entry( + region.id, + region.get_region_epoch().clone(), + b"k2", + b"v2", + 5, + 7, + ), + vec![], + )], + }), + ]; + + for task in apply_tasks { + match task { + ApplyTask::CommittedEntries(ce) => { + block_on(async { apply.apply_committed_entries(ce).await }); + } + ApplyTask::CaptureApply(capture_change) => { + apply.on_capture_apply(capture_change); + } + _ => unreachable!(), + } + } + apply.flush(); + + // must read (k1, v1) from snapshot and capture (k2, v2) + let snap = snap.lock().unwrap().take().unwrap(); + let v1 = snap.get_value_cf(CF_DEFAULT, b"k1").unwrap().unwrap(); + assert_eq!(v1, b"v1"); + let v2 = snap.get_value_cf(CF_DEFAULT, b"k2").unwrap(); + assert!(v2.is_none()); + + let cmds = cmds_rx.try_recv().unwrap(); + assert_eq!(cmds[0].len(), 1); + let put2 = &cmds[0].cmds[0]; + assert_eq!(put2.term, 5); + assert_eq!(put2.index, 7); + let request = &put2.request.requests[0]; + assert_eq!(request.get_put().get_cf(), CF_DEFAULT); + assert_eq!(request.get_put().get_key(), b"k2"); + assert_eq!(request.get_put().get_value(), b"v2"); + let response = &put2.response; + assert!(!response.get_header().has_error()); + } +} diff --git a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs new file mode 100644 index 00000000000..3185f1bd24b --- /dev/null +++ 
b/components/raftstore-v2/src/operation/query/lease.rs @@ -0,0 +1,213 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::Mutex; + +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::raft_cmdpb::RaftCmdRequest; +use raftstore::store::{ + can_amend_read, fsm::apply::notify_stale_req, metrics::RAFT_READ_INDEX_PENDING_COUNT, + msg::ReadCallback, propose_read_index, should_renew_lease, util::LeaseState, ReadDelegate, + ReadIndexRequest, ReadProgress, Transport, +}; +use slog::debug; +use tikv_util::time::monotonic_raw_now; +use time::Timespec; +use tracker::GLOBAL_TRACKERS; + +use crate::{ + batch::StoreContext, + fsm::StoreMeta, + raft::Peer, + router::{QueryResChannel, QueryResult, ReadResponse}, +}; + +impl Peer { + pub(crate) fn read_index_leader( + &mut self, + ctx: &mut StoreContext, + mut req: RaftCmdRequest, + ch: QueryResChannel, + ) { + let now = monotonic_raw_now(); + let lease_state = self.inspect_lease(); + if can_amend_read::( + self.pending_reads().back(), + &req, + lease_state, + ctx.cfg.raft_store_max_leader_lease(), + now, + ) { + // Must use the commit index of `PeerStorage` instead of the commit index + // in raft-rs which may be greater than the former one. + // For more details, see the annotations above `on_leader_commit_idx_changed`. + let commit_index = self.storage().entry_storage().commit_index(); + if let Some(read) = self.pending_reads_mut().back_mut() { + // A read request proposed in the current lease is found; combine the new + // read request to that previous one, so that no proposing needed. + read.push_command(req, ch, commit_index); + return; + } + } + + ctx.raft_metrics.propose.read_index.inc(); + + let request = req + .mut_requests() + .get_mut(0) + .filter(|req| req.has_read_index()) + .map(|req| req.take_read_index()); + let (id, dropped) = propose_read_index(self.raft_group_mut(), request.as_ref(), None); + if dropped { + // The message gets dropped silently, can't be handled anymore. 
+ notify_stale_req(self.term(), ch); + ctx.raft_metrics.propose.dropped_read_index.inc(); + return; + } + + let mut read = ReadIndexRequest::with_command(id, req, ch, now); + read.addition_request = request.map(Box::new); + self.pending_reads_mut().push_back(read, true); + debug!( + self.logger, + "request to get a read index"; + "request_id" => ?id, + ); + + self.set_has_ready(); + // TimeoutNow has been sent out, so we need to propose explicitly to + // update leader lease. + // TODO:add following when propose is done + // if self.leader_lease.is_suspect() { + // let req = RaftCmdRequest::default(); + // if let Ok(Either::Left(index)) = self.propose_normal(ctx, req) { + // let (callback, _) = CmdResChannel::pair(); + // let p = Proposal { + // is_conf_change: false, + // index, + // term: self.term(), + // cb: callback, + // propose_time: Some(now), + // must_pass_epoch_check: false, + // }; + // + // self.post_propose(ctx, p); + // } + // } + } + + /// response the read index request + /// + /// awake the read tasks waiting in frontend (such as unified thread pool) + /// In v1, it's named as response_read. + pub(crate) fn respond_read_index( + &self, + read_index_req: &mut ReadIndexRequest, + ) { + debug!( + self.logger, + "handle reads with a read index"; + "request_id" => ?read_index_req.id, + ); + RAFT_READ_INDEX_PENDING_COUNT.sub(read_index_req.cmds().len() as i64); + let time = monotonic_raw_now(); + for (_, ch, mut read_index) in read_index_req.take_cmds().drain(..) { + ch.read_tracker().map(|tracker| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { + t.metrics.read_index_confirm_wait_nanos = (time - read_index_req.propose_time) + .to_std() + .unwrap() + .as_nanos() + as u64; + }) + }); + + // Key lock should not happen when read_index is running at the leader. + // Because it only happens when concurrent read and write requests on the same + // region on different TiKVs. 
+ assert!(read_index_req.locked.is_none()); + match (read_index, read_index_req.read_index) { + (Some(local_responsed_index), Some(batch_index)) => { + // `read_index` could be less than `read_index_req.read_index` because the + // former is filled with `committed index` when + // proposed, and the latter is filled + // after a read-index procedure finished. + read_index = Some(std::cmp::max(local_responsed_index, batch_index)); + } + (None, _) => { + // Actually, the read_index is none if and only if it's the first one in + // read_index_req.cmds. Starting from the second, all the following ones' + // read_index is not none. + read_index = read_index_req.read_index; + } + _ => {} + } + let read_resp = ReadResponse::new(read_index.unwrap_or(0)); + ch.set_result(QueryResult::Read(read_resp)); + } + } + + /// Try to renew leader lease. + pub(crate) fn maybe_renew_leader_lease( + &mut self, + ts: Timespec, + store_meta: &Mutex>, + progress: Option, + ) { + // A nonleader peer should never has leader lease. 
+ let read_progress = if !should_renew_lease( + self.is_leader(), + self.proposal_control().is_splitting(), + self.proposal_control().is_merging(), + self.has_force_leader(), + ) { + None + } else { + self.leader_lease_mut().renew(ts); + let term = self.term(); + self.leader_lease_mut() + .maybe_new_remote_lease(term) + .map(ReadProgress::leader_lease) + }; + if let Some(progress) = progress { + let mut meta = store_meta.lock().unwrap(); + let reader = &mut meta.readers.get_mut(&self.region_id()).unwrap().0; + self.maybe_update_read_progress(reader, progress); + } + if let Some(progress) = read_progress { + let mut meta = store_meta.lock().unwrap(); + let reader = &mut meta.readers.get_mut(&self.region_id()).unwrap().0; + self.maybe_update_read_progress(reader, progress); + } + } + + pub(crate) fn maybe_update_read_progress( + &self, + reader: &mut ReadDelegate, + progress: ReadProgress, + ) { + debug!( + self.logger, + "update read progress"; + "progress" => ?progress, + ); + reader.update(progress); + } + + pub(crate) fn inspect_lease(&mut self) -> LeaseState { + if !self.raft_group().raft.in_lease() { + return LeaseState::Suspect; + } + // None means now. + let state = self.leader_lease().inspect(None); + if LeaseState::Expired == state { + debug!( + self.logger, + "leader lease is expired, lease {:?}", + self.leader_lease(), + ); + // The lease is expired, call `expire` explicitly. + self.leader_lease_mut().expire(); + } + state + } +} diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs new file mode 100644 index 00000000000..f574571f790 --- /dev/null +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -0,0 +1,1021 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +// #[PerformanceCriticalPath] +use std::{ + num::NonZeroU64, + ops::Deref, + sync::{atomic, Arc, Mutex}, +}; + +use batch_system::Router; +use crossbeam::channel::TrySendError; +use engine_traits::{KvEngine, RaftEngine}; +use futures::Future; +use kvproto::{ + errorpb, + raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse}, +}; +use raftstore::{ + errors::RAFTSTORE_IS_BUSY, + store::{ + cmd_resp, + util::LeaseState, + worker_metrics::{self, TLS_LOCAL_READ_METRICS}, + LocalReaderCore, ReadDelegate, ReadExecutorProvider, RegionSnapshot, + }, + Result, +}; +use slog::{debug, Logger}; +use tikv_util::{box_err, codec::number::decode_u64, time::monotonic_raw_now, Either}; +use time::Timespec; +use txn_types::WriteBatchFlags; + +use crate::{ + fsm::StoreMeta, + router::{PeerMsg, QueryResult}, + StoreRouter, +}; + +pub trait MsgRouter: Clone + Send { + fn send(&self, addr: u64, msg: PeerMsg) -> std::result::Result<(), TrySendError>; +} + +impl MsgRouter for StoreRouter +where + EK: KvEngine, + ER: RaftEngine, +{ + fn send(&self, addr: u64, msg: PeerMsg) -> std::result::Result<(), TrySendError> { + Router::send(self, addr, msg) + } +} + +pub type ReadDelegatePair = (ReadDelegate, SharedReadTablet); + +/// A share struct for local reader. +/// +/// Though it looks like `CachedTablet`, but there are subtle differences. +/// 1. `CachedTablet` always hold the latest version of the tablet. But +/// `SharedReadTablet` should only hold the tablet that matches epoch. So it +/// will be updated only when the epoch is updated. +/// 2. `SharedReadTablet` should always hold a tablet and the same tablet. If +/// tablet is taken, then it should be considered as stale and should check +/// again epoch to load the new `SharedReadTablet`. +/// 3. `SharedReadTablet` may be cloned into thread local. So its cache should +/// be released as soon as possible, so there should be no strong reference +/// that prevents tablet from being dropped after it's marked as stale by other +/// threads. 
+pub struct SharedReadTablet { + tablet: Arc>>, + cache: Option, + source: bool, +} + +impl SharedReadTablet { + pub fn new(tablet: EK) -> Self { + Self { + tablet: Arc::new(Mutex::new(Some(tablet))), + cache: None, + source: true, + } + } + + /// Should call `fill_cache` first. + pub fn cache(&self) -> &EK { + self.cache.as_ref().unwrap() + } + + pub fn fill_cache(&mut self) -> bool + where + EK: Clone, + { + self.cache = self.tablet.lock().unwrap().clone(); + self.cache.is_some() + } + + pub fn release(&mut self) { + self.cache = None; + } +} + +impl Clone for SharedReadTablet { + fn clone(&self) -> Self { + Self { + tablet: Arc::clone(&self.tablet), + cache: None, + source: false, + } + } +} + +impl Drop for SharedReadTablet { + fn drop(&mut self) { + if self.source { + self.tablet.lock().unwrap().take(); + } + } +} + +enum ReadResult { + Ok(T), + Redirect, + RetryForStaleDelegate, + Err(E), +} + +fn fail_resp(msg: String) -> RaftCmdResponse { + let mut err = errorpb::Error::default(); + err.set_message(msg); + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(err); + resp +} + +#[derive(Clone)] +pub struct LocalReader +where + E: KvEngine, + C: MsgRouter, +{ + local_reader: LocalReaderCore, StoreMetaDelegate>, + router: C, + + logger: Logger, +} + +impl LocalReader +where + E: KvEngine, + C: MsgRouter, +{ + pub fn new(store_meta: Arc>>, router: C, logger: Logger) -> Self { + Self { + local_reader: LocalReaderCore::new(StoreMetaDelegate::new(store_meta)), + router, + logger, + } + } + + pub fn store_meta(&self) -> &Arc>> { + &self.local_reader.store_meta().store_meta + } + + fn pre_propose_raft_command( + &mut self, + req: &RaftCmdRequest, + ) -> ReadResult<(CachedReadDelegate, ReadRequestPolicy)> { + let mut delegate = match self.local_reader.validate_request(req) { + Ok(Some(delegate)) => delegate, + Ok(None) => return ReadResult::Redirect, + Err(e) => return ReadResult::Err(e), + }; + + if !delegate.cached_tablet.fill_cache() { + return 
ReadResult::RetryForStaleDelegate; + } + let mut inspector = SnapRequestInspector { + delegate: &delegate, + logger: &self.logger, + }; + match inspector.inspect(req) { + Ok(ReadRequestPolicy::ReadLocal) => { + ReadResult::Ok((delegate, ReadRequestPolicy::ReadLocal)) + } + Ok(ReadRequestPolicy::StaleRead) => { + ReadResult::Ok((delegate, ReadRequestPolicy::StaleRead)) + } + // It can not handle other policies. + // TODO: we should only abort when lease expires. For other cases we should retry + // infinitely. + Ok(ReadRequestPolicy::ReadIndex) => ReadResult::Redirect, + Err(e) => ReadResult::Err(e), + } + } + + fn try_get_snapshot( + &mut self, + req: &RaftCmdRequest, + ) -> ReadResult, RaftCmdResponse> { + match self.pre_propose_raft_command(req) { + ReadResult::Ok((mut delegate, policy)) => { + let mut snap = match policy { + ReadRequestPolicy::ReadLocal => { + let region = Arc::clone(&delegate.region); + let snap = RegionSnapshot::from_snapshot( + Arc::new(delegate.cached_tablet.cache().snapshot()), + region, + ); + // Ensures the snapshot is acquired before getting the time + atomic::fence(atomic::Ordering::Release); + let snapshot_ts = monotonic_raw_now(); + + if !delegate.is_in_leader_lease(snapshot_ts) { + return ReadResult::Redirect; + } + + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().local_executed_requests.inc()); + + // Try renew lease in advance + self.maybe_renew_lease_in_advance(&delegate, req, snapshot_ts); + snap + } + ReadRequestPolicy::StaleRead => { + let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); + if let Err(e) = delegate.check_stale_read_safe(read_ts) { + return ReadResult::Err(e); + } + + let region = Arc::clone(&delegate.region); + let snap = RegionSnapshot::from_snapshot( + Arc::new(delegate.cached_tablet.cache().snapshot()), + region, + ); + + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().local_executed_requests.inc()); + + if let Err(e) = delegate.check_stale_read_safe(read_ts) { + return 
ReadResult::Err(e); + } + + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().local_executed_stale_read_requests.inc()); + snap + } + _ => unreachable!(), + }; + + snap.txn_ext = Some(delegate.txn_ext.clone()); + snap.term = NonZeroU64::new(delegate.term); + snap.txn_extra_op = delegate.txn_extra_op.load(); + snap.bucket_meta = delegate.bucket_meta.clone(); + + delegate.cached_tablet.release(); + + ReadResult::Ok(snap) + } + ReadResult::Err(e) => { + let mut response = cmd_resp::new_error(e); + if let Some(delegate) = self + .local_reader + .delegates + .get(&req.get_header().get_region_id()) + { + cmd_resp::bind_term(&mut response, delegate.term); + } + ReadResult::Err(response) + } + ReadResult::Redirect => ReadResult::Redirect, + ReadResult::RetryForStaleDelegate => ReadResult::RetryForStaleDelegate, + } + } + + pub fn snapshot( + &mut self, + mut req: RaftCmdRequest, + ) -> impl Future, RaftCmdResponse>> + Send + { + let region_id = req.header.get_ref().region_id; + let mut tried_cnt = 0; + let res = loop { + let res = self.try_get_snapshot(&req); + match res { + ReadResult::Ok(snap) => break Either::Left(Ok(snap)), + ReadResult::Err(e) => break Either::Left(Err(e)), + ReadResult::Redirect => { + break Either::Right((self.try_to_renew_lease(region_id, &req), self.clone())); + } + ReadResult::RetryForStaleDelegate => { + tried_cnt += 1; + if tried_cnt < 10 { + continue; + } + break Either::Left(Err(fail_resp(format!( + "internal error: failed to get valid dalegate for {}", + region_id + )))); + } + } + }; + + worker_metrics::maybe_tls_local_read_metrics_flush(); + + async move { + let (mut fut, mut reader) = match res { + Either::Left(Ok(snap)) => return Ok(snap), + Either::Left(Err(e)) => return Err(e), + Either::Right((fut, reader)) => (fut, reader), + }; + + let mut tried_cnt = 0; + loop { + match fut.await? 
{ + Some(query_res) => { + if query_res.read().is_none() { + let QueryResult::Response(res) = query_res else { unreachable!() }; + assert!(res.get_header().has_error(), "{:?}", res); + return Err(res); + } + } + None => { + return Err(fail_resp(format!( + "internal error: failed to extend lease: canceled: {}", + region_id + ))); + } + } + + // If query successful, try again. + req.mut_header().set_read_quorum(false); + loop { + let r = reader.try_get_snapshot(&req); + match r { + ReadResult::Ok(snap) => return Ok(snap), + ReadResult::Err(e) => return Err(e), + ReadResult::Redirect => { + tried_cnt += 1; + if tried_cnt < 10 { + fut = reader.try_to_renew_lease(region_id, &req); + break; + } + return Err(fail_resp(format!( + "internal error: can't handle msg in local reader for {}", + region_id + ))); + } + ReadResult::RetryForStaleDelegate => { + tried_cnt += 1; + if tried_cnt < 10 { + continue; + } + return Err(fail_resp(format!( + "internal error: failed to get valid dalegate for {}", + region_id + ))); + } + } + } + } + } + } + + // try to renew the lease by sending read query where the reading process may + // renew the lease + fn try_to_renew_lease( + &self, + region_id: u64, + req: &RaftCmdRequest, + ) -> impl Future, RaftCmdResponse>> { + let mut req = req.clone(); + // Remote lease is updated step by step. It's possible local reader expires + // while the raftstore doesn't. So we need to trigger an update + // explicitly. TODO: find a way to reduce the triggered heartbeats. 
+ req.mut_header().set_read_quorum(true); + let (msg, sub) = PeerMsg::raft_query(req); + let res = match MsgRouter::send(&self.router, region_id, msg) { + Ok(()) => Ok(sub), + Err(TrySendError::Full(_)) => { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.channel_full.inc()); + let mut err = errorpb::Error::default(); + err.set_message(RAFTSTORE_IS_BUSY.to_owned()); + err.mut_server_is_busy() + .set_reason(RAFTSTORE_IS_BUSY.to_owned()); + Err(err) + } + Err(TrySendError::Disconnected(_)) => { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_region.inc()); + let mut err = errorpb::Error::default(); + err.set_message(format!("region {} is missing", region_id)); + err.mut_region_not_found().set_region_id(region_id); + Err(err) + } + }; + + async move { + match res { + Ok(sub) => Ok(sub.result().await), + Err(e) => { + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(e); + Err(resp) + } + } + } + } + + // If the remote lease will be expired in near future send message + // to `raftstore` to renew it + fn maybe_renew_lease_in_advance( + &self, + delegate: &ReadDelegate, + req: &RaftCmdRequest, + ts: Timespec, + ) { + if !delegate.need_renew_lease(ts) { + return; + } + + let region_id = req.header.get_ref().region_id; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().renew_lease_advance.inc()); + // Send a read query which may renew the lease + let msg = PeerMsg::raft_query(req.clone()).0; + if let Err(e) = MsgRouter::send(&self.router, region_id, msg) { + debug!( + self.logger, + "failed to send query for trying to renew lease"; + "region" => region_id, + "error" => ?e + ) + } + } +} + +/// CachedReadDelegate is a wrapper the ReadDelegate and CachedTablet. +/// CachedTablet can fetch the latest tablet of this ReadDelegate's region. The +/// main purpose of this wrapping is to implement ReadExecutor where the latest +/// tablet is needed. 
+pub struct CachedReadDelegate +where + E: KvEngine, +{ + // The reason for this to be Arc, see the comment on get_delegate in + // raftstore/src/store/worker/read.rs + delegate: Arc, + cached_tablet: SharedReadTablet, +} + +impl Deref for CachedReadDelegate +where + E: KvEngine, +{ + type Target = ReadDelegate; + + fn deref(&self) -> &Self::Target { + self.delegate.as_ref() + } +} + +impl Clone for CachedReadDelegate +where + E: KvEngine, +{ + fn clone(&self) -> Self { + CachedReadDelegate { + delegate: Arc::clone(&self.delegate), + cached_tablet: self.cached_tablet.clone(), + } + } +} + +#[derive(Clone)] +struct StoreMetaDelegate +where + E: KvEngine, +{ + store_meta: Arc>>, +} + +impl StoreMetaDelegate +where + E: KvEngine, +{ + pub fn new(store_meta: Arc>>) -> StoreMetaDelegate { + StoreMetaDelegate { store_meta } + } +} + +impl ReadExecutorProvider for StoreMetaDelegate +where + E: KvEngine, +{ + type Executor = CachedReadDelegate; + type StoreMeta = Arc>>; + + fn store_id(&self) -> Option { + Some(self.store_meta.as_ref().lock().unwrap().store_id) + } + + /// get the ReadDelegate with region_id and the number of delegates in the + /// StoreMeta + fn get_executor_and_len(&self, region_id: u64) -> (usize, Option) { + let meta = self.store_meta.as_ref().lock().unwrap(); + let reader = meta.readers.get(®ion_id).cloned(); + if let Some((reader, read_tablet)) = reader { + // If reader is not None, cache must not be None. 
+ return ( + meta.readers.len(), + Some(CachedReadDelegate { + delegate: Arc::new(reader), + cached_tablet: read_tablet, + }), + ); + } + (meta.readers.len(), None) + } +} + +enum ReadRequestPolicy { + StaleRead, + ReadLocal, + ReadIndex, +} + +struct SnapRequestInspector<'r> { + delegate: &'r ReadDelegate, + logger: &'r Logger, +} + +impl<'r> SnapRequestInspector<'r> { + fn inspect(&mut self, req: &RaftCmdRequest) -> Result { + assert!(!req.has_admin_request()); + if req.get_requests().len() != 1 + || req.get_requests().first().unwrap().get_cmd_type() != CmdType::Snap + { + return Err(box_err!( + "LocalReader can only serve for exactly one Snap request" + )); + } + + let flags = WriteBatchFlags::from_bits_check(req.get_header().get_flags()); + if flags.contains(WriteBatchFlags::STALE_READ) { + return Ok(ReadRequestPolicy::StaleRead); + } + + if req.get_header().get_read_quorum() { + return Ok(ReadRequestPolicy::ReadIndex); + } + + // If applied index's term differs from current raft's term, leader transfer + // must happened, if read locally, we may read old value. + if !self.has_applied_to_current_term() { + return Ok(ReadRequestPolicy::ReadIndex); + } + + // Local read should be performed, if and only if leader is in lease. + // None for now. + match self.inspect_lease() { + LeaseState::Valid => Ok(ReadRequestPolicy::ReadLocal), + LeaseState::Expired | LeaseState::Suspect => { + // Perform a consistent read to Raft quorum and try to renew the leader lease. + Ok(ReadRequestPolicy::ReadIndex) + } + } + } + + fn has_applied_to_current_term(&mut self) -> bool { + if self.delegate.applied_term == self.delegate.term { + true + } else { + debug!( + self.logger, + "rejected by term check"; + "tag" => &self.delegate.tag, + "applied_term" => self.delegate.applied_term, + "delegate_term" => ?self.delegate.term, + ); + + // only for metric. 
+ TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.applied_term.inc()); + false + } + } + + fn inspect_lease(&mut self) -> LeaseState { + // TODO: disable localreader if we did not enable raft's check_quorum. + if self.delegate.leader_lease.is_some() { + // We skip lease check, because it is postponed until `handle_read`. + LeaseState::Valid + } else { + debug!(self.logger, "rejected by leader lease"; "tag" => &self.delegate.tag); + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_lease.inc()); + LeaseState::Expired + } + } +} + +#[cfg(test)] +mod tests { + use std::{ + cell::Cell, + sync::mpsc::*, + thread::{self, JoinHandle}, + }; + + use collections::HashSet; + use crossbeam::{atomic::AtomicCell, channel::TrySendError}; + use engine_test::{ + ctor::{CfOptions, DbOptions}, + kv::{KvTestEngine, TestTabletFactory}, + }; + use engine_traits::{MiscExt, SyncMutable, TabletContext, TabletRegistry, DATA_CFS}; + use futures::executor::block_on; + use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, raft_cmdpb::*}; + use pd_client::BucketMeta; + use raftstore::store::{ + util::Lease, worker_metrics::TLS_LOCAL_READ_METRICS, ReadCallback, ReadProgress, + RegionReadProgress, TrackVer, TxnExt, + }; + use slog::o; + use tempfile::Builder; + use tikv_util::{codec::number::NumberEncoder, time::monotonic_raw_now}; + use time::Duration; + use txn_types::WriteBatchFlags; + + use super::*; + use crate::router::{QueryResult, ReadResponse}; + + #[derive(Clone)] + struct MockRouter { + p_router: SyncSender<(u64, PeerMsg)>, + addresses: Arc>>, + } + + impl MockRouter { + fn new(addresses: Arc>>) -> (MockRouter, Receiver<(u64, PeerMsg)>) { + let (p_ch, p_rx) = sync_channel(1); + ( + MockRouter { + p_router: p_ch, + addresses, + }, + p_rx, + ) + } + } + + impl MsgRouter for MockRouter { + fn send(&self, addr: u64, cmd: PeerMsg) -> std::result::Result<(), TrySendError> { + if !self.addresses.lock().unwrap().contains(&addr) { + return 
Err(TrySendError::Disconnected(cmd)); + } + self.p_router.send((addr, cmd)).unwrap(); + Ok(()) + } + } + + #[allow(clippy::type_complexity)] + fn new_reader( + store_id: u64, + store_meta: Arc>>, + addresses: Arc>>, + ) -> ( + LocalReader, + Receiver<(u64, PeerMsg)>, + ) { + let (ch, rx) = MockRouter::new(addresses); + let mut reader = LocalReader::new( + store_meta, + ch, + Logger::root(slog::Discard, o!("key1" => "value1")), + ); + reader.local_reader.store_id = Cell::new(Some(store_id)); + (reader, rx) + } + + fn new_peers(store_id: u64, pr_ids: Vec) -> Vec { + pr_ids + .into_iter() + .map(|id| { + let mut pr = metapb::Peer::default(); + pr.set_store_id(store_id); + pr.set_id(id); + pr + }) + .collect() + } + + // It mocks that local reader communications with raftstore. + // mix_rx receives a closure, msg receiver, and sender of the msg receiver + // - closure: do some update such as renew lease or something which we could do + // in real raftstore + // - msg receiver: receives the msg from local reader + // - sender of the msg receiver: send the msg receiver out of the thread so that + // we can use it again. 
+ fn mock_raftstore( + mix_rx: Receiver<( + Box, + Receiver<(u64, PeerMsg)>, + SyncSender>, + )>, + ) -> JoinHandle<()> { + thread::spawn(move || { + while let Ok((f, rx, ch_tx)) = mix_rx.recv() { + // Receives msg from local reader + let (_, msg) = rx.recv().unwrap(); + f(); + + match msg { + // send the result back to local reader + PeerMsg::RaftQuery(query) => { + assert!(query.request.get_header().get_read_quorum()); + ReadCallback::set_result( + query.ch, + QueryResult::Read(ReadResponse { + read_index: 0, + txn_extra_op: Default::default(), + }), + ) + } + _ => unreachable!(), + } + ch_tx.send(rx).unwrap(); + } + }) + } + + #[test] + fn test_read() { + let store_id = 1; + + // Building a tablet factory + let ops = DbOptions::default(); + let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let path = Builder::new() + .prefix("test-local-reader") + .tempdir() + .unwrap(); + let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); + let reg = TabletRegistry::new(factory, path.path()).unwrap(); + + let store_meta = Arc::new(Mutex::new(StoreMeta::new(store_id))); + let addresses: Arc>> = Arc::default(); + let (mut reader, mut rx) = new_reader(store_id, store_meta.clone(), addresses.clone()); + let (mix_tx, mix_rx) = sync_channel(1); + let handler = mock_raftstore(mix_rx); + + let mut region1 = metapb::Region::default(); + region1.set_id(1); + let prs = new_peers(store_id, vec![1, 2, 3]); + region1.set_peers(prs.clone().into()); + let epoch13 = { + let mut ep = metapb::RegionEpoch::default(); + ep.set_conf_ver(1); + ep.set_version(3); + ep + }; + let leader2 = prs[0].clone(); + region1.set_region_epoch(epoch13.clone()); + let term6 = 6; + let mut lease = Lease::new(Duration::seconds(10), Duration::milliseconds(2500)); + let read_progress = Arc::new(RegionReadProgress::new(®ion1, 1, 1, 1)); + + let mut cmd = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_region_id(1); + header.set_peer(leader2); 
+ header.set_region_epoch(epoch13); + header.set_term(term6); + cmd.set_header(header); + let mut req = Request::default(); + req.set_cmd_type(CmdType::Snap); + cmd.set_requests(vec![req].into()); + + // The region is not register yet. + let res = block_on(reader.snapshot(cmd.clone())).unwrap_err(); + assert!( + res.header + .as_ref() + .unwrap() + .get_error() + .has_region_not_found() + ); + // No msg will ben sent + rx.try_recv().unwrap_err(); + // It will be rejected first when processing local, and then rejected when + // trying to forward to raftstore. + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.no_region.get()), + 2 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 1 + ); + assert!(reader.local_reader.delegates.get(&1).is_none()); + + // Register region 1 + lease.renew(monotonic_raw_now()); + let remote = lease.maybe_new_remote_lease(term6).unwrap(); + let txn_ext = Arc::new(TxnExt::default()); + let bucket_meta = Arc::new(BucketMeta::default()); + { + let mut meta = store_meta.as_ref().lock().unwrap(); + + // Create read_delegate with region id 1 + let read_delegate = ReadDelegate { + tag: String::new(), + region: Arc::new(region1.clone()), + peer_id: 1, + term: term6, + applied_term: term6 - 1, + leader_lease: Some(remote), + last_valid_ts: Timespec::new(0, 0), + txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::default())), + txn_ext: txn_ext.clone(), + read_progress: read_progress.clone(), + pending_remove: false, + wait_data: false, + track_ver: TrackVer::new(), + bucket_meta: Some(bucket_meta.clone()), + }; + // create tablet with region_id 1 and prepare some data + let ctx = TabletContext::new(®ion1, Some(10)); + let mut tablet = reg.load(ctx, true).unwrap(); + let shared = SharedReadTablet::new(tablet.latest().unwrap().clone()); + meta.readers.insert(1, (read_delegate, shared)); + } + + let (ch_tx, ch_rx) = sync_channel(1); + + // Case: Applied term not match + let 
store_meta_clone = store_meta.clone(); + // Send what we want to do to mock raftstore + mix_tx + .send(( + Box::new(move || { + let mut meta = store_meta_clone.lock().unwrap(); + meta.readers + .get_mut(&1) + .unwrap() + .0 + .update(ReadProgress::applied_term(term6)); + }), + rx, + ch_tx.clone(), + )) + .unwrap(); + // The first try will be rejected due to unmatched applied term but after update + // the applied term by the above thread, the snapshot will be acquired by + // retrying. + addresses.lock().unwrap().insert(1); + let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); + assert!(Arc::ptr_eq(snap.txn_ext.as_ref().unwrap(), &txn_ext)); + assert!(Arc::ptr_eq( + snap.bucket_meta.as_ref().unwrap(), + &bucket_meta + )); + assert_eq!(*snap.get_region(), region1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 3 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.applied_term.get()), + 1 + ); + rx = ch_rx.recv().unwrap(); + + // Case: Expire lease to make the local reader lease check fail. + lease.expire_remote_lease(); + let remote = lease.maybe_new_remote_lease(term6).unwrap(); + let meta = store_meta.clone(); + // Send what we want to do to mock raftstore + mix_tx + .send(( + Box::new(move || { + let mut meta = meta.lock().unwrap(); + meta.readers + .get_mut(&1) + .unwrap() + .0 + .update(ReadProgress::leader_lease(remote)); + }), + rx, + ch_tx.clone(), + )) + .unwrap(); + block_on(reader.snapshot(cmd.clone())).unwrap(); + // Updating lease makes cache miss. And because the cache is updated on cloned + // copy, so the old cache will still need to be updated again. + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 5 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.lease_expire.get()), + 1 + ); + rx = ch_rx.recv().unwrap(); + + // Case: Tablet miss should triger retry. 
+ { + let ctx = TabletContext::new(®ion1, Some(15)); + let mut tablet = reg.load(ctx, true).unwrap(); + let shared = SharedReadTablet::new(tablet.latest().unwrap().clone()); + let mut meta = store_meta.lock().unwrap(); + meta.readers.get_mut(&1).unwrap().1 = shared; + } + block_on(reader.snapshot(cmd.clone())).unwrap(); + // Tablet miss should trigger reload tablet, so cache miss should increase. + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 6 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.lease_expire.get()), + 1 + ); + + // Case: Read quorum. + let mut cmd_read_quorum = cmd.clone(); + cmd_read_quorum.mut_header().set_read_quorum(true); + mix_tx.send((Box::new(move || {}), rx, ch_tx)).unwrap(); + let _ = block_on(reader.snapshot(cmd_read_quorum.clone())).unwrap(); + ch_rx.recv().unwrap(); + + // Case: Stale read + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.safe_ts.get()), + 0 + ); + read_progress.update_safe_ts(1, 1); + assert_eq!(read_progress.safe_ts(), 1); + let data = { + let mut d = [0u8; 8]; + (&mut d[..]).encode_u64(2).unwrap(); + d + }; + cmd.mut_header() + .set_flags(WriteBatchFlags::STALE_READ.bits()); + cmd.mut_header().set_flag_data(data.into()); + let res = block_on(reader.snapshot(cmd.clone())).unwrap_err(); + assert!(res.get_header().get_error().has_data_is_not_ready()); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.safe_ts.get()), + 1 + ); + read_progress.update_safe_ts(1, 2); + assert_eq!(read_progress.safe_ts(), 2); + let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); + assert_eq!(*snap.get_region(), region1); + assert_eq!(snap.term, NonZeroU64::new(term6)); + + drop(mix_tx); + handler.join().unwrap(); + } + + #[test] + fn test_read_delegate() { + // Building a tablet factory + let ops = DbOptions::default(); + let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let path = 
Builder::new() + .prefix("test-local-reader") + .tempdir() + .unwrap(); + let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); + let reg = TabletRegistry::new(factory, path.path()).unwrap(); + + let store_meta = StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::new(1)))); + + let tablet1; + let tablet2; + { + let mut meta = store_meta.store_meta.as_ref().lock().unwrap(); + + // Create read_delegate with region id 1 + let read_delegate = ReadDelegate::mock(1); + + // create tablet with region_id 1 and prepare some data + let mut ctx = TabletContext::with_infinite_region(1, Some(10)); + reg.load(ctx, true).unwrap(); + tablet1 = reg.get(1).unwrap().latest().unwrap().clone(); + tablet1.put(b"a1", b"val1").unwrap(); + let shared1 = SharedReadTablet::new(tablet1.clone()); + meta.readers.insert(1, (read_delegate, shared1)); + + // Create read_delegate with region id 2 + let read_delegate = ReadDelegate::mock(2); + + // create tablet with region_id 1 and prepare some data + ctx = TabletContext::with_infinite_region(2, Some(10)); + reg.load(ctx, true).unwrap(); + tablet2 = reg.get(2).unwrap().latest().unwrap().clone(); + tablet2.put(b"a2", b"val2").unwrap(); + let shared2 = SharedReadTablet::new(tablet2.clone()); + meta.readers.insert(2, (read_delegate, shared2)); + } + + let (_, delegate) = store_meta.get_executor_and_len(1); + let mut delegate = delegate.unwrap(); + assert!(delegate.cached_tablet.fill_cache()); + let tablet = delegate.cached_tablet.cache(); + assert_eq!(tablet1.path(), tablet.path()); + let path1 = tablet.path().to_owned(); + delegate.cached_tablet.release(); + + let (_, delegate) = store_meta.get_executor_and_len(2); + let mut delegate = delegate.unwrap(); + assert!(delegate.cached_tablet.fill_cache()); + let tablet = delegate.cached_tablet.cache(); + assert_eq!(tablet2.path(), tablet.path()); + + assert!(KvTestEngine::locked(&path1).unwrap()); + drop(tablet1); + drop(reg); + assert!(KvTestEngine::locked(&path1).unwrap()); + 
store_meta.store_meta.lock().unwrap().readers.remove(&1); + assert!(!KvTestEngine::locked(&path1).unwrap()); + } +} diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs new file mode 100644 index 00000000000..81fb4e5e9de --- /dev/null +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -0,0 +1,450 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! There are two types of Query: KV read and status query. +//! +//! KV Read is implemented in local module and lease module. +//! Read will be executed in callee thread if in lease, which is +//! implemented in local module. If lease is expired, it will extend the lease +//! first. Lease maintainance is implemented in lease module. +//! +//! Status query is implemented in the root module directly. +//! Follower's read index and replica read is implemenented replica module. +//! Leader's read index and lease renew is implemented in lease module. + +use std::cmp; + +use crossbeam::channel::TrySendError; +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::{ + errorpb, + raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, StatusCmdType}, +}; +use raft::{Ready, StateRole}; +use raftstore::{ + errors::RAFTSTORE_IS_BUSY, + store::{ + cmd_resp, local_metrics::RaftMetrics, metrics::RAFT_READ_INDEX_PENDING_COUNT, + msg::ErrorCallback, region_meta::RegionMeta, util, util::LeaseState, GroupState, + ReadIndexContext, ReadProgress, RequestPolicy, Transport, + }, + Error, Result, +}; +use slog::{debug, info}; +use tikv_util::{box_err, log::SlogFormat}; +use txn_types::WriteBatchFlags; + +use crate::{ + batch::StoreContext, + fsm::PeerFsmDelegate, + raft::Peer, + router::{ + message::RaftRequest, DebugInfoChannel, PeerMsg, QueryResChannel, QueryResult, ReadResponse, + }, +}; + +mod capture; +mod lease; +mod local; +mod replica; + +pub(crate) use self::local::{LocalReader, ReadDelegatePair, SharedReadTablet}; + +impl<'a, EK: KvEngine, 
ER: RaftEngine, T: raftstore::store::Transport> + PeerFsmDelegate<'a, EK, ER, T> +{ + fn inspect_read(&mut self, req: &RaftCmdRequest) -> Result { + if req.get_header().get_read_quorum() { + return Ok(RequestPolicy::ReadIndex); + } + + // If applied index's term is differ from current raft's term, leader transfer + // must happened, if read locally, we may read old value. + if !self.fsm.peer().applied_to_current_term() { + return Ok(RequestPolicy::ReadIndex); + } + + match self.fsm.peer_mut().inspect_lease() { + LeaseState::Valid => Ok(RequestPolicy::ReadLocal), + LeaseState::Expired | LeaseState::Suspect => { + // Perform a consistent read to Raft quorum and try to renew the leader lease. + Ok(RequestPolicy::ReadIndex) + } + } + } + + #[inline] + pub fn on_query(&mut self, req: RaftCmdRequest, ch: QueryResChannel) { + if !req.has_status_request() { + if let Err(e) = self + .fsm + .peer_mut() + .validate_query_msg(&req, &mut self.store_ctx.raft_metrics) + { + let resp = cmd_resp::new_error(e); + ch.report_error(resp); + return; + } + let policy = self.inspect_read(&req); + match policy { + Ok(RequestPolicy::ReadIndex) => { + self.fsm.peer_mut().read_index(self.store_ctx, req, ch); + } + Ok(RequestPolicy::ReadLocal) => { + self.store_ctx.raft_metrics.propose.local_read.inc(); + let read_resp = ReadResponse::new(0); + ch.set_result(QueryResult::Read(read_resp)); + } + _ => { + panic!("inspect_read is expected to only return ReadIndex or ReadLocal"); + } + }; + } else { + self.fsm.peer_mut().on_query_status(&req, ch); + } + } +} + +impl Peer { + fn validate_query_msg( + &mut self, + msg: &RaftCmdRequest, + raft_metrics: &mut RaftMetrics, + ) -> Result<()> { + // check query specific requirements + if msg.has_admin_request() { + return Err(box_err!("PeerMsg::RaftQuery does not allow admin requests")); + } + + // check query specific requirements + for r in msg.get_requests() { + if r.get_cmd_type() != CmdType::Get + && r.get_cmd_type() != CmdType::Snap + && 
r.get_cmd_type() != CmdType::ReadIndex + { + return Err(box_err!( + "PeerMsg::RaftQuery does not allow write requests: {:?}", + r.get_cmd_type() + )); + } + } + + // Check store_id, make sure that the msg is dispatched to the right place. + if let Err(e) = util::check_store_id(msg.get_header(), self.peer().get_store_id()) { + raft_metrics.invalid_proposal.mismatch_store_id.inc(); + return Err(e); + } + + let flags = WriteBatchFlags::from_bits_check(msg.get_header().get_flags()); + if flags.contains(WriteBatchFlags::STALE_READ) { + return Err(box_err!( + "PeerMsg::RaftQuery should not get stale read requests" + )); + } + + // TODO: add flashback_state check + + // Check whether the store has the right peer to handle the request. + let request = msg.get_requests(); + + // TODO: add force leader + + // ReadIndex can be processed on the replicas. + let is_read_index_request = + request.len() == 1 && request[0].get_cmd_type() == CmdType::ReadIndex; + + let allow_replica_read = msg.get_header().get_replica_read(); + if !self.is_leader() && !is_read_index_request && !allow_replica_read { + raft_metrics.invalid_proposal.not_leader.inc(); + return Err(Error::NotLeader(self.region_id(), self.leader())); + } + + // peer_id must be the same as peer's. + if let Err(e) = util::check_peer_id(msg.get_header(), self.peer_id()) { + raft_metrics.invalid_proposal.mismatch_peer_id.inc(); + return Err(e); + } + + // TODO: check applying snapshot + + // Check whether the term is stale. + if let Err(e) = util::check_term(msg.get_header(), self.term()) { + raft_metrics.invalid_proposal.stale_command.inc(); + return Err(e); + } + + // TODO: add check of sibling region for split + util::check_req_region_epoch(msg, self.region(), true) + } + + // For these cases it won't be proposed: + // 1. The region is in merging or splitting; + // 2. The message is stale and dropped by the Raft group internally; + // 3. 
There is already a read request proposed in the current lease; + fn read_index( + &mut self, + ctx: &mut StoreContext, + req: RaftCmdRequest, + ch: QueryResChannel, + ) { + // TODO: add pre_read_index to handle splitting or merging + if self.is_leader() { + self.read_index_leader(ctx, req, ch); + } else { + self.read_index_follower(ctx, req, ch); + } + } + + pub(crate) fn apply_reads(&mut self, ctx: &mut StoreContext, ready: &Ready) { + let states = ready.read_states().iter().map(|state| { + let read_index_ctx = ReadIndexContext::parse(state.request_ctx.as_slice()).unwrap(); + (read_index_ctx.id, read_index_ctx.locked, state.index) + }); + // The follower may lost `ReadIndexResp`, so the pending_reads does not + // guarantee the orders are consistent with read_states. `advance` will + // update the `read_index` of read request that before this successful + // `ready`. + if !self.is_leader() { + // NOTE: there could still be some pending reads proposed by the peer when it + // was leader. They will be cleared in `clear_uncommitted_on_role_change` later + // in the function. + self.pending_reads_mut().advance_replica_reads(states); + self.post_pending_read_index_on_replica(ctx); + } else { + self.pending_reads_mut().advance_leader_reads(states); + if let Some(propose_time) = self.pending_reads().last_ready().map(|r| r.propose_time) { + if !self.leader_lease_mut().is_suspect() { + self.maybe_renew_leader_lease(propose_time, &ctx.store_meta, None); + } + } + + if self.ready_to_handle_read() { + while let Some(mut read) = self.pending_reads_mut().pop_front() { + self.respond_read_index(&mut read); + } + } + } + + // Note that only after handle read_states can we identify what requests are + // actually stale. + if ready.ss().is_some() { + let term = self.term(); + // all uncommitted reads will be dropped silently in raft. 
+ self.pending_reads_mut() + .clear_uncommitted_on_role_change(term); + } + } + + /// Respond to the ready read index request on the replica, the replica is + /// not a leader. + fn post_pending_read_index_on_replica(&mut self, ctx: &mut StoreContext) { + while let Some(mut read) = self.pending_reads_mut().pop_front() { + // The response of this read index request is lost, but we need it for + // the memory lock checking result. Resend the request. + if let Some(read_index) = read.addition_request.take() { + assert_eq!(read.cmds().len(), 1); + let (mut req, ch, _) = read.take_cmds().pop().unwrap(); + assert_eq!(req.requests.len(), 1); + req.requests[0].set_read_index(*read_index); + let read_cmd = RaftRequest::new(req, ch); + info!( + self.logger, + "re-propose read index request because the response is lost"; + ); + RAFT_READ_INDEX_PENDING_COUNT.sub(1); + self.send_read_command(ctx, read_cmd); + continue; + } + + assert!(read.read_index.is_some()); + let is_read_index_request = read.cmds().len() == 1 + && read.cmds()[0].0.get_requests().len() == 1 + && read.cmds()[0].0.get_requests()[0].get_cmd_type() == CmdType::ReadIndex; + + if is_read_index_request { + self.respond_read_index(&mut read); + } else if self.ready_to_handle_unsafe_replica_read(read.read_index.unwrap()) { + self.respond_replica_read(&mut read); + } else { + // TODO: `ReadIndex` requests could be blocked. + self.pending_reads_mut().push_front(read); + break; + } + } + } + + // Note: comparing with v1, it removes the snapshot check because in v2 the + // snapshot will not delete the data anymore. + fn ready_to_handle_unsafe_replica_read(&self, read_index: u64) -> bool { + // Wait until the follower applies all values before the read. There is still a + // problem if the leader applies fewer values than the follower, the follower + // read could get a newer value, and after that, the leader may read a stale + // value, which violates linearizability. 
+ self.storage().apply_state().get_applied_index() >= read_index + // If it is in pending merge state(i.e. applied PrepareMerge), the data may be stale. + // TODO: Add a test to cover this case + && self.proposal_control().has_applied_prepare_merge() + } + + #[inline] + pub fn ready_to_handle_read(&self) -> bool { + // TODO: It may cause read index to wait a long time. + + // There may be some values that are not applied by this leader yet but the old + // leader, if applied_term isn't equal to current term. + self.applied_to_current_term() + // There may be stale read if the old leader splits really slow, + // the new region may already elected a new leader while + // the old leader still think it owns the split range. + && !self.proposal_control().is_splitting() + // There may be stale read if a target leader is in another store and + // applied commit merge, written new values, but the sibling peer in + // this store does not apply commit merge, so the leader is not ready + // to read, until the merge is rollbacked. + && !self.proposal_control().is_merging() + } + + fn send_read_command( + &self, + ctx: &mut StoreContext, + read_cmd: RaftRequest, + ) { + let mut err = errorpb::Error::default(); + let region_id = read_cmd.request.get_header().get_region_id(); + let read_ch = match ctx.router.send(region_id, PeerMsg::RaftQuery(read_cmd)) { + Ok(()) => return, + Err(TrySendError::Full(PeerMsg::RaftQuery(cmd))) => { + err.set_message(RAFTSTORE_IS_BUSY.to_owned()); + err.mut_server_is_busy() + .set_reason(RAFTSTORE_IS_BUSY.to_owned()); + cmd.ch + } + Err(TrySendError::Disconnected(PeerMsg::RaftQuery(cmd))) => { + err.set_message(format!("region {} is missing", self.region_id())); + err.mut_region_not_found().set_region_id(self.region_id()); + cmd.ch + } + _ => unreachable!(), + }; + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(err); + read_ch.report_error(resp); + } + + /// Status command is used to query target region information. 
+ #[inline] + fn on_query_status(&mut self, req: &RaftCmdRequest, ch: QueryResChannel) { + let mut response = RaftCmdResponse::default(); + if let Err(e) = self.query_status(req, &mut response) { + cmd_resp::bind_error(&mut response, e); + } + ch.set_result(QueryResult::Response(response)); + } + + fn query_status(&mut self, req: &RaftCmdRequest, resp: &mut RaftCmdResponse) -> Result<()> { + util::check_store_id(req.get_header(), self.peer().get_store_id())?; + let cmd_type = req.get_status_request().get_cmd_type(); + let status_resp = resp.mut_status_response(); + status_resp.set_cmd_type(cmd_type); + match cmd_type { + StatusCmdType::RegionLeader => { + if let Some(leader) = self.leader() { + status_resp.mut_region_leader().set_leader(leader); + } + } + StatusCmdType::RegionDetail => { + if !self.storage().is_initialized() { + let region_id = req.get_header().get_region_id(); + return Err(Error::RegionNotInitialized(region_id)); + } + status_resp + .mut_region_detail() + .set_region(self.region().clone()); + if let Some(leader) = self.leader() { + status_resp.mut_region_detail().set_leader(leader); + } + } + StatusCmdType::InvalidStatus => { + return Err(box_err!( + "{} invalid status command!", + SlogFormat(&self.logger) + )); + } + } + + // Bind peer current term here. + cmd_resp::bind_term(resp, self.term()); + Ok(()) + } + + /// Query internal states for debugging purpose. + pub fn on_query_debug_info(&self, ch: DebugInfoChannel) { + let entry_storage = self.storage().entry_storage(); + let mut status = self.raft_group().status(); + status + .progress + .get_or_insert_with(|| self.raft_group().raft.prs()); + let mut meta = RegionMeta::new( + self.storage().region_state(), + entry_storage.apply_state(), + GroupState::Ordered, + status, + self.raft_group().raft.raft_log.last_index(), + self.raft_group().raft.raft_log.persisted, + ); + // V2 doesn't persist commit index and term, fill them with in-memory values. 
+ meta.raft_apply.commit_index = cmp::min( + self.raft_group().raft.raft_log.committed, + self.persisted_index(), + ); + meta.raft_apply.commit_term = self + .raft_group() + .raft + .raft_log + .term(meta.raft_apply.commit_index) + .unwrap(); + if let Some(bucket_stats) = self.region_buckets_info().bucket_stat() { + meta.bucket_keys = bucket_stats.meta.keys.clone(); + } + debug!(self.logger, "on query debug info"; + "tick" => self.raft_group().raft.election_elapsed, + "election_timeout" => self.raft_group().raft.randomized_election_timeout(), + ); + ch.set_result(meta); + } + + // the v1's post_apply + // As the logic is mostly for read, rename it to handle_read_after_apply + pub fn handle_read_on_apply( + &mut self, + ctx: &mut StoreContext, + applied_term: u64, + applied_index: u64, + progress_to_be_updated: bool, + ) { + // TODO: add is_handling_snapshot check + // it could update has_ready + + // TODO: add peer_stat(for PD hotspot scheduling) and deleted_keys_hint + if !self.is_leader() { + self.post_pending_read_index_on_replica(ctx) + } else if self.ready_to_handle_read() { + while let Some(mut read) = self.pending_reads_mut().pop_front() { + self.respond_read_index(&mut read); + } + } + self.pending_reads_mut().gc(); + self.read_progress_mut().update_applied_core(applied_index); + + // Only leaders need to update applied_term. 
+ if progress_to_be_updated && self.is_leader() { + if applied_term == self.term() { + ctx.coprocessor_host + .on_applied_current_term(StateRole::Leader, self.region()); + } + let progress = ReadProgress::applied_term(applied_term); + let mut meta = ctx.store_meta.lock().unwrap(); + let reader = &mut meta.readers.get_mut(&self.region_id()).unwrap().0; + self.maybe_update_read_progress(reader, progress); + } + } +} diff --git a/components/raftstore-v2/src/operation/query/replica.rs b/components/raftstore-v2/src/operation/query/replica.rs new file mode 100644 index 00000000000..901fd9726f6 --- /dev/null +++ b/components/raftstore-v2/src/operation/query/replica.rs @@ -0,0 +1,107 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::raft_cmdpb::{self, RaftCmdRequest, RaftCmdResponse}; +use pd_client::INVALID_ID; +use raftstore::{ + store::{ + cmd_resp, + fsm::apply::notify_stale_req, + metrics::RAFT_READ_INDEX_PENDING_COUNT, + msg::{ErrorCallback, ReadCallback}, + propose_read_index, ReadIndexRequest, Transport, + }, + Error, +}; +use slog::debug; +use tikv_util::time::monotonic_raw_now; +use tracker::GLOBAL_TRACKERS; + +use crate::{ + batch::StoreContext, + raft::Peer, + router::{QueryResChannel, QueryResult, ReadResponse}, +}; +impl Peer { + /// read index on follower + /// + /// call set_has_ready if it's proposed. 
+ pub(crate) fn read_index_follower( + &mut self, + ctx: &mut StoreContext, + mut req: RaftCmdRequest, + ch: QueryResChannel, + ) { + if self.leader_id() == INVALID_ID { + ctx.raft_metrics.invalid_proposal.read_index_no_leader.inc(); + let mut err_resp = RaftCmdResponse::default(); + let term = self.term(); + cmd_resp::bind_term(&mut err_resp, term); + cmd_resp::bind_error(&mut err_resp, Error::NotLeader(self.region_id(), None)); + ch.report_error(err_resp); + return; + } + + ctx.raft_metrics.propose.read_index.inc(); + + let request = req + .mut_requests() + .get_mut(0) + .filter(|req| req.has_read_index()) + .map(|req| req.take_read_index()); + let (id, _dropped) = propose_read_index(self.raft_group_mut(), request.as_ref(), None); + let now = monotonic_raw_now(); + let mut read = ReadIndexRequest::with_command(id, req, ch, now); + read.addition_request = request.map(Box::new); + self.pending_reads_mut().push_back(read, false); + debug!( + self.logger, + "request to get a read index from follower"; + "request_id" => ?id, + ); + self.set_has_ready(); + } + + pub(crate) fn respond_replica_read( + &self, + read_index_req: &mut ReadIndexRequest, + ) { + debug!( + self.logger, + "handle replica reads with a read index"; + "request_id" => ?read_index_req.id, + ); + RAFT_READ_INDEX_PENDING_COUNT.sub(read_index_req.cmds().len() as i64); + let time = monotonic_raw_now(); + for (req, ch, _) in read_index_req.take_cmds().drain(..) 
{ + ch.read_tracker().map(|tracker| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { + t.metrics.read_index_confirm_wait_nanos = (time - read_index_req.propose_time) + .to_std() + .unwrap() + .as_nanos() + as u64; + }) + }); + + // leader reports key is locked + if let Some(locked) = read_index_req.locked.take() { + let mut response = raft_cmdpb::Response::default(); + response.mut_read_index().set_locked(*locked); + let mut cmd_resp = RaftCmdResponse::default(); + cmd_resp.mut_responses().push(response); + ch.report_error(cmd_resp); + continue; + } + if req.get_header().get_replica_read() { + let read_resp = ReadResponse::new(read_index_req.read_index.unwrap_or(0)); + ch.set_result(QueryResult::Read(read_resp)); + } else { + // The request could be proposed when the peer was leader. + // TODO: figure out that it's necessary to notify stale or not. + let term = self.term(); + notify_stale_req(term, ch); + } + } + } +} diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs new file mode 100644 index 00000000000..6c9c73479ba --- /dev/null +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -0,0 +1,703 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! In raftstore v2, WAL is always disabled for tablet. So we need a way to +//! trace what have been persisted what haven't, and recover those missing +//! data when restart. +//! +//! In summary, we trace the persist progress by recording flushed event. +//! Because memtable is flushed one by one, so a flushed memtable must contain +//! all the data within the CF before certain apply index. So the minimun +//! flushed apply index + 1 of all data CFs is the recovery start point. In +//! some cases, a CF may not have any updates at all for a long time. In some +//! cases, we may still need to recover from smaller index even if flushed +//! index of all data CFs have advanced. 
So a special flushed index is
+//! introduced and stored with raft CF (only using the name, raft CF is
+//! dropped). It's the recommended recovery start point. How these two indexes
+//! interact with each other can be found in the `ApplyTrace::recover` and
+//! `ApplyTrace::maybe_advance_admin_flushed`.
+//!
+//! The correctness of raft cf index relies on the fact that:
+//! - apply is sequential, so if any apply index is updated to apply trace, all
+//!   modification events before that must be processed.
+//! - admin commands that are marked by raft cf index must flush all data before
+//!   being executed. Note this constraint is not just for recovery, but also
+//!   necessary to guarantee safety of operations like split init or log gc.
+//!   So data of logs before raft cf index must be applied and flushed to disk.
+//!
+//! All apply related states are associated with an apply index. During
+//! recovery states corresponding to the start index should be used.
+
+use std::{cmp, sync::Mutex};
+
+use encryption_export::DataKeyManager;
+use engine_traits::{
+    data_cf_offset, ApplyProgress, KvEngine, RaftEngine, RaftLogBatch, TabletRegistry, ALL_CFS,
+    CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, DATA_CFS, DATA_CFS_LEN,
+};
+use fail::fail_point;
+use kvproto::{
+    metapb::Region,
+    raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState},
+};
+use raftstore::store::{
+    ReadTask, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM,
+};
+use slog::{info, trace, Logger};
+use tikv_util::{box_err, slog_panic, worker::Scheduler};
+
+use crate::{
+    operation::{
+        command::temp_split_path,
+        ready::snapshot::{install_tablet, recv_snap_path},
+    },
+    raft::{Peer, Storage},
+    router::PeerMsg,
+    Result, StoreRouter,
+};
+
+/// Write states for the given region. The region is supposed to have all its
+/// data persisted and not governed by any raft group before.
+pub fn write_initial_states(wb: &mut impl RaftLogBatch, region: Region) -> Result<()> { + let region_id = region.get_id(); + + let mut state = RegionLocalState::default(); + state.set_region(region); + state.set_tablet_index(RAFT_INIT_LOG_INDEX); + wb.put_region_state(region_id, RAFT_INIT_LOG_INDEX, &state)?; + + let mut apply_state = RaftApplyState::default(); + apply_state.set_applied_index(RAFT_INIT_LOG_INDEX); + apply_state + .mut_truncated_state() + .set_index(RAFT_INIT_LOG_INDEX); + apply_state + .mut_truncated_state() + .set_term(RAFT_INIT_LOG_TERM); + wb.put_apply_state(region_id, RAFT_INIT_LOG_INDEX, &apply_state)?; + + let mut raft_state = RaftLocalState::default(); + raft_state.set_last_index(RAFT_INIT_LOG_INDEX); + raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); + raft_state.mut_hard_state().set_commit(RAFT_INIT_LOG_INDEX); + wb.put_raft_state(region_id, &raft_state)?; + + for cf in ALL_CFS { + wb.put_flushed_index(region_id, cf, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_INDEX)?; + } + + Ok(()) +} + +fn to_static_cf(cf: &str) -> &'static str { + match cf { + CF_DEFAULT => CF_DEFAULT, + CF_RAFT => CF_RAFT, + CF_WRITE => CF_WRITE, + CF_LOCK => CF_LOCK, + _ => unreachable!("unexpected cf: {cf}"), + } +} + +pub struct StateStorage { + raft_engine: ER, + router: Mutex>, +} + +impl StateStorage { + pub fn new(raft_engine: ER, router: StoreRouter) -> Self { + Self { + raft_engine, + router: Mutex::new(router), + } + } +} + +impl engine_traits::StateStorage for StateStorage { + fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: ApplyProgress) { + let cf = to_static_cf(pr.cf()); + let flushed_index = pr.applied_index(); + self.raft_engine + .persist_progress(region_id, tablet_index, pr); + let _ = self.router.lock().unwrap().send( + region_id, + PeerMsg::DataFlushed { + cf, + tablet_index, + flushed_index, + }, + ); + } +} + +/// Mapping from data cf to an u64 index. 
+pub type DataTrace = [u64; DATA_CFS_LEN]; + +#[derive(Clone, Copy, Default, Debug)] +struct Progress { + flushed: u64, + /// The index of last entry that has modification to the CF. The value + /// can be larger than the index that actually modifies the CF in apply. + /// + /// If `flushed` == `last_modified`, then all data in the CF is persisted. + last_modified: u64, +} + +/// `ApplyTrace` is used to track the indexes of modifications and flushes. +/// +/// It has 3 core functionalities: +/// - recover from stopped state and figure out the correct log replay start +/// point. +/// - trace the admin flushed index and issue persistence once admin operation +/// is considered finished. Note only those admin commands that needs to +/// interact with other peers will be traced. +/// - support query the flushed progress without actually scanning raft engine, +/// which is useful for cleaning up stale flush records. +#[derive(Default, Debug)] +pub struct ApplyTrace { + /// The modified indexes and flushed index of each data CF. + data_cfs: Box<[Progress; DATA_CFS_LEN]>, + /// The modified indexes and flushed index of raft CF. + /// + /// raft CF is a virtual CF that only used for recording apply index of + /// certain admin commands (like split/merge). So there is no flush at all. + /// The `flushed` field is advanced when the admin command doesn't need to + /// be replayed after restart. A write should be triggered to persist the + /// record. + admin: Progress, + /// Index that is issued to be written. It may not be truely persisted. + persisted_applied: u64, + /// Flush will be triggered explicitly when there are too many pending + /// writes. It marks the last index that is flushed to avoid too many + /// flushes. + last_flush_trigger: u64, + /// `true` means the raft cf record should be persisted in next ready. 
+ try_persist: bool, +} + +impl ApplyTrace { + fn recover(region_id: u64, engine: &impl RaftEngine) -> Result<(Self, RegionLocalState)> { + let mut trace = ApplyTrace::default(); + // Get all the recorded apply index from data CFs. + for (off, cf) in DATA_CFS.iter().enumerate() { + // There should be at least one record. + let i = engine.get_flushed_index(region_id, cf)?.unwrap(); + trace.data_cfs[off].flushed = i; + trace.data_cfs[off].last_modified = i; + } + let i = engine.get_flushed_index(region_id, CF_RAFT)?.unwrap(); + // Index of raft CF means all data before that must be persisted. + trace.admin.flushed = i; + trace.admin.last_modified = i; + trace.persisted_applied = i; + trace.last_flush_trigger = i; + let applied_region_state = match engine.get_region_state(region_id, trace.admin.flushed)? { + Some(s) => s, + None => panic!( + "failed to get region state [region_id={}] [apply_trace={:?}]", + region_id, trace + ), + }; + Ok((trace, applied_region_state)) + } + + fn on_flush(&mut self, cf: &str, index: u64) { + let off = data_cf_offset(cf); + // Technically it should always be true. + if index > self.data_cfs[off].flushed { + self.data_cfs[off].flushed = index; + } + } + + fn on_modify(&mut self, cf: &str, index: u64) { + let off = data_cf_offset(cf); + self.data_cfs[off].last_modified = index; + } + + pub fn on_admin_flush(&mut self, index: u64) { + if index > self.admin.flushed { + self.admin.flushed = index; + self.try_persist = true; + } + } + + pub fn on_admin_modify(&mut self, index: u64) { + self.admin.last_modified = index; + } + + pub fn persisted_apply_index(&self) -> u64 { + self.persisted_applied + } + + pub fn should_flush(&mut self) -> bool { + if self.admin.flushed < self.admin.last_modified { + // It's waiting for other peers, flush will not help. 
+ return false; + } + let last_modified = self + .data_cfs + .iter() + .filter_map(|pr| { + if pr.last_modified != pr.flushed { + Some(pr.last_modified) + } else { + None + } + }) + .max(); + if let Some(m) = last_modified && m >= self.admin.flushed + 4096000 && m >= self.last_flush_trigger + 4096000 { + self.last_flush_trigger = m; + true + } else { + false + } + } + + // All events before `mem_index` must be consumed before calling this function. + fn maybe_advance_admin_flushed(&mut self, mem_index: u64) { + if self.admin.flushed < self.admin.last_modified { + return; + } + let min_flushed = self + .data_cfs + .iter_mut() + // Only unflushed CFs are considered. Flushed CF always have uptodate changes + // persisted. + .filter_map(|pr| { + // All modifications before mem_index must be seen. If following condition is + // true, it means the modification comes beyond general apply process (like + // transaction GC unsafe write). Align `last_modified` to `flushed` to avoid + // blocking raft log GC. + if mem_index >= pr.flushed && pr.flushed > pr.last_modified { + pr.last_modified = pr.flushed; + } + if pr.last_modified != pr.flushed { + Some(pr.flushed) + } else { + None + } + }) + .min(); + // At best effort, we can only advance the index to `mem_index`. + let candidate = cmp::min(mem_index, min_flushed.unwrap_or(u64::MAX)); + if candidate > self.admin.flushed { + self.admin.flushed = candidate; + if self.admin.flushed > self.persisted_applied + 100 { + self.try_persist = true; + } + } + // TODO: persist admin.flushed every 10 minutes. + } + + /// Get the flushed indexes of all data CF that is needed when recoverying + /// logs. + /// + /// Logs may be replayed from the persisted apply index, but those data may + /// have been flushed in the past, so we need the flushed indexes to decide + /// what logs can be skipped for certain CFs. If all CFs are flushed before + /// the persisted apply index, then there is nothing to skipped, so + /// `None` is returned. 
+ #[inline] + pub fn log_recovery(&self) -> Option> { + let mut flushed_indexes = [0; DATA_CFS_LEN]; + for (off, pr) in self.data_cfs.iter().enumerate() { + flushed_indexes[off] = pr.flushed; + } + for i in flushed_indexes { + if i > self.admin.flushed { + return Some(Box::new(flushed_indexes)); + } + } + None + } + + pub fn restore_snapshot(&mut self, index: u64) { + for pr in self.data_cfs.iter_mut() { + pr.last_modified = index; + } + self.admin.last_modified = index; + // Snapshot is a special case that KVs are not flushed yet, so all flushed + // state should not be changed. But persisted_applied is updated whenever an + // asynchronous write is triggered. So it can lead to a special case that + // persisted_applied < admin.flushed. It seems no harm ATM though. + self.persisted_applied = index; + self.try_persist = false; + } + + pub fn on_applied_snapshot(&mut self, index: u64) { + for pr in self.data_cfs.iter_mut() { + pr.flushed = index; + } + self.admin.flushed = index; + } + + #[inline] + pub fn should_persist(&self) -> bool { + self.try_persist + } +} + +impl Storage { + /// Creates a new storage with uninit states. + /// + /// This should only be used for creating new peer from raft message. + pub fn uninit( + store_id: u64, + region: Region, + engine: ER, + read_scheduler: Scheduler>, + logger: &Logger, + ) -> Result { + let mut region_state = RegionLocalState::default(); + region_state.set_region(region); + Self::create( + store_id, + region_state, + RaftLocalState::default(), + RaftApplyState::default(), + engine, + read_scheduler, + false, + ApplyTrace::default(), + logger, + ) + } + + /// Creates a new storage. + /// + /// All metadata should be initialized before calling this method. If the + /// region is destroyed, `None` will be returned. + pub fn new( + region_id: u64, + store_id: u64, + engine: ER, + read_scheduler: Scheduler>, + logger: &Logger, + ) -> Result>> { + // Check latest region state to determine whether the peer is destroyed. 
+ let region_state = match engine.get_region_state(region_id, u64::MAX) { + Ok(Some(s)) => s, + res => { + return Err(box_err!( + "failed to get region state for region {}: {:?}", + region_id, + res + )); + } + }; + + if region_state.get_state() == PeerState::Tombstone { + return Ok(None); + } + + let (trace, region_state) = ApplyTrace::recover(region_id, &engine)?; + + let raft_state = match engine.get_raft_state(region_id) { + Ok(Some(s)) => s, + res => { + return Err(box_err!("failed to get raft state: {:?}", res)); + } + }; + + let applied_index = trace.persisted_apply_index(); + let mut apply_state = match engine.get_apply_state(region_id, applied_index) { + Ok(Some(s)) => s, + res => { + return Err(box_err!("failed to get apply state: {:?}", res)); + } + }; + apply_state.set_applied_index(applied_index); + (|| { + // Make node reply from start. + fail_point!("RESET_APPLY_INDEX_WHEN_RESTART", |_| { + apply_state.set_applied_index(5); + }); + })(); + + Self::create( + store_id, + region_state, + raft_state, + apply_state, + engine, + read_scheduler, + true, + trace, + logger, + ) + .map(Some) + } + + /// Region state is written before actually moving data. It's possible that + /// the tablet is missing after restart. We need to move the data again + /// after being restarted. + pub fn recover_tablet( + &self, + registry: &TabletRegistry, + key_manager: Option<&DataKeyManager>, + snap_mgr: &TabletSnapManager, + ) { + let tablet_index = self.region_state().get_tablet_index(); + if tablet_index == 0 { + // It's an uninitialized peer, nothing to recover. + return; + } + let region_id = self.region().get_id(); + let target_path = registry.tablet_path(region_id, tablet_index); + if target_path.exists() { + // Move data succeeded before restart, nothing to recover. + return; + } + if tablet_index == RAFT_INIT_LOG_INDEX { + // Its data may come from split or snapshot. Try split first. 
+ let split_path = temp_split_path(registry, region_id); + if install_tablet(registry, key_manager, &split_path, region_id, tablet_index) { + return; + } + } + let truncated_index = self.entry_storage().truncated_index(); + if truncated_index == tablet_index { + // Try snapshot. + let peer_id = self.peer().get_id(); + let snap_path = recv_snap_path( + snap_mgr, + region_id, + peer_id, + self.entry_storage().truncated_term(), + tablet_index, + ); + if install_tablet(registry, key_manager, &snap_path, region_id, tablet_index) { + return; + } + } + slog_panic!( + self.logger(), + "tablet loss detected"; + "tablet_index" => tablet_index + ); + } + + /// Write initial persist trace for uninit peer. + pub fn init_apply_trace(&self, write_task: &mut WriteTask) { + let region_id = self.region().get_id(); + let raft_engine = self.entry_storage().raft_engine(); + let lb = write_task + .extra_write + .ensure_v2(|| raft_engine.log_batch(3)); + lb.put_apply_state(region_id, 0, self.apply_state()) + .unwrap(); + lb.put_region_state(region_id, 0, self.region_state()) + .unwrap(); + for cf in ALL_CFS { + lb.put_flushed_index(region_id, cf, 0, 0).unwrap(); + } + } + + pub fn record_apply_trace(&mut self, write_task: &mut WriteTask) { + let trace = self.apply_trace(); + // Maybe tablet index can be different? 
+ if trace.persisted_applied > trace.admin.flushed { + return; + } + let region_id = self.region().get_id(); + let raft_engine = self.entry_storage().raft_engine(); + let tablet_index = self.tablet_index(); + let lb = write_task + .extra_write + .ensure_v2(|| raft_engine.log_batch(1)); + info!(self.logger(), "persisting admin flushed"; "tablet_index" => tablet_index, "flushed" => trace.admin.flushed); + let trace = self.apply_trace_mut(); + lb.put_flushed_index(region_id, CF_RAFT, tablet_index, trace.admin.flushed) + .unwrap(); + trace.try_persist = false; + trace.persisted_applied = trace.admin.flushed; + } +} + +impl Peer { + pub fn on_data_flushed(&mut self, cf: &str, tablet_index: u64, index: u64) { + trace!(self.logger, "data flushed"; "cf" => cf, "tablet_index" => tablet_index, "index" => index, "trace" => ?self.storage().apply_trace()); + if tablet_index < self.storage().tablet_index() { + // Stale tablet. + return; + } + let apply_index = self.storage().entry_storage().applied_index(); + let apply_trace = self.storage_mut().apply_trace_mut(); + apply_trace.on_flush(cf, index); + apply_trace.maybe_advance_admin_flushed(apply_index); + } + + pub fn on_data_modified(&mut self, modification: DataTrace) { + trace!(self.logger, "on data modified"; "modification" => ?modification, "trace" => ?self.storage().apply_trace()); + let apply_index = self.storage().entry_storage().applied_index(); + let apply_trace = self.storage_mut().apply_trace_mut(); + for (cf, index) in DATA_CFS.iter().zip(modification) { + if index != 0 { + apply_trace.on_modify(cf, index); + } + } + apply_trace.maybe_advance_admin_flushed(apply_index); + } +} + +#[cfg(test)] +mod tests { + use engine_traits::RaftEngineReadOnly; + use kvproto::metapb::Peer; + use tempfile::TempDir; + + use super::*; + + fn new_region() -> Region { + let mut region = Region::default(); + region.set_id(4); + let mut p = Peer::default(); + p.set_id(5); + p.set_store_id(6); + region.mut_peers().push(p); + 
region.mut_region_epoch().set_version(2); + region.mut_region_epoch().set_conf_ver(4); + region + } + + #[test] + fn test_write_initial_states() { + let region = new_region(); + let path = TempDir::new().unwrap(); + let engine = engine_test::new_temp_engine(&path); + let raft_engine = &engine.raft; + let mut wb = raft_engine.log_batch(10); + write_initial_states(&mut wb, region.clone()).unwrap(); + assert!(!wb.is_empty()); + raft_engine.consume(&mut wb, true).unwrap(); + + let local_state = raft_engine.get_region_state(4, u64::MAX).unwrap().unwrap(); + assert_eq!(local_state.get_state(), PeerState::Normal); + assert_eq!(*local_state.get_region(), region); + assert_eq!(local_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + assert_eq!( + local_state, + raft_engine + .get_region_state(4, RAFT_INIT_LOG_INDEX) + .unwrap() + .unwrap() + ); + assert_eq!( + None, + raft_engine + .get_region_state(4, RAFT_INIT_LOG_INDEX - 1) + .unwrap() + ); + + let raft_state = raft_engine.get_raft_state(4).unwrap().unwrap(); + assert_eq!(raft_state.get_last_index(), RAFT_INIT_LOG_INDEX); + let hs = raft_state.get_hard_state(); + assert_eq!(hs.get_term(), RAFT_INIT_LOG_TERM); + assert_eq!(hs.get_commit(), RAFT_INIT_LOG_INDEX); + + let apply_state = raft_engine.get_apply_state(4, u64::MAX).unwrap().unwrap(); + assert_eq!(apply_state.get_applied_index(), RAFT_INIT_LOG_INDEX); + let ts = apply_state.get_truncated_state(); + assert_eq!(ts.get_index(), RAFT_INIT_LOG_INDEX); + assert_eq!(ts.get_term(), RAFT_INIT_LOG_TERM); + assert_eq!( + apply_state, + raft_engine + .get_apply_state(4, RAFT_INIT_LOG_INDEX) + .unwrap() + .unwrap() + ); + assert_eq!( + None, + raft_engine + .get_apply_state(4, RAFT_INIT_LOG_INDEX - 1) + .unwrap() + ); + } + + #[test] + fn test_apply_trace() { + let mut trace = ApplyTrace::default(); + assert_eq!(0, trace.admin.flushed); + // If there is no modifications, index should be advanced anyway. 
+ trace.maybe_advance_admin_flushed(2); + assert_eq!(2, trace.admin.flushed); + for cf in DATA_CFS { + trace.on_modify(cf, 3); + } + trace.maybe_advance_admin_flushed(3); + // Modification is not flushed. + assert_eq!(2, trace.admin.flushed); + for cf in DATA_CFS { + trace.on_flush(cf, 3); + } + trace.maybe_advance_admin_flushed(3); + // No admin is recorded, index should be advanced. + assert_eq!(3, trace.admin.flushed); + trace.on_admin_modify(4); + for cf in DATA_CFS { + trace.on_flush(cf, 4); + } + for cf in DATA_CFS { + trace.on_modify(cf, 4); + } + trace.maybe_advance_admin_flushed(4); + // Unflushed admin modification should hold index. + assert_eq!(3, trace.admin.flushed); + trace.on_admin_flush(4); + trace.maybe_advance_admin_flushed(4); + // Admin is flushed, index should be advanced. + assert_eq!(4, trace.admin.flushed); + for cf in DATA_CFS { + trace.on_flush(cf, 5); + } + trace.maybe_advance_admin_flushed(4); + // Though all data CFs are flushed, but index should not be + // advanced as we don't know whether there is admin modification. + assert_eq!(4, trace.admin.flushed); + for cf in DATA_CFS { + trace.on_modify(cf, 5); + } + trace.maybe_advance_admin_flushed(5); + // Because modify is recorded, so we know there should be no admin + // modification and index can be advanced. + assert_eq!(5, trace.admin.flushed); + } + + #[test] + fn test_advance_admin_flushed() { + let cases = &[ + // When all are flushed, admin index should be advanced to latest. + ([(2, 2), (3, 3), (5, 5)], (3, 3), 5, 5), + ([(2, 2), (3, 3), (5, 5)], (5, 3), 6, 6), + // Any unflushed result should block advancing. + ([(2, 3), (3, 3), (5, 5)], (2, 2), 5, 2), + ([(2, 4), (3, 4), (5, 6)], (2, 2), 6, 2), + // But it should not make index go back. + ([(2, 4), (3, 4), (5, 6)], (3, 3), 6, 3), + // Unflush admin should not be advanced. + ([(2, 2), (3, 3), (5, 5)], (2, 3), 5, 2), + // Flushed may race with modification. 
+ ([(2, 2), (3, 3), (6, 5)], (2, 2), 5, 5), + ([(8, 2), (9, 3), (7, 5)], (4, 4), 5, 5), + ([(8, 2), (9, 3), (7, 5)], (5, 5), 5, 5), + ([(2, 3), (9, 3), (7, 5)], (2, 2), 5, 2), + // In special cae, some CF may be flushed without any modification recorded, + // we should still able to advance the apply index forward. + ([(5, 2), (9, 3), (7, 3)], (2, 2), 3, 3), + ([(5, 2), (9, 3), (7, 3)], (2, 2), 6, 6), + ([(5, 2), (9, 3), (7, 3)], (2, 2), 10, 10), + ([(5, 2), (9, 3), (7, 3)], (2, 3), 10, 2), + ]; + for (case, (data_cfs, admin, mem_index, exp)) in cases.iter().enumerate() { + let mut trace = ApplyTrace::default(); + for (i, (flushed, modified)) in data_cfs.iter().enumerate() { + trace.data_cfs[i].flushed = *flushed; + trace.data_cfs[i].last_modified = *modified; + } + trace.admin.flushed = admin.0; + trace.admin.last_modified = admin.1; + trace.maybe_advance_admin_flushed(*mem_index); + assert_eq!(trace.admin.flushed, *exp, "{case}"); + } + } +} diff --git a/components/raftstore-v2/src/operation/ready/async_writer.rs b/components/raftstore-v2/src/operation/ready/async_writer.rs new file mode 100644 index 00000000000..96f1611d9f1 --- /dev/null +++ b/components/raftstore-v2/src/operation/ready/async_writer.rs @@ -0,0 +1,236 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::collections::VecDeque; + +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::raft_serverpb::RaftMessage; +use raftstore::store::{ + local_metrics::RaftMetrics, Config, PersistedNotifier, WriteRouter, WriteRouterContext, + WriteSenders, WriteTask, +}; +use slog::{warn, Logger}; +use tikv_util::slog_panic; + +use crate::{ + batch::{StoreContext, StoreRouter}, + router::PeerMsg, +}; + +#[derive(Debug)] +struct UnpersistedReady { + /// Number of ready. + number: u64, + /// Max number of following ready whose data to be persisted is empty. + max_empty_number: u64, + raft_msgs: Vec>, + has_snapshot: bool, +} + +/// A writer that handles asynchronous writes. 
+pub struct AsyncWriter { + write_router: WriteRouter, + unpersisted_readies: VecDeque, + persisted_number: u64, + #[cfg(feature = "testexport")] + flush_subscribers: VecDeque<(u64, crate::router::FlushChannel)>, +} + +impl AsyncWriter { + pub fn new(region_id: u64, peer_id: u64) -> Self { + let write_router = WriteRouter::new(format!("[region {}] {}", region_id, peer_id)); + Self { + write_router, + unpersisted_readies: VecDeque::new(), + persisted_number: 0, + #[cfg(feature = "testexport")] + flush_subscribers: VecDeque::new(), + } + } + + /// Execute the task. + /// + /// If the task takes some time to finish, `None` is returned. Otherwise, + pub fn write( + &mut self, + ctx: &mut impl WriteRouterContext, + task: WriteTask, + ) -> Option> { + if task.has_data() { + self.send(ctx, task); + None + } else { + self.merge(task) + } + } + + pub fn known_largest_number(&self) -> u64 { + self.unpersisted_readies + .back() + .map(|r| r.number) + .unwrap_or(self.persisted_number) + } + + fn send(&mut self, ctx: &mut impl WriteRouterContext, task: WriteTask) { + let ready_number = task.ready_number(); + let has_snapshot = task.has_snapshot; + self.write_router.send_write_msg( + ctx, + self.unpersisted_readies.back().map(|r| r.number), + raftstore::store::WriteMsg::WriteTask(task), + ); + self.unpersisted_readies.push_back(UnpersistedReady { + number: ready_number, + max_empty_number: ready_number, + raft_msgs: vec![], + has_snapshot, + }); + } + + fn merge(&mut self, task: WriteTask) -> Option> { + if self.unpersisted_readies.is_empty() { + // If this ready don't need to be persisted and there is no previous unpersisted + // ready, we can safely consider it is persisted so the persisted msgs can be + // sent immediately. + self.persisted_number = task.ready_number(); + return Some(task); + } + + // Attach to the last unpersisted ready so that it can be considered to be + // persisted with the last ready at the same time. 
+ let last = self.unpersisted_readies.back_mut().unwrap(); + last.max_empty_number = task.ready_number(); + if !task.messages.is_empty() { + last.raft_msgs.push(task.messages); + } + None + } + + /// Called when an asynchronous write has finished. + pub fn on_persisted( + &mut self, + ctx: &mut impl WriteRouterContext, + ready_number: u64, + logger: &Logger, + ) -> (Vec>, bool) { + if self.persisted_number >= ready_number { + return (vec![], false); + } + + let last_unpersisted = self.unpersisted_readies.back(); + if last_unpersisted.map_or(true, |u| u.number < ready_number) { + slog_panic!( + logger, + "ready number is too large"; + "last_unpersisted" => ?last_unpersisted, + "ready_number" => ready_number + ); + } + + let mut raft_messages = vec![]; + let mut has_snapshot = false; + // There must be a match in `self.unpersisted_readies`. + loop { + let Some(v) = self.unpersisted_readies.pop_front() else { + slog_panic!(logger, "ready number not found"; "ready_number" => ready_number); + }; + has_snapshot |= v.has_snapshot; + if v.number > ready_number { + slog_panic!( + logger, + "ready number not matched"; + "ready" => ?v, + "ready_number" => ready_number + ); + } + if raft_messages.is_empty() { + raft_messages = v.raft_msgs; + } else { + raft_messages.extend(v.raft_msgs); + } + if v.number == ready_number { + self.persisted_number = v.max_empty_number; + break; + } + } + + self.write_router + .check_new_persisted(ctx, self.persisted_number); + + (raft_messages, has_snapshot) + } + + pub fn persisted_number(&self) -> u64 { + self.persisted_number + } + + pub fn all_ready_persisted(&self) -> bool { + self.unpersisted_readies.is_empty() + } +} + +#[cfg(feature = "testexport")] +impl AsyncWriter { + pub fn subscirbe_flush(&mut self, ch: crate::router::FlushChannel) { + self.flush_subscribers + .push_back((self.known_largest_number(), ch)); + } + + pub fn notify_flush(&mut self) { + if self.flush_subscribers.is_empty() { + return; + } + if self.all_ready_persisted() 
{ + for (_, ch) in self.flush_subscribers.drain(..) { + ch.set_result(()); + } + } + while let Some((number, ch)) = self.flush_subscribers.pop_front() { + // A channel is registered without ready, so persisted_number should be larger. + if self.persisted_number > number { + ch.set_result(()); + } else { + self.flush_subscribers.push_front((number, ch)); + break; + } + } + } +} + +impl WriteRouterContext for StoreContext +where + EK: KvEngine, + ER: RaftEngine, +{ + fn write_senders(&self) -> &WriteSenders { + &self.schedulers.write + } + + fn config(&self) -> &Config { + &self.cfg + } + + fn raft_metrics(&self) -> &RaftMetrics { + &self.raft_metrics + } +} + +impl PersistedNotifier for StoreRouter { + fn notify(&self, region_id: u64, peer_id: u64, ready_number: u64) { + if let Err(e) = self.force_send( + region_id, + PeerMsg::Persisted { + peer_id, + ready_number, + }, + ) { + warn!( + self.logger(), + "failed to send noop to trigger persisted ready"; + "region_id" => region_id, + "peer_id" => peer_id, + "ready_number" => ready_number, + "error" => ?e, + ); + } + } +} diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs new file mode 100644 index 00000000000..62e8fda7ba0 --- /dev/null +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -0,0 +1,1081 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains the actions that will drive a raft state machine. +//! +//! # Raft Ready +//! +//! Every messages or ticks may have side affect. Handling all those side +//! affect immediately is not efficient. Instead, tikv uses `Ready` to batch up +//! all the side affects and handle them at once for throughput. +//! +//! As raft store is the critical path in the whole system, we avoid most +//! blocking IO. So a typical processing is divided into two steps: +//! +//! - Handle raft ready to process the side affect and send IO tasks to +//! background threads +//! 
- Receive IO tasks completion and update the raft state machine +//! +//! There two steps can be processed concurrently. + +mod apply_trace; +mod async_writer; +mod snapshot; + +use std::{cmp, time::Instant}; + +use engine_traits::{KvEngine, RaftEngine}; +use error_code::ErrorCodeExt; +use kvproto::{ + metapb, + raft_cmdpb::AdminCmdType, + raft_serverpb::{ExtraMessageType, RaftMessage}, +}; +use protobuf::Message as _; +use raft::{eraftpb, prelude::MessageType, Ready, SnapshotStatus, StateRole, INVALID_ID}; +use raftstore::{ + coprocessor::{RegionChangeEvent, RoleChange}, + store::{ + needs_evict_entry_cache, + util::{self, is_initial_msg}, + worker_metrics::SNAP_COUNTER, + FetchedLogs, ReadProgress, Transport, WriteCallback, WriteTask, + }, +}; +use slog::{debug, error, info, trace, warn}; +use tikv_util::{ + log::SlogFormat, + slog_panic, + store::find_peer, + time::{duration_to_sec, monotonic_raw_now}, +}; + +pub use self::{ + apply_trace::{write_initial_states, ApplyTrace, DataTrace, StateStorage}, + async_writer::AsyncWriter, + snapshot::{GenSnapTask, SnapState}, +}; +use crate::{ + batch::StoreContext, + fsm::{PeerFsmDelegate, Store}, + operation::life::is_empty_split_message, + raft::{Peer, Storage}, + router::{PeerMsg, PeerTick}, + worker::tablet, +}; + +const PAUSE_FOR_RECOVERY_GAP: u64 = 128; + +impl Store { + pub fn on_store_unreachable( + &mut self, + ctx: &mut StoreContext, + to_store_id: u64, + ) where + EK: KvEngine, + ER: RaftEngine, + { + ctx.router + .broadcast_normal(|| PeerMsg::StoreUnreachable { to_store_id }); + } + + #[cfg(feature = "testexport")] + pub fn on_wait_flush( + &mut self, + ctx: &mut StoreContext, + region_id: u64, + ch: crate::router::FlushChannel, + ) where + EK: KvEngine, + ER: RaftEngine, + { + let _ = ctx.router.send(region_id, PeerMsg::WaitFlush(ch)); + } +} + +impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { + /// Raft relies on periodic ticks to keep the state machine sync with other + 
/// peers. + pub fn on_raft_tick(&mut self) { + if self.fsm.peer_mut().tick() { + self.fsm.peer_mut().set_has_ready(); + } + self.fsm.peer_mut().maybe_clean_up_stale_merge_context(); + self.schedule_tick(PeerTick::Raft); + } + + pub fn on_check_long_uncommitted(&mut self) { + if !self.fsm.peer().is_leader() { + return; + } + self.fsm + .peer_mut() + .check_long_uncommitted_proposals(self.store_ctx); + self.schedule_tick(PeerTick::CheckLongUncommitted); + } +} + +impl Peer { + pub fn maybe_pause_for_recovery(&mut self, store_ctx: &mut StoreContext) -> bool { + // The task needs to be scheduled even if the tablet may be replaced during + // recovery. Otherwise if there are merges during recovery, the FSM may + // be paused forever. + if self.storage().has_dirty_data() { + let region_id = self.region_id(); + let mailbox = store_ctx.router.mailbox(region_id).unwrap(); + let tablet_index = self.storage().tablet_index(); + let _ = store_ctx.schedulers.tablet.schedule(tablet::Task::trim( + self.tablet().unwrap().clone(), + self.region(), + move || { + let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); + }, + )); + } + let entry_storage = self.storage().entry_storage(); + let committed_index = entry_storage.commit_index(); + let applied_index = entry_storage.applied_index(); + if committed_index > applied_index { + // Unlike v1, it's a must to set ready when there are pending entries. Otherwise + // it may block for ever when there is unapplied conf change. + self.set_has_ready(); + } + if committed_index > applied_index + PAUSE_FOR_RECOVERY_GAP { + // If there are too many the missing logs, we need to skip ticking otherwise + // it may block the raftstore thread for a long time in reading logs for + // election timeout. 
+ info!(self.logger, "pause for recovery"; "applied" => applied_index, "committed" => committed_index); + self.set_pause_for_recovery(true); + true + } else { + false + } + } + + #[inline] + fn tick(&mut self) -> bool { + // When it's handling snapshot, it's pointless to tick as all the side + // affects have to wait till snapshot is applied. On the other hand, ticking + // will bring other corner cases like elections. + !self.is_handling_snapshot() && self.serving() && self.raft_group_mut().tick() + } + + pub fn on_peer_unreachable(&mut self, to_peer_id: u64) { + if self.is_leader() { + self.raft_group_mut().report_unreachable(to_peer_id); + } + } + + pub fn on_store_unreachable(&mut self, to_store_id: u64) { + if self.is_leader() { + if let Some(peer_id) = find_peer(self.region(), to_store_id).map(|p| p.get_id()) { + self.raft_group_mut().report_unreachable(peer_id); + } + } + } + + pub fn on_raft_message( + &mut self, + ctx: &mut StoreContext, + mut msg: Box, + ) { + debug!( + self.logger, + "handle raft message"; + "message_type" => %util::MsgType(&msg), + "from_peer_id" => msg.get_from_peer().get_id(), + "to_peer_id" => msg.get_to_peer().get_id(), + ); + if self.pause_for_recovery() && msg.get_message().get_msg_type() == MessageType::MsgAppend { + ctx.raft_metrics.message_dropped.recovery.inc(); + return; + } + if !self.serving() { + return; + } + if util::is_vote_msg(msg.get_message()) && self.maybe_gc_sender(&msg) { + return; + } + if msg.get_to_peer().get_store_id() != self.peer().get_store_id() { + ctx.raft_metrics.message_dropped.mismatch_store_id.inc(); + return; + } + if msg.get_is_tombstone() { + self.on_tombstone_message(&mut msg); + return; + } + if msg.has_extra_msg() && msg.get_to_peer().get_id() == self.peer_id() { + // GcRequest/GcResponse may be sent from/to different regions, skip further + // checks. 
+ match msg.get_extra_msg().get_type() { + ExtraMessageType::MsgGcPeerResponse => { + self.on_gc_peer_response(&msg); + return; + } + ExtraMessageType::MsgGcPeerRequest => { + self.on_gc_peer_request(ctx, &msg); + return; + } + ExtraMessageType::MsgFlushMemtable => { + let region_epoch = msg.as_ref().get_region_epoch(); + if util::is_epoch_stale(region_epoch, self.region().get_region_epoch()) { + return; + } + let _ = ctx + .schedulers + .tablet + .schedule(crate::worker::tablet::Task::Flush { + region_id: self.region().get_id(), + cb: None, + }); + return; + } + ExtraMessageType::MsgWantRollbackMerge => { + if self.is_leader() { + // TODO: + // self.merge_context_mut().maybe_add_rollback_peer(); + return; + } + } + ExtraMessageType::MsgAvailabilityRequest => { + self.on_availability_request( + ctx, + msg.get_extra_msg() + .get_availability_context() + .get_from_region_id(), + msg.get_from_peer(), + ); + return; + } + ExtraMessageType::MsgAvailabilityResponse => { + self.on_availability_response( + ctx, + msg.get_from_peer().get_id(), + msg.get_extra_msg(), + ); + return; + } + _ => (), + } + } + if !msg.has_region_epoch() { + ctx.raft_metrics.message_dropped.mismatch_region_epoch.inc(); + return; + } + if msg.has_merge_target() { + unimplemented!(); + // return; + } + // We don't handle stale message like v1, as we rely on leader to actively + // cleanup stale peers. + let to_peer = msg.get_to_peer(); + // Check if the message is sent to the right peer. + match to_peer.get_id().cmp(&self.peer_id()) { + cmp::Ordering::Equal => (), + cmp::Ordering::Less => { + ctx.raft_metrics.message_dropped.stale_msg.inc(); + return; + } + cmp::Ordering::Greater => { + // We need to create the target peer. 
+ info!(self.logger, "mark for destroy for larger ID"; "larger_id" => to_peer.get_id()); + self.mark_for_destroy(Some(msg)); + return; + } + } + if msg.has_extra_msg() { + unimplemented!(); + } + + // TODO: drop all msg append when the peer is uninitialized and has conflict + // ranges with other peers. + let from_peer = msg.take_from_peer(); + let from_peer_id = from_peer.get_id(); + if from_peer_id != INVALID_ID { + if self.is_leader() { + self.add_peer_heartbeat(from_peer.get_id(), Instant::now()); + } + // We only cache peer with an vaild ID. + // It prevents cache peer(0,0) which is sent by region split. + self.insert_peer_cache(from_peer); + } + let pre_committed_index = self.raft_group().raft.raft_log.committed; + if msg.get_message().get_msg_type() == MessageType::MsgTransferLeader { + self.on_transfer_leader_msg(ctx, msg.get_message(), msg.disk_usage) + } else { + // This can be a message that sent when it's still a follower. Nevertheleast, + // it's meaningless to continue to handle the request as callbacks are cleared. + if msg.get_message().get_msg_type() == MessageType::MsgReadIndex + && self.is_leader() + && (msg.get_message().get_from() == raft::INVALID_ID + || msg.get_message().get_from() == self.peer_id()) + { + ctx.raft_metrics.message_dropped.stale_msg.inc(); + return; + } + // As this peer is already created, the empty split message is meaningless. + if is_empty_split_message(&msg) { + ctx.raft_metrics.message_dropped.stale_msg.inc(); + return; + } + + if let Err(e) = self.raft_group_mut().step(msg.take_message()) { + error!(self.logger, "raft step error"; "err" => ?e); + } else { + let committed_index = self.raft_group().raft.raft_log.committed; + self.report_commit_log_duration(ctx, pre_committed_index, committed_index); + } + } + + // There are two different cases to check peers can be bring back. + // 1. If the peer is pending, then only AppendResponse can bring it back to up. + // 2. 
If the peer is down, then HeartbeatResponse and AppendResponse can bring + // it back to up. + if self.any_new_peer_catch_up(from_peer_id) { + self.region_heartbeat_pd(ctx) + } + + self.set_has_ready(); + } + + /// Callback for fetching logs asynchronously. + pub fn on_logs_fetched(&mut self, fetched_logs: FetchedLogs) { + let FetchedLogs { context, logs } = fetched_logs; + let low = logs.low; + if !self.is_leader() { + self.entry_storage_mut().clean_async_fetch_res(low); + return; + } + if self.term() != logs.term { + self.entry_storage_mut().clean_async_fetch_res(low); + } else { + self.entry_storage_mut() + .update_async_fetch_res(low, Some(logs)); + } + self.raft_group_mut().on_entries_fetched(context); + // clean the async fetch result immediately if not used to free memory + self.entry_storage_mut().update_async_fetch_res(low, None); + self.set_has_ready(); + } + + /// Partially filled a raft message that will be sent to other peer. + fn prepare_raft_message(&mut self) -> RaftMessage { + let mut raft_msg = RaftMessage::new(); + raft_msg.set_region_id(self.region().id); + raft_msg.set_from_peer(self.peer().clone()); + // set current epoch + let epoch = self.storage().region().get_region_epoch(); + let msg_epoch = raft_msg.mut_region_epoch(); + msg_epoch.set_version(epoch.get_version()); + msg_epoch.set_conf_ver(epoch.get_conf_ver()); + raft_msg + } + + /// Transform a message from raft lib to a message that can be sent to other + /// peers. + /// + /// If the recipient can't be found, `None` is returned. 
+ #[inline] + fn build_raft_message(&mut self, msg: eraftpb::Message) -> Option { + let to_peer = match self.peer_from_cache(msg.to) { + Some(p) => p, + None => { + warn!(self.logger, "failed to look up recipient peer"; "to_peer" => msg.to, "message_type" => ?msg.msg_type); + return None; + } + }; + let to_peer_is_learner = to_peer.get_role() == metapb::PeerRole::Learner; + + let mut raft_msg = self.prepare_raft_message(); + + raft_msg.set_to_peer(to_peer); + if msg.from != self.peer().id { + debug!( + self.logger, + "redirecting message"; + "msg_type" => ?msg.get_msg_type(), + "from" => msg.get_from(), + "to" => msg.get_to(), + ); + } + + // Filling start and end key is only needed for being compatible with + // raftstore v1 learners (e.g. tiflash engine). + // + // There could be two cases: + // - Target peer already exists but has not established communication with + // leader yet + // - Target peer is added newly due to member change or region split, but it's + // not created yet + // For both cases the region start key and end key are attached in RequestVote + // and Heartbeat message for the store of that peer to check whether to create a + // new peer when receiving these messages, or just to wait for a pending region + // split to perform later. + if self.storage().is_initialized() && is_initial_msg(&msg) && to_peer_is_learner { + let region = self.region(); + raft_msg.set_start_key(region.get_start_key().to_vec()); + raft_msg.set_end_key(region.get_end_key().to_vec()); + } + + raft_msg.set_message(msg); + Some(raft_msg) + } + + /// Send a message. + /// + /// The message is pushed into the send buffer, it may not be sent out until + /// transport is flushed explicitly. 
+ pub(crate) fn send_raft_message( + &mut self, + ctx: &mut StoreContext, + msg: RaftMessage, + ) { + let msg_type = msg.get_message().get_msg_type(); + let to_peer_id = msg.get_to_peer().get_id(); + let to_store_id = msg.get_to_peer().get_store_id(); + if msg_type == MessageType::MsgSnapshot { + let index = msg.get_message().get_snapshot().get_metadata().get_index(); + self.update_last_sent_snapshot_index(index); + } + + trace!( + self.logger, + "send raft msg"; + "msg_type" => ?msg_type, + "msg_size" => msg.get_message().compute_size(), + "to" => to_peer_id, + ); + + match ctx.trans.send(msg) { + Ok(()) => ctx.raft_metrics.send_message.add(msg_type, true), + Err(e) => { + // We use metrics to observe failure on production. + debug!( + self.logger, + "failed to send msg to other peer"; + "target_peer_id" => to_peer_id, + "target_store_id" => to_store_id, + "err" => ?e, + "error_code" => %e.error_code(), + ); + // unreachable store + self.raft_group_mut().report_unreachable(to_peer_id); + if msg_type == eraftpb::MessageType::MsgSnapshot { + self.raft_group_mut() + .report_snapshot(to_peer_id, SnapshotStatus::Failure); + } + ctx.raft_metrics.send_message.add(msg_type, false); + } + } + } + + /// Send a message. + /// + /// The message is pushed into the send buffer, it may not be sent out until + /// transport is flushed explicitly. + fn send_raft_message_on_leader( + &mut self, + ctx: &mut StoreContext, + msg: RaftMessage, + ) { + let message = msg.get_message(); + if message.get_msg_type() == MessageType::MsgAppend + && let Some(fe) = message.get_entries().first() + && let Some(le) = message.get_entries().last() + { + let last = (le.get_term(), le.get_index()); + let first = (fe.get_term(), fe.get_index()); + let now = Instant::now(); + let queue = self.proposals_mut().queue_mut(); + // Proposals are batched up, so it will liely hit after one or two steps. 
+ for p in queue.iter_mut().rev() { + if p.sent { + break; + } + let cur = (p.term, p.index); + if cur > last { + continue; + } + if cur < first { + break; + } + for tracker in p.cb.write_trackers() { + tracker.observe(now, &ctx.raft_metrics.wf_send_proposal, |t| { + &mut t.metrics.wf_send_proposal_nanos + }); + } + p.sent = true; + } + } + if message.get_msg_type() == MessageType::MsgTimeoutNow { + // After a leader transfer procedure is triggered, the lease for + // the old leader may be expired earlier than usual, since a new leader + // may be elected and the old leader doesn't step down due to + // network partition from the new leader. + // For lease safety during leader transfer, transit `leader_lease` + // to suspect. + self.leader_lease_mut().suspect(monotonic_raw_now()); + } + self.send_raft_message(ctx, msg) + } + + fn handle_raft_committed_entries( + &mut self, + ctx: &mut crate::batch::StoreContext, + committed_entries: Vec, + ) { + // TODO: skip handling committed entries if a snapshot is being applied + // asynchronously. + let mut update_lease = self.is_leader(); + if update_lease { + for entry in committed_entries.iter().rev() { + self.compact_log_context_mut() + .add_log_size(entry.get_data().len() as u64); + if update_lease { + let propose_time = self + .proposals() + .find_propose_time(entry.get_term(), entry.get_index()); + if let Some(propose_time) = propose_time { + // We must renew current_time because this value may be created a long time + // ago. If we do not renew it, this time may be + // smaller than propose_time of a command, which was + // proposed in another thread while this thread receives its + // AppendEntriesResponse and is ready to calculate its commit-log-duration. 
+ let current_time = monotonic_raw_now(); + ctx.current_time.replace(current_time); + ctx.raft_metrics.commit_log.observe(duration_to_sec( + (current_time - propose_time).to_std().unwrap(), + )); + self.maybe_renew_leader_lease(propose_time, &ctx.store_meta, None); + update_lease = false; + } + } + } + } + let applying_index = committed_entries.last().unwrap().index; + let commit_to_current_term = committed_entries.last().unwrap().term == self.term(); + self.compact_log_context_mut() + .set_last_applying_index(applying_index); + if needs_evict_entry_cache(ctx.cfg.evict_cache_on_memory_ratio) { + // Compact all cached entries instead of half evict. + self.entry_storage_mut().evict_entry_cache(false); + } + self.schedule_apply_committed_entries(ctx, committed_entries); + if self.is_leader() + && commit_to_current_term + && !self.proposal_control().has_uncommitted_admin() + { + self.raft_group_mut().skip_bcast_commit(true); + } + } + + /// Processing the ready of raft. A detail description of how it's handled + /// can be found at https://docs.rs/raft/latest/raft/#processing-the-ready-state. + /// + /// It's should be called at the end of every round of processing. Any + /// writes will be handled asynchronously, and be notified once writes + /// are persisted. + #[inline] + pub fn handle_raft_ready(&mut self, ctx: &mut StoreContext) { + let has_ready = self.reset_has_ready(); + let has_extra_write = self.reset_has_extra_write(); + if !has_ready || self.destroy_progress().started() { + #[cfg(feature = "testexport")] + self.async_writer.notify_flush(); + return; + } + ctx.has_ready = true; + + if !has_extra_write + && !self.has_pending_messages() + && !self.raft_group().has_ready() + && (self.serving() || self.postponed_destroy()) + { + self.maybe_schedule_gen_snapshot(); + #[cfg(feature = "testexport")] + self.async_writer.notify_flush(); + return; + } + + // Note even the group has no ready, we can still get an empty ready. 
+ + debug!(self.logger, "handle raft ready"); + + let mut ready = self.raft_group_mut().ready(); + // Update it after unstable entries pagination is introduced. + debug_assert!(ready.entries().last().map_or_else( + || true, + |entry| entry.index == self.raft_group().raft.raft_log.last_index() + )); + + self.on_role_changed(ctx, &ready); + + if let Some(hs) = ready.hs() { + let prev_commit_index = self.entry_storage().commit_index(); + assert!( + hs.get_commit() >= prev_commit_index, + "{} {:?} {}", + SlogFormat(&self.logger), + hs, + prev_commit_index + ); + if self.is_leader() && hs.get_commit() > prev_commit_index { + self.on_leader_commit_index_changed(hs.get_commit()); + } + } + + if !ready.messages().is_empty() { + debug_assert!(self.is_leader()); + for msg in ready.take_messages() { + if let Some(msg) = self.build_raft_message(msg) { + self.send_raft_message_on_leader(ctx, msg); + } + } + if self.has_pending_messages() { + for msg in self.take_pending_messages() { + self.send_raft_message_on_leader(ctx, msg); + } + } + } + + self.apply_reads(ctx, &ready); + if !ready.committed_entries().is_empty() { + self.handle_raft_committed_entries(ctx, ready.take_committed_entries()); + } + + self.maybe_schedule_gen_snapshot(); + + let ready_number = ready.number(); + let mut write_task = WriteTask::new(self.region_id(), self.peer_id(), ready_number); + self.report_send_to_queue_duration(ctx, &mut write_task, ready.entries()); + let prev_persisted = self.storage().apply_trace().persisted_apply_index(); + self.merge_state_changes_to(&mut write_task); + self.storage_mut() + .handle_raft_ready(ctx, &mut ready, &mut write_task); + self.try_compelete_recovery(); + self.on_advance_persisted_apply_index(ctx, prev_persisted, &mut write_task); + + if !ready.persisted_messages().is_empty() { + write_task.messages = ready + .take_persisted_messages() + .into_iter() + .flat_map(|m| self.build_raft_message(m)) + .collect(); + } + if self.has_pending_messages() { + if 
write_task.messages.is_empty() { + write_task.messages = self.take_pending_messages(); + } else { + write_task + .messages + .append(&mut self.take_pending_messages()); + } + } + if !self.serving() { + self.start_destroy(ctx, &mut write_task); + if self.persisted_index() != 0 { + ctx.coprocessor_host.on_region_changed( + self.region(), + RegionChangeEvent::Destroy, + self.raft_group().raft.state, + ); + } + } + // Ready number should increase monotonically. + assert!(self.async_writer.known_largest_number() < ready.number()); + if let Some(task) = self.async_writer.write(ctx, write_task) { + // So the task doesn't need to be process asynchronously, directly advance. + let mut light_rd = self.raft_group_mut().advance_append(ready); + if !task.messages.is_empty() { + for m in task.messages { + self.send_raft_message(ctx, m); + } + } + if !light_rd.messages().is_empty() || light_rd.commit_index().is_some() { + slog_panic!( + self.logger, + "unexpected messages"; + "messages_count" => ?light_rd.messages().len(), + "commit_index" => ?light_rd.commit_index() + ); + } + if !light_rd.committed_entries().is_empty() { + self.handle_raft_committed_entries(ctx, light_rd.take_committed_entries()); + } + } else { + // The task will be written asynchronously. Once it's persisted, it will be + // notified by `on_persisted`. + self.raft_group_mut().advance_append_async(ready); + } + + ctx.raft_metrics.ready.has_ready_region.inc(); + #[cfg(feature = "testexport")] + self.async_writer.notify_flush(); + } + + /// Called when an asynchronously write finishes. 
pub fn on_persisted(
    &mut self,
    ctx: &mut StoreContext,
    peer_id: u64,
    ready_number: u64,
) {
    // A notification addressed to a different peer id is logged and ignored.
    if peer_id != self.peer_id() {
        error!(self.logger, "peer id not matched"; "persisted_peer_id" => peer_id, "persisted_number" => ready_number);
        return;
    }
    let (persisted_message, has_snapshot) =
        self.async_writer
            .on_persisted(ctx, ready_number, &self.logger);
    // Messages that were held back until their write got persisted.
    for msgs in persisted_message {
        for msg in msgs {
            self.send_raft_message(ctx, msg);
        }
    }

    // Sample persisted/committed indexes around `on_persist_ready` so the
    // waterfall metrics below only cover what this notification advanced.
    let persisted_number = self.async_writer.persisted_number();
    let pre_persisted_index = self.persisted_index();
    let pre_committed_index = self.raft_group().raft.raft_log.committed;
    self.raft_group_mut().on_persist_ready(persisted_number);
    let persisted_index = self.persisted_index();
    let committed_index = self.raft_group().raft.raft_log.committed;
    self.report_persist_log_duration(ctx, pre_persisted_index, persisted_index);
    self.report_commit_log_duration(ctx, pre_committed_index, committed_index);
    // The apply snapshot process order would be:
    // - Get the snapshot from the ready
    // - Wait for async writer to load this tablet
    // In this step, the snapshot loading has been finished, but some apply
    // state need to update.
    if has_snapshot {
        self.on_applied_snapshot(ctx);
    }

    self.storage_mut()
        .entry_storage_mut()
        .update_cache_persisted(persisted_index);
    if !self.destroy_progress().started() {
        // We may need to check if there is persisted committed logs.
        self.set_has_ready();
    } else if self.async_writer.all_ready_persisted() {
        // Destroy ready is the last ready. All readies are persisted means destroy
        // is persisted.
        self.finish_destroy(ctx);
    }
}

/// Record the "persist log" waterfall metric for proposals whose index lies
/// in `(old_index, new_index]`.
///
/// Does nothing when waterfall metrics are disabled, when there are no
/// proposals, or when the index did not advance.
#[inline]
fn report_persist_log_duration(
    &self,
    ctx: &mut StoreContext,
    old_index: u64,
    new_index: u64,
) {
    if !ctx.cfg.waterfall_metrics || self.proposals().is_empty() || old_index >= new_index {
        return;
    }
    let now = Instant::now();
    for i in old_index + 1..=new_index {
        if let Some((term, trackers)) = self.proposals().find_trackers(i) {
            // Only observe when the stored entry at `i` has the proposal's term.
            if self.entry_storage().term(i).map_or(false, |t| t == term) {
                for tracker in trackers {
                    tracker.observe(now, &ctx.raft_metrics.wf_persist_log, |t| {
                        &mut t.metrics.wf_persist_log_nanos
                    });
                }
            }
        }
    }
}

/// Record the "commit log" waterfall metric for proposals whose index lies
/// in `(old_index, new_index]`.
///
/// Entries committed before being persisted locally are observed in a
/// separate histogram and flagged through `commit_not_persisted`.
#[inline]
fn report_commit_log_duration(
    &self,
    ctx: &mut StoreContext,
    old_index: u64,
    new_index: u64,
) {
    if !ctx.cfg.waterfall_metrics || self.proposals().is_empty() || old_index >= new_index {
        return;
    }
    let now = Instant::now();
    for i in old_index + 1..=new_index {
        if let Some((term, trackers)) = self.proposals().find_trackers(i) {
            if self.entry_storage().term(i).map_or(false, |t| t == term) {
                let commit_persisted = i <= self.persisted_index();
                let hist = if commit_persisted {
                    &ctx.raft_metrics.wf_commit_log
                } else {
                    &ctx.raft_metrics.wf_commit_not_persist_log
                };
                for tracker in trackers {
                    tracker.observe(now, hist, |t| {
                        t.metrics.commit_not_persisted = !commit_persisted;
                        &mut t.metrics.wf_commit_log_nanos
                    });
                }
            }
        }
    }
}

/// Record the "send to queue" waterfall metric for `entries` and attach the
/// matching trackers to `write_task`.
#[inline]
fn report_send_to_queue_duration(
    &mut self,
    ctx: &mut StoreContext,
    write_task: &mut WriteTask,
    entries: &[raft::eraftpb::Entry],
) {
    if !ctx.cfg.waterfall_metrics || self.proposals().is_empty() {
        return;
    }
    let now = Instant::now();
    for entry in entries {
        if let Some((term, trackers)) = self.proposals().find_trackers(entry.index) {
            if entry.term == term {
                for tracker in trackers {
                    write_task.trackers.push(*tracker);
                    tracker.observe(now, &ctx.raft_metrics.wf_send_to_queue, |t| {
                        &mut t.metrics.wf_send_to_queue_nanos
                    });
                }
            }
        }
    }
}

/// Register `ch` with the async writer's flush subscription (test builds only).
#[cfg(feature = "testexport")]
pub fn on_wait_flush(&mut self, ch: crate::router::FlushChannel) {
    self.async_writer.subscirbe_flush(ch);
}

/// React to a role change carried by `ready`.
///
/// Does nothing unless the ready contains a soft state.
pub fn on_role_changed(&mut self, ctx: &mut StoreContext, ready: &Ready) {
    // Update leader lease when the Raft state changes.
    if let Some(ss) = ready.ss() {
        let term = self.term();
        match ss.raft_state {
            StateRole::Leader => {
                // The local read can only be performed after a new leader has applied
                // the first empty entry on its term. After that the lease expiring time
                // should be updated to
                //   send_to_quorum_ts + max_lease
                // as the comments in `Lease` explain.
                // It is recommended to update the lease expiring time right after
                // this peer becomes leader because it's more convenient to do it here and
                // it has no impact on the correctness.
                let progress_term = ReadProgress::term(term);
                self.maybe_renew_leader_lease(
                    monotonic_raw_now(),
                    &ctx.store_meta,
                    Some(progress_term),
                );
                debug!(
                    self.logger,
                    "becomes leader with lease";
                    "lease" => ?self.leader_lease(),
                );
                // If the predecessor reads index during transferring leader and receives
                // quorum's heartbeat response after that, it may wait for applying to
                // current term to apply the read. So broadcast eagerly to avoid unexpected
                // latency.
                self.raft_group_mut().skip_bcast_commit(false);
                self.update_last_sent_snapshot_index(
                    self.raft_group().raft.raft_log.last_index(),
                );

                self.txn_context().on_became_leader(
                    ctx,
                    self.term(),
                    self.region(),
                    &self.logger,
                );

                // Exit entry cache warmup state when the peer becomes leader.
                self.entry_storage_mut().clear_entry_cache_warmup_state();

                self.region_heartbeat_pd(ctx);
                self.add_pending_tick(PeerTick::CompactLog);
                self.add_pending_tick(PeerTick::SplitRegionCheck);
                self.add_pending_tick(PeerTick::CheckLongUncommitted);
                self.add_pending_tick(PeerTick::ReportBuckets);
                self.maybe_schedule_gc_peer_tick();
            }
            StateRole::Follower => {
                // A follower holds no lease and abandons any snapshot generation.
                self.leader_lease_mut().expire();
                self.storage_mut().cancel_generating_snap(None);
                self.txn_context()
                    .on_became_follower(self.term(), self.region());
                self.update_merge_progress_on_became_follower();
            }
            _ => {}
        }
        // Done for every role: publish leader info and notify coprocessors.
        self.read_progress()
            .update_leader_info(ss.leader_id, term, self.region());
        let target = self.refresh_leader_transferee();
        ctx.coprocessor_host.on_role_change(
            self.region(),
            RoleChange {
                state: ss.raft_state,
                leader_id: ss.leader_id,
                prev_lead_transferee: target,
                vote: self.raft_group().raft.vote,
                initialized: self.storage().is_initialized(),
                peer_id: self.peer().get_id(),
            },
        );
        self.proposal_control_mut().maybe_update_term(term);
    }
}

/// If leader commits new admin commands, it may break lease assumption. So
/// we need to cancel lease whenever necessary.
///
/// Note this method should be called before sending out any messages.
fn on_leader_commit_index_changed(&mut self, commit_index: u64) {
    let mut committed_prepare_merge = false;
    self.proposal_control_mut().commit_to(commit_index, |cmd| {
        committed_prepare_merge |= cmd.cmd_type() == AdminCmdType::PrepareMerge
    });
    // There are two types of operations that will change the ownership of a range:
    // split and merge.
    //
    // - For split, after the split command is committed, it's
    // possible that the same range is governed by different region on different
    // nodes due to different apply progress. But because only the peers on the
    // same node as old leader will campaign despite election timeout, so there
    // will be no modification to the overlapped range until either the original
    // leader apply the split command or an election timeout is passed since split
    // is committed. We already forbid renewing lease after committing split, and
    // original leader will update the reader delegate with latest epoch after
    // applying split before the split peer starts campaign, so what needs to be
    // done are 1. mark split is committed, which is done by `commit_to` above,
    // 2. make sure split result is invisible until epoch is updated or reader may
    // miss data from the new tablet. This is done by always publish tablet in
    // `on_apply_res_split`. So it's correct to allow local read during split.
    //
    // - For merge, after the prepare merge command is committed, the target peers
    // may apply commit merge at any time, so we need to forbid any type of read
    // to avoid missing the modifications from target peers.
    if committed_prepare_merge {
        // After prepare_merge is committed and the leader broadcasts commit
        // index to followers, the leader can not know when the target region
        // merges majority of this region, also it can not know when the target
        // region writes new values.
        // To prevent unsafe local read, we suspect its leader lease.
        self.leader_lease_mut().suspect(monotonic_raw_now());
        // Stop updating `safe_ts`
        self.read_progress_mut().discard();
    }
}

/// Check if there is long uncommitted proposal.
///
/// This will increase the threshold when a long uncommitted proposal is
/// detected, and reset the threshold when there is no long uncommitted
/// proposal.
+ fn has_long_uncommitted_proposals(&mut self, ctx: &mut StoreContext) -> bool { + let mut has_long_uncommitted = false; + let base_threshold = ctx.cfg.long_uncommitted_base_threshold.0; + if let Some(propose_time) = self.proposals().oldest().and_then(|p| p.propose_time) { + // When a proposal was proposed with this ctx before, the current_time can be + // some. + let current_time = *ctx.current_time.get_or_insert_with(monotonic_raw_now); + let elapsed = match (current_time - propose_time).to_std() { + Ok(elapsed) => elapsed, + Err(_) => return false, + }; + // Increase the threshold for next turn when a long uncommitted proposal is + // detected. + let threshold = self.long_uncommitted_threshold(); + if elapsed >= threshold { + has_long_uncommitted = true; + self.set_long_uncommitted_threshold(threshold + base_threshold); + } else if elapsed < base_threshold { + self.set_long_uncommitted_threshold(base_threshold); + } + } else { + self.set_long_uncommitted_threshold(base_threshold); + } + has_long_uncommitted + } + + fn check_long_uncommitted_proposals(&mut self, ctx: &mut StoreContext) { + if self.has_long_uncommitted_proposals(ctx) { + let status = self.raft_group().status(); + let mut buffer: Vec<(u64, u64, u64)> = Vec::new(); + if let Some(prs) = status.progress { + for (id, p) in prs.iter() { + buffer.push((*id, p.commit_group_id, p.matched)); + } + } + warn!( + self.logger, + "found long uncommitted proposals"; + "progress" => ?buffer, + "cache_first_index" => ?self.entry_storage().entry_cache_first_index(), + "next_turn_threshold" => ?self.long_uncommitted_threshold(), + ); + } + } +} + +impl Storage { + /// Apply the ready to the storage. If there is any states need to be + /// persisted, it will be written to `write_task`. 
fn handle_raft_ready(
    &mut self,
    ctx: &mut StoreContext,
    ready: &mut Ready,
    write_task: &mut WriteTask,
) {
    // Captured up front to decide later whether the raft state must be
    // written again.
    let prev_raft_state = self.entry_storage().raft_state().clone();
    let prev_ever_persisted = self.ever_persisted();

    if !ready.snapshot().is_empty() {
        // A failed snapshot apply is counted and logged; processing continues.
        if let Err(e) = self.apply_snapshot(
            ready.snapshot(),
            write_task,
            &ctx.snap_mgr,
            &ctx.tablet_registry,
            ctx.key_manager.as_ref(),
        ) {
            SNAP_COUNTER.apply.fail.inc();
            error!(self.logger(),"failed to apply snapshot";"error" => ?e)
        }
    }

    if !ready.entries().is_empty() {
        assert!(self.ever_persisted(), "{}", SlogFormat(self.logger()));
        self.entry_storage_mut()
            .append(ready.take_entries(), write_task);
    }
    if let Some(hs) = ready.hs() {
        self.entry_storage_mut()
            .raft_state_mut()
            .set_hard_state(hs.clone());
    }
    let entry_storage = self.entry_storage();
    // Write the raft state on the very first persist or whenever it changed.
    if !prev_ever_persisted || prev_raft_state != *entry_storage.raft_state() {
        write_task.raft_state = Some(entry_storage.raft_state().clone());
    }
    // If snapshot initializes the peer (in `apply_snapshot`), we don't need to
    // write apply trace again.
    if !self.ever_persisted() {
        let region_id = self.region().get_id();
        let raft_engine = entry_storage.raft_engine();
        if write_task.raft_wb.is_none() {
            write_task.raft_wb = Some(raft_engine.log_batch(64));
        }
        let wb = write_task.raft_wb.as_mut().unwrap();
        // There may be tombstone key from last peer.
        raft_engine
            .clean(region_id, 0, entry_storage.raft_state(), wb)
            .unwrap_or_else(|e| {
                slog_panic!(self.logger(), "failed to clean up region"; "error" => ?e);
            });
        self.init_apply_trace(write_task);
        self.set_ever_persisted();
    }
    if self.apply_trace().should_persist() {
        self.record_apply_trace(write_task);
    }
}
}
diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs
new file mode 100644
index 00000000000..5547df7d580
--- /dev/null
+++ b/components/raftstore-v2/src/operation/ready/snapshot.rs
@@ -0,0 +1,683 @@
// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0.

//! This module contains snapshot relative processing logic.
//!
//! # Snapshot State
//!
//! generator and apply snapshot works asynchronously. the snap_state indicates
//! the current snapshot state.
//!
//! # Process Overview
//!
//! generate snapshot:
//! - Raft call `snapshot` interface to acquire a snapshot, then storage setup
//!   the gen_snap_task.
//! - handle ready will send the gen_snap_task to the apply work
//! - apply worker schedule a gen tablet snapshot task to async read worker with
//!   region state and apply state.
//! - async read worker generates the tablet snapshot and sends the result to
//!   peer fsm, then Raft will get the snapshot.
+ +use std::{ + assert_matches::assert_matches, + fmt::{self, Debug}, + fs, + path::{Path, PathBuf}, + sync::{ + atomic::{AtomicBool, AtomicU64, Ordering}, + Arc, + }, +}; + +use encryption_export::DataKeyManager; +use engine_traits::{ + EncryptionKeyManager, KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, + ALL_CFS, +}; +use kvproto::raft_serverpb::{PeerState, RaftSnapshotData}; +use protobuf::Message; +use raft::{eraftpb::Snapshot, StateRole}; +use raftstore::{ + coprocessor::RegionChangeEvent, + store::{ + metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, worker_metrics::SNAP_COUNTER, + GenSnapRes, ReadTask, TabletSnapKey, TabletSnapManager, Transport, WriteTask, + RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, + }, +}; +use slog::{debug, error, info, warn}; +use tikv_util::{box_err, log::SlogFormat, slog_panic}; + +use crate::{ + fsm::ApplyResReporter, + operation::{command::temp_split_path, SharedReadTablet}, + raft::{Apply, Peer, Storage}, + router::ApplyTask, + worker::tablet, + Result, StoreContext, +}; + +#[derive(Debug)] +pub enum SnapState { + Relax, + Generating { + canceled: Arc, + index: Arc, + }, + Generated(Box), +} + +impl PartialEq for SnapState { + fn eq(&self, other: &SnapState) -> bool { + match (self, other) { + (SnapState::Relax, SnapState::Relax) + | (SnapState::Generating { .. }, SnapState::Generating { .. }) => true, + (SnapState::Generated(snap1), SnapState::Generated(snap2)) => *snap1 == *snap2, + _ => false, + } + } +} + +pub struct GenSnapTask { + region_id: u64, + // The snapshot will be sent to the peer. + to_peer: u64, + // Fill it when you are going to generate the snapshot. + // index used to check if the gen task should be canceled. + index: Arc, + // Set it to true to cancel the task if necessary. 
+ canceled: Arc, + // indicates whether the snapshot is triggered due to load balance + for_balance: bool, +} + +impl GenSnapTask { + pub fn new( + region_id: u64, + to_peer: u64, + index: Arc, + canceled: Arc, + ) -> GenSnapTask { + GenSnapTask { + region_id, + to_peer, + index, + canceled, + for_balance: false, + } + } + + pub fn set_for_balance(&mut self) { + self.for_balance = true; + } + + pub fn to_peer(&self) -> u64 { + self.to_peer + } +} + +impl Debug for GenSnapTask { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("GenSnapTask") + .field("region_id", &self.region_id) + .finish() + } +} + +pub fn recv_snap_path( + snap_mgr: &TabletSnapManager, + region_id: u64, + peer_id: u64, + term: u64, + index: u64, +) -> PathBuf { + let key = TabletSnapKey::new(region_id, peer_id, term, index); + snap_mgr.final_recv_path(&key) +} + +/// Move the tablet from `source` to managed path. +/// +/// Returns false if `source` doesn't exist. +pub fn install_tablet( + registry: &TabletRegistry, + key_manager: Option<&DataKeyManager>, + source: &Path, + region_id: u64, + tablet_index: u64, +) -> bool { + if !source.exists() { + return false; + } + let target_path = registry.tablet_path(region_id, tablet_index); + assert_matches!( + EK::locked(source.to_str().unwrap()), + Ok(false), + "source is locked: {} => {}", + source.display(), + target_path.display() + ); + if let Some(m) = &key_manager { + m.link_file(source.to_str().unwrap(), target_path.to_str().unwrap()) + .unwrap(); + } + if let Err(e) = fs::rename(source, &target_path) { + if let Some(m) = &key_manager { + m.delete_file(target_path.to_str().unwrap()).unwrap(); + } + panic!( + "failed to rename tablet {} => {}: {:?}", + source.display(), + target_path.display(), + e + ); + } + if let Some(m) = &key_manager { + m.delete_file(source.to_str().unwrap()).unwrap(); + } + true +} + +impl Peer { + /// Check whether there is a pending generate snapshot task, the task + /// needs to be sent to the 
apply system. + /// Always sending snapshot task after apply task, so it gets latest + /// snapshot. + #[inline] + pub fn maybe_schedule_gen_snapshot(&mut self) { + if let Some(gen_task) = self.storage_mut().take_gen_snap_task() { + self.apply_scheduler() + .unwrap() + .send(ApplyTask::Snapshot(gen_task)); + } + } + + pub fn on_snapshot_generated(&mut self, snapshot: GenSnapRes) { + if self.storage_mut().on_snapshot_generated(snapshot) { + self.raft_group_mut().ping(); + self.set_has_ready(); + } + } + + pub fn on_snapshot_sent(&mut self, to_peer_id: u64, status: raft::SnapshotStatus) { + let to_peer = match self.peer_from_cache(to_peer_id) { + Some(peer) => peer, + None => { + // If to_peer is gone, ignore this snapshot status + warn!( + self.logger, + "peer not found, ignore snapshot status"; + "to_peer_id" => to_peer_id, + "status" => ?status, + ); + return; + } + }; + info!( + self.logger, + "report snapshot status"; + "to" => ?to_peer, + "status" => ?status, + ); + self.raft_group_mut().report_snapshot(to_peer_id, status); + } + + pub fn on_applied_snapshot(&mut self, ctx: &mut StoreContext) { + ctx.coprocessor_host.on_region_changed( + self.region(), + RegionChangeEvent::Create, + StateRole::Follower, + ); + let persisted_index = self.persisted_index(); + self.compact_log_context_mut() + .set_last_applying_index(persisted_index); + let snapshot_index = self.entry_storage().truncated_index(); + assert!(snapshot_index >= RAFT_INIT_LOG_INDEX, "{:?}", self.logger); + // If leader sends a message append to the follower while it's applying + // snapshot (via split init for example), the persisted_index may be larger + // than the first index. But as long as first index is not larger, the + // latest snapshot should be applied. 
+ if snapshot_index <= persisted_index { + let region_id = self.region_id(); + self.reset_flush_state(snapshot_index); + let flush_state = self.flush_state().clone(); + let mut tablet_ctx = TabletContext::new(self.region(), Some(snapshot_index)); + // Use a new FlushState to avoid conflicts with the old one. + tablet_ctx.flush_state = Some(flush_state); + let path = ctx.tablet_registry.tablet_path(region_id, snapshot_index); + assert!( + path.exists(), + "{} {} not exists", + SlogFormat(&self.logger), + path.display() + ); + let tablet = ctx + .tablet_registry + .tablet_factory() + .open_tablet(tablet_ctx, &path) + .unwrap_or_else(|e| { + slog_panic!( + self.logger, + "failed to load tablet"; + "path" => path.display(), + "error" => ?e + ); + }); + + self.storage_mut().on_applied_snapshot(); + self.raft_group_mut().advance_apply_to(snapshot_index); + if self.proposal_control().is_merging() { + // After applying a snapshot, merge is rollbacked implicitly. + // TODO: self.rollback_merge(ctx); + } + let read_tablet = SharedReadTablet::new(tablet.clone()); + { + let mut meta = ctx.store_meta.lock().unwrap(); + meta.set_region(self.region(), true, &self.logger); + meta.readers + .insert(region_id, (self.generate_read_delegate(), read_tablet)); + meta.region_read_progress + .insert(region_id, self.read_progress().clone()); + } + if let Some(tablet) = self.set_tablet(tablet) { + self.record_tombstone_tablet(ctx, tablet, snapshot_index); + } + self.read_progress_mut().update_applied_core(snapshot_index); + let split = self.storage_mut().split_init_mut().take(); + if split.as_ref().map_or(true, |s| { + !s.scheduled || snapshot_index != RAFT_INIT_LOG_INDEX + }) { + info!(self.logger, "apply tablet snapshot completely"); + SNAP_COUNTER.apply.success.inc(); + } + if let Some(init) = split { + info!(self.logger, "init split with snapshot finished"); + self.post_split_init(ctx, init); + } + self.schedule_apply_fsm(ctx); + if self.remove_tombstone_tablets(snapshot_index) { + let 
_ = ctx + .schedulers + .tablet + .schedule(tablet::Task::destroy(region_id, snapshot_index)); + } + } + } +} + +impl Apply { + /// Handle snapshot. + /// + /// Will schedule a task to read worker and then generate a snapshot + /// asynchronously. + pub fn schedule_gen_snapshot(&mut self, snap_task: GenSnapTask) { + debug!(self.logger, "scheduling snapshot"; "task" => ?snap_task); + // Do not generate, the peer is removed. + if self.tombstone() { + snap_task.canceled.store(true, Ordering::SeqCst); + error!( + self.logger, + "cancel generating snapshot because it's already destroyed"; + ); + return; + } + // Flush before do snapshot. + if snap_task.canceled.load(Ordering::SeqCst) { + return; + } + self.flush(); + + // Send generate snapshot task to region worker. + let (last_applied_index, last_applied_term) = self.apply_progress(); + snap_task.index.store(last_applied_index, Ordering::SeqCst); + let gen_tablet_sanp_task = ReadTask::GenTabletSnapshot { + region_id: snap_task.region_id, + to_peer: snap_task.to_peer, + tablet: self.tablet().clone(), + region_state: self.region_state().clone(), + last_applied_term, + last_applied_index, + for_balance: snap_task.for_balance, + canceled: snap_task.canceled.clone(), + }; + if let Err(e) = self.read_scheduler().schedule(gen_tablet_sanp_task) { + error!( + self.logger, + "schedule snapshot failed"; + "error" => ?e, + ); + snap_task.canceled.store(true, Ordering::SeqCst); + } + } +} + +impl Storage { + pub fn is_generating_snapshot(&self) -> bool { + let snap_states = self.snap_states.borrow_mut(); + for (_, state) in snap_states.iter() { + if matches!(*state, SnapState::Generating { .. }) { + return true; + } + } + false + } + + /// Gets a snapshot. Returns `SnapshotTemporarilyUnavailable` if there is no + /// unavailable snapshot. 
+ pub fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { + if let Some(state) = self.snap_states.borrow_mut().get_mut(&to) { + match state { + SnapState::Generating { ref canceled, .. } => { + if canceled.load(Ordering::SeqCst) { + self.cancel_generating_snap(Some(to)); + } else { + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); + } + } + SnapState::Generated(ref s) => { + let snap = *s.clone(); + *state = SnapState::Relax; + if self.validate_snap(&snap, request_index) { + return Ok(snap); + } + } + _ => {} + }; + } + + if self.has_dirty_data() { + info!(self.logger(), "delay generating snapshot as there are still dirty data"; "request_index" => request_index, "request_peer" => to); + // It's OK to delay. If there are still dirty data, it means the tablet is just + // split. In normal cases, all peers will apply split, so reject generates + // snapshot may actually good for all peers as they are more likely + // to be initialized by split. + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); + } else { + info!( + self.logger(), + "requesting snapshot"; + "request_index" => request_index, + "request_peer" => to, + ); + } + let canceled = Arc::new(AtomicBool::new(false)); + let index = Arc::new(AtomicU64::new(0)); + let mut gen_snap_task = self.gen_snap_task_mut(); + if gen_snap_task.is_none() { + self.snap_states.borrow_mut().insert( + to, + SnapState::Generating { + canceled: canceled.clone(), + index: index.clone(), + }, + ); + let task = GenSnapTask::new(self.region().get_id(), to, index, canceled); + *gen_snap_task = Box::new(Some(task)); + } + Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )) + } + + /// Validate the snapshot. Returns true if it's valid. 
+ fn validate_snap(&self, snap: &Snapshot, request_index: u64) -> bool { + let idx = snap.get_metadata().get_index(); + if idx < RAFT_INIT_LOG_INDEX || snap.get_metadata().get_term() < RAFT_INIT_LOG_TERM { + info!( + self.logger(), + "corrupted snapshot detected, generate again"; + "snap" => ?snap, + "request_index" => request_index, + ); + return false; + } + // TODO(nolouch): check tuncated index + if idx < request_index { + // stale snapshot, should generate again. + info!( + self.logger(), + "snapshot is stale, generate again"; + "snap_index" => idx, + "request_index" => request_index, + ); + STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.stale.inc(); + return false; + } + + let mut snap_data = RaftSnapshotData::default(); + if let Err(e) = snap_data.merge_from_bytes(snap.get_data()) { + error!( + self.logger(), + "failed to decode snapshot, it may be corrupted"; + "err" => ?e, + ); + STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.decode.inc(); + return false; + } + let snap_epoch = snap_data.get_region().get_region_epoch(); + let latest_epoch = self.region().get_region_epoch(); + if snap_epoch.get_conf_ver() < latest_epoch.get_conf_ver() { + info!( + self.logger(), + "snapshot epoch is stale"; + "snap_epoch" => ?snap_epoch, + "latest_epoch" => ?latest_epoch, + ); + STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.epoch.inc(); + return false; + } + + true + } + + pub fn cancel_generating_snap(&self, to_peer: Option) { + if let Some(id) = to_peer { + let mut states = self.snap_states.borrow_mut(); + if let Some(state) = states.get(&id) + && matches!(*state, SnapState::Generating { .. 
}) + { + info!( + self.logger(), + "snapshot is canceled"; + "to_peer" => to_peer, + ); + self.cancel_snap_task(to_peer); + states.remove(&id); + } + } else { + self.cancel_snap_task(to_peer); + self.snap_states.borrow_mut().clear(); + } + STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.cancel.inc(); + } + + pub fn cancel_generating_snap_due_to_compacted(&self, compact_to: u64) { + let mut states = self.snap_states.borrow_mut(); + states.retain(|id, state| { + let SnapState::Generating { + ref index, + .. + } = *state else { return true; }; + let snap_index = index.load(Ordering::SeqCst); + if snap_index == 0 || compact_to <= snap_index + 1 { + return true; + } + info!( + self.logger(), + "snapshot is canceled"; + "compact_to" => compact_to, + "to_peer" => id, + ); + self.cancel_snap_task(Some(*id)); + STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.cancel.inc(); + false + }); + } + + /// Try to switch snap state to generated. only `Generating` can switch to + /// `Generated`. + /// TODO: make the snap state more clearer, the snapshot must be consumed. + pub fn on_snapshot_generated(&self, res: GenSnapRes) -> bool { + if res.is_none() { + self.cancel_generating_snap(None); + return false; + } + let (snapshot, to_peer_id) = *res.unwrap(); + if let Some(state) = self.snap_states.borrow_mut().get_mut(&to_peer_id) { + let SnapState::Generating { + ref index, + .. 
+ } = *state else { return false }; + if snapshot.get_metadata().get_index() < index.load(Ordering::SeqCst) { + warn!( + self.logger(), + "snapshot is staled, skip"; + "snap index" => snapshot.get_metadata().get_index(), + "required index" => index.load(Ordering::SeqCst), + "to_peer_id" => to_peer_id, + ); + return false; + } + *state = SnapState::Generated(Box::new(snapshot)); + } + true + } + + pub fn on_applied_snapshot(&mut self) { + let entry = self.entry_storage_mut(); + let term = entry.truncated_term(); + let index = entry.truncated_index(); + entry.set_applied_term(term); + entry.apply_state_mut().set_applied_index(index); + self.apply_trace_mut().on_applied_snapshot(index); + } + + pub fn apply_snapshot( + &mut self, + snap: &Snapshot, + task: &mut WriteTask, + snap_mgr: &TabletSnapManager, + reg: &TabletRegistry, + key_manager: Option<&Arc>, + ) -> Result<()> { + let region_id = self.region().get_id(); + let peer_id = self.peer().get_id(); + info!( + self.logger(), + "begin to apply snapshot"; + ); + + let mut snap_data = RaftSnapshotData::default(); + snap_data.merge_from_bytes(snap.get_data())?; + let region = snap_data.take_region(); + let removed_records = snap_data.take_removed_records(); + let merged_records = snap_data.take_merged_records(); + if region.get_id() != region_id { + return Err(box_err!( + "mismatch region id {}!={}", + region_id, + region.get_id() + )); + } + + let old_last_index = self.entry_storage().last_index(); + if self.entry_storage().first_index() <= old_last_index { + // All states are rewritten in the following blocks. Stale states will be + // cleaned up by compact worker. Have to use raft write batch here becaue + // raft log engine expects deletes before writes. 
+ let raft_engine = self.entry_storage().raft_engine(); + if task.raft_wb.is_none() { + task.raft_wb = Some(raft_engine.log_batch(64)); + } + let wb = task.raft_wb.as_mut().unwrap(); + raft_engine + .clean(region.get_id(), 0, self.entry_storage().raft_state(), wb) + .unwrap_or_else(|e| { + slog_panic!( + self.logger(), + "failed to clean up region"; + "error" => ?e + ) + }); + self.entry_storage_mut().clear(); + } + + let last_index = snap.get_metadata().get_index(); + let last_term = snap.get_metadata().get_term(); + assert!( + last_index >= RAFT_INIT_LOG_INDEX && last_term >= RAFT_INIT_LOG_TERM, + "{}", + SlogFormat(self.logger()) + ); + let mut region_state = self.region_state().clone(); + region_state.set_state(PeerState::Normal); + region_state.set_region(region); + region_state.set_removed_records(removed_records); + region_state.set_merged_records(merged_records); + region_state.set_tablet_index(last_index); + // We need set_region_state here to update the peer. + self.set_region_state(region_state); + + let entry_storage = self.entry_storage_mut(); + entry_storage.raft_state_mut().set_last_index(last_index); + entry_storage.set_truncated_index(last_index); + entry_storage.set_truncated_term(last_term); + entry_storage.set_last_term(last_term); + + self.apply_trace_mut().restore_snapshot(last_index); + self.set_ever_persisted(); + let lb = task + .extra_write + .ensure_v2(|| self.entry_storage().raft_engine().log_batch(3)); + lb.put_apply_state(region_id, last_index, self.apply_state()) + .unwrap(); + lb.put_region_state(region_id, last_index, self.region_state()) + .unwrap(); + // We assume there should be flush records in all CFs. Skip any CF here may + // break the constraint. + for cf in ALL_CFS { + lb.put_flushed_index(region_id, cf, last_index, last_index) + .unwrap(); + } + + let (path, clean_split) = match self.split_init_mut() { + // If index not match, the peer may accept a newer snapshot after split. 
+ Some(init) if init.scheduled && last_index == RAFT_INIT_LOG_INDEX => { + lb.put_dirty_mark(region_id, last_index, true).unwrap(); + self.set_has_dirty_data(true); + (temp_split_path(reg, region_id), false) + } + si => ( + recv_snap_path(snap_mgr, region_id, peer_id, last_term, last_index), + si.is_some(), + ), + }; + + let logger = self.logger().clone(); + // The snapshot require no additional processing such as ingest them to DB, but + // it should load it into the factory after it persisted. + let reg = reg.clone(); + let key_manager = key_manager.cloned(); + let hook = move || { + if !install_tablet(®, key_manager.as_deref(), &path, region_id, last_index) { + slog_panic!( + logger, + "failed to install tablet"; + "path" => %path.display(), + "tablet_index" => last_index + ); + } + if clean_split { + let path = temp_split_path(®, region_id); + // TODO(tabokie) + let _ = fs::remove_dir_all(path); + } + }; + task.persisted_cbs.push(Box::new(hook)); + task.has_snapshot = true; + Ok(()) + } +} diff --git a/components/raftstore-v2/src/operation/txn_ext.rs b/components/raftstore-v2/src/operation/txn_ext.rs new file mode 100644 index 00000000000..272b2526b39 --- /dev/null +++ b/components/raftstore-v2/src/operation/txn_ext.rs @@ -0,0 +1,273 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains everything related to transaction hook. +//! +//! This is the temporary (efficient) solution, it should be implemented as one +//! type of coprocessor. 
+ +use std::sync::{atomic::Ordering, Arc}; + +use crossbeam::atomic::AtomicCell; +use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; +use kvproto::{kvrpcpb::ExtraOp, metapb::Region, raft_cmdpb::RaftRequestHeader}; +use parking_lot::RwLockWriteGuard; +use raft::eraftpb; +use raftstore::store::{ + LocksStatus, PeerPessimisticLocks, TxnExt, TRANSFER_LEADER_COMMAND_REPLY_CTX, +}; +use slog::{error, info, Logger}; + +use crate::{ + batch::StoreContext, + raft::Peer, + router::{PeerMsg, PeerTick}, + worker::pd, + SimpleWriteEncoder, +}; + +pub struct TxnContext { + ext: Arc, + extra_op: Arc>, + reactivate_memory_lock_ticks: usize, +} + +impl Default for TxnContext { + #[inline] + fn default() -> Self { + Self { + ext: Arc::default(), + extra_op: Arc::new(AtomicCell::new(ExtraOp::Noop)), + reactivate_memory_lock_ticks: 0, + } + } +} + +impl TxnContext { + #[inline] + pub fn on_region_changed(&self, term: u64, region: &Region) { + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + pessimistic_locks.term = term; + pessimistic_locks.version = region.get_region_epoch().get_version(); + } + + #[inline] + pub fn on_became_leader( + &self, + ctx: &mut StoreContext, + term: u64, + region: &Region, + logger: &Logger, + ) { + // A more recent read may happen on the old leader. So max ts should + // be updated after a peer becomes leader. + self.require_updating_max_ts(ctx, term, region, logger); + + // Init the in-memory pessimistic lock table when the peer becomes leader. + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + pessimistic_locks.status = LocksStatus::Normal; + pessimistic_locks.term = term; + pessimistic_locks.version = region.get_region_epoch().get_version(); + } + + #[inline] + pub fn after_commit_merge( + &self, + ctx: &StoreContext, + term: u64, + region: &Region, + logger: &Logger, + ) { + // If a follower merges into a leader, a more recent read may happen + // on the leader of the follower. 
So max ts should be updated after + // a region merge. + self.require_updating_max_ts(ctx, term, region, logger); + } + + #[inline] + pub fn on_became_follower(&self, term: u64, region: &Region) { + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + pessimistic_locks.status = LocksStatus::NotLeader; + pessimistic_locks.clear(); + pessimistic_locks.term = term; + pessimistic_locks.version = region.get_region_epoch().get_version(); + } + + #[inline] + pub fn ext(&self) -> &Arc { + &self.ext + } + + #[inline] + pub fn extra_op(&self) -> &Arc> { + &self.extra_op + } + + fn require_updating_max_ts( + &self, + ctx: &StoreContext, + term: u64, + region: &Region, + logger: &Logger, + ) where + EK: KvEngine, + ER: RaftEngine, + { + let epoch = region.get_region_epoch(); + let term_low_bits = term & ((1 << 32) - 1); // 32 bits + let version_lot_bits = epoch.get_version() & ((1 << 31) - 1); // 31 bits + let initial_status = (term_low_bits << 32) | (version_lot_bits << 1); + self.ext + .max_ts_sync_status + .store(initial_status, Ordering::SeqCst); + info!( + logger, + "require updating max ts"; + "initial_status" => initial_status, + ); + let task = pd::Task::UpdateMaxTimestamp { + region_id: region.get_id(), + initial_status, + txn_ext: self.ext.clone(), + }; + if let Err(e) = ctx.schedulers.pd.schedule(task) { + error!(logger, "failed to notify pd with UpdateMaxTimestamp"; "err" => ?e); + } + } + + pub fn split(&self, regions: &[Region], derived: &Region) -> Vec { + // Group in-memory pessimistic locks in the original region into new regions. + // The locks of new regions will be put into the corresponding new regions + // later. And the locks belonging to the old region will stay in the original + // map. + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + // Update the version so the concurrent reader will fail due to EpochNotMatch + // instead of PessimisticLockNotFound. 
+ pessimistic_locks.version = derived.get_region_epoch().get_version(); + pessimistic_locks.group_by_regions(regions, derived) + } + + pub fn init_with_lock(&self, locks: PeerPessimisticLocks) { + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + *pessimistic_locks = locks; + } +} + +impl Peer { + /// Returns True means the tick is consumed, otherwise the tick should be + /// rescheduled. + pub fn on_reactivate_memory_lock_tick(&mut self, ctx: &mut StoreContext) { + // If it is not leader, we needn't reactivate by tick. In-memory pessimistic + // lock will be enabled when this region becomes leader again. + if !self.is_leader() { + return; + } + + let transferring_leader = self.raft_group().raft.lead_transferee.is_some(); + let txn_context = self.txn_context_mut(); + let mut pessimistic_locks = txn_context.ext.pessimistic_locks.write(); + + // And this tick is currently only used for the leader transfer failure case. + if pessimistic_locks.status != LocksStatus::TransferringLeader { + return; + } + + txn_context.reactivate_memory_lock_ticks += 1; + // `lead_transferee` is not set immediately after the lock status changes. So, + // we need the tick count condition to avoid reactivating too early. + if !transferring_leader + && txn_context.reactivate_memory_lock_ticks >= ctx.cfg.reactive_memory_lock_timeout_tick + { + pessimistic_locks.status = LocksStatus::Normal; + txn_context.reactivate_memory_lock_ticks = 0; + } else { + drop(pessimistic_locks); + self.add_pending_tick(PeerTick::ReactivateMemoryLock); + } + } + + // Returns whether we should propose another TransferLeader command. This is + // for: + // - Considering the amount of pessimistic locks can be big, it can reduce + // unavailable time caused by waiting for the transferee catching up logs. + // - Make transferring leader strictly after write commands that executes before + // proposing the locks, preventing unexpected lock loss. 
+ pub fn propose_locks_before_transfer_leader( + &mut self, + ctx: &mut StoreContext, + msg: &eraftpb::Message, + ) -> bool { + // 1. Disable in-memory pessimistic locks. + + // Clone to make borrow checker happy when registering ticks. + let txn_ext = self.txn_context().ext.clone(); + let mut pessimistic_locks = txn_ext.pessimistic_locks.write(); + + // If the message context == TRANSFER_LEADER_COMMAND_REPLY_CTX, the message + // is a reply to a transfer leader command before. If the locks status remain + // in the TransferringLeader status, we can safely initiate transferring leader + // now. + // If it's not in TransferringLeader status now, it is probably because several + // ticks have passed after proposing the locks in the last time and we + // reactivate the memory locks. Then, we should propose the locks again. + if msg.get_context() == TRANSFER_LEADER_COMMAND_REPLY_CTX + && pessimistic_locks.status == LocksStatus::TransferringLeader + { + return false; + } + + // If it is not writable, it's probably because it's a retried TransferLeader + // and the locks have been proposed. But we still need to return true to + // propose another TransferLeader command. Otherwise, some write requests that + // have marked some locks as deleted will fail because raft rejects more + // proposals. + // It is OK to return true here if it's in other states like MergingRegion or + // NotLeader. In those cases, the locks will fail to propose and nothing will + // happen. + if !pessimistic_locks.is_writable() { + return true; + } + pessimistic_locks.status = LocksStatus::TransferringLeader; + self.txn_context_mut().reactivate_memory_lock_ticks = 0; + self.add_pending_tick(PeerTick::ReactivateMemoryLock); + + // 2. Propose pessimistic locks + if pessimistic_locks.is_empty() { + return false; + } + // FIXME: Raft command has size limit. Either limit the total size of + // pessimistic locks in a region, or split commands here. 
+ let mut encoder = SimpleWriteEncoder::with_capacity(512); + let mut lock_count = 0; + { + // Downgrade to a read guard, do not block readers in the scheduler as far as + // possible. + let pessimistic_locks = RwLockWriteGuard::downgrade(pessimistic_locks); + fail::fail_point!("invalidate_locks_before_transfer_leader"); + for (key, (lock, deleted)) in &*pessimistic_locks { + if *deleted { + continue; + } + lock_count += 1; + encoder.put(CF_LOCK, key.as_encoded(), &lock.to_lock().to_bytes()); + } + } + if lock_count == 0 { + // If the map is not empty but all locks are deleted, it is possible that a + // write command has just marked locks deleted but not proposed yet. + // It might cause that command to fail if we skip proposing the + // extra TransferLeader command here. + return true; + } + let mut header = Box::::default(); + header.set_region_id(self.region_id()); + header.set_region_epoch(self.region().get_region_epoch().clone()); + header.set_peer(self.peer().clone()); + info!( + self.logger, + "propose {} locks before transferring leader", lock_count; + ); + let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write(header, encoder.encode()).0 else {unreachable!()}; + self.on_simple_write(ctx, write.header, write.data, write.ch); + true + } +} diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs new file mode 100644 index 00000000000..d32b8bdbb80 --- /dev/null +++ b/components/raftstore-v2/src/raft/apply.rs @@ -0,0 +1,311 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{mem, sync::Arc}; + +use engine_traits::{ + FlushState, KvEngine, PerfContextKind, TabletRegistry, WriteBatch, DATA_CFS_LEN, +}; +use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; +use pd_client::BucketStat; +use raftstore::{ + coprocessor::{Cmd, CmdObserveInfo, CoprocessorHost, ObserveLevel}, + store::{ + fsm::{apply::DEFAULT_APPLY_WB_SIZE, ApplyMetrics}, + Config, ReadTask, + }, +}; +use slog::Logger; +use sst_importer::SstImporter; +use tikv_util::{log::SlogFormat, worker::Scheduler}; + +use crate::{ + operation::{AdminCmdResult, ApplyFlowControl, DataTrace}, + router::CmdResChannel, +}; + +pub(crate) struct Observe { + pub info: CmdObserveInfo, + pub level: ObserveLevel, + pub cmds: Vec, +} + +/// Apply applies all the committed commands to kv db. +pub struct Apply { + peer: metapb::Peer, + tablet: EK, + perf_context: EK::PerfContext, + pub write_batch: Option, + /// A buffer for encoding key. + pub key_buffer: Vec, + + tablet_registry: TabletRegistry, + + callbacks: Vec<(Vec, RaftCmdResponse)>, + + flow_control: ApplyFlowControl, + + /// A flag indicates whether the peer is destroyed by applying admin + /// command. + tombstone: bool, + applied_term: u64, + // Apply progress is set after every command in case there is a flush. But it's + // wrong to update flush_state immediately as a manual flush from other thread + // can fetch the wrong apply index from flush_state. + applied_index: u64, + /// The largest index that have modified each column family. + modifications: DataTrace, + admin_cmd_result: Vec, + flush_state: Arc, + /// The flushed indexes of each column family before being restarted. + /// + /// If an apply index is less than the flushed index, the log can be + /// skipped. `None` means logs should apply to all required column + /// families. 
+ log_recovery: Option>, + + region_state: RegionLocalState, + + res_reporter: R, + read_scheduler: Scheduler>, + sst_importer: Arc, + observe: Observe, + coprocessor_host: CoprocessorHost, + + pub(crate) metrics: ApplyMetrics, + pub(crate) logger: Logger, + pub(crate) buckets: Option, +} + +impl Apply { + #[inline] + pub fn new( + cfg: &Config, + peer: metapb::Peer, + region_state: RegionLocalState, + res_reporter: R, + tablet_registry: TabletRegistry, + read_scheduler: Scheduler>, + flush_state: Arc, + log_recovery: Option>, + applied_term: u64, + buckets: Option, + sst_importer: Arc, + coprocessor_host: CoprocessorHost, + logger: Logger, + ) -> Self { + let mut remote_tablet = tablet_registry + .get(region_state.get_region().get_id()) + .unwrap(); + assert_ne!(applied_term, 0, "{}", SlogFormat(&logger)); + let applied_index = flush_state.applied_index(); + assert_ne!(applied_index, 0, "{}", SlogFormat(&logger)); + let tablet = remote_tablet.latest().unwrap().clone(); + let perf_context = EK::get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply); + Apply { + peer, + tablet, + perf_context, + write_batch: None, + callbacks: vec![], + flow_control: ApplyFlowControl::new(cfg), + tombstone: false, + applied_term, + applied_index: flush_state.applied_index(), + modifications: [0; DATA_CFS_LEN], + admin_cmd_result: vec![], + region_state, + tablet_registry, + read_scheduler, + key_buffer: vec![], + res_reporter, + flush_state, + log_recovery, + metrics: ApplyMetrics::default(), + buckets, + sst_importer, + observe: Observe { + info: CmdObserveInfo::default(), + level: ObserveLevel::None, + cmds: vec![], + }, + coprocessor_host, + logger, + } + } + + #[inline] + pub fn tablet_registry(&self) -> &TabletRegistry { + &self.tablet_registry + } + + #[inline] + pub fn res_reporter(&self) -> &R { + &self.res_reporter + } + + #[inline] + pub fn callbacks_mut(&mut self) -> &mut Vec<(Vec, RaftCmdResponse)> { + &mut self.callbacks + } + + #[inline] + pub fn 
ensure_write_buffer(&mut self) { + if self.write_batch.is_some() { + return; + } + self.write_batch = Some(self.tablet.write_batch_with_cap(DEFAULT_APPLY_WB_SIZE)); + } + + #[inline] + pub fn set_apply_progress(&mut self, index: u64, term: u64) { + self.applied_index = index; + self.applied_term = term; + if self.log_recovery.is_none() { + return; + } + let log_recovery = self.log_recovery.as_ref().unwrap(); + if log_recovery.iter().all(|v| index >= *v) { + self.log_recovery.take(); + } + } + + #[inline] + pub fn apply_progress(&self) -> (u64, u64) { + (self.applied_index, self.applied_term) + } + + #[inline] + pub fn read_scheduler(&self) -> &Scheduler> { + &self.read_scheduler + } + + #[inline] + pub fn region_state(&self) -> &RegionLocalState { + &self.region_state + } + + #[inline] + pub fn region_state_mut(&mut self) -> &mut RegionLocalState { + &mut self.region_state + } + + #[inline] + pub fn region(&self) -> &metapb::Region { + self.region_state.get_region() + } + + #[inline] + pub fn region_id(&self) -> u64 { + self.region().get_id() + } + + /// The tablet can't be public yet, otherwise content of latest tablet + /// doesn't matches its epoch in both readers and peer fsm. 
+ #[inline] + pub fn set_tablet(&mut self, tablet: EK) { + assert!( + self.write_batch.as_ref().map_or(true, |wb| wb.is_empty()), + "{} setting tablet while still have dirty write batch", + SlogFormat(&self.logger) + ); + self.write_batch.take(); + self.tablet = tablet; + } + + #[inline] + pub fn tablet(&self) -> &EK { + &self.tablet + } + + #[inline] + pub fn perf_context(&mut self) -> &mut EK::PerfContext { + &mut self.perf_context + } + + #[inline] + pub fn peer(&self) -> &metapb::Peer { + &self.peer + } + + #[inline] + pub fn set_peer(&mut self, peer: metapb::Peer) { + self.peer = peer; + } + + #[inline] + pub fn mark_tombstone(&mut self) { + self.tombstone = true; + } + + #[inline] + pub fn tombstone(&self) -> bool { + self.tombstone + } + + #[inline] + pub fn push_admin_result(&mut self, admin_result: AdminCmdResult) { + self.admin_cmd_result.push(admin_result); + } + + #[inline] + pub fn take_admin_result(&mut self) -> Vec { + mem::take(&mut self.admin_cmd_result) + } + + #[inline] + pub fn release_memory(&mut self) { + mem::take(&mut self.key_buffer); + if self.write_batch.as_ref().map_or(false, |wb| wb.is_empty()) { + self.write_batch = None; + } + } + + #[inline] + pub fn modifications_mut(&mut self) -> &mut DataTrace { + &mut self.modifications + } + + #[inline] + pub fn flush_state(&self) -> &Arc { + &self.flush_state + } + + #[inline] + pub fn log_recovery(&self) -> &Option> { + &self.log_recovery + } + + #[inline] + pub fn apply_flow_control_mut(&mut self) -> &mut ApplyFlowControl { + &mut self.flow_control + } + + pub fn apply_flow_control(&self) -> &ApplyFlowControl { + &self.flow_control + } + + #[inline] + pub fn sst_importer(&self) -> &SstImporter { + &self.sst_importer + } + + #[inline] + pub(crate) fn observe(&mut self) -> &Observe { + &self.observe + } + + #[inline] + pub(crate) fn observe_mut(&mut self) -> &mut Observe { + &mut self.observe + } + + #[inline] + pub fn term(&self) -> u64 { + self.applied_term + } + + #[inline] + pub fn 
coprocessor_host(&self) -> &CoprocessorHost { + &self.coprocessor_host + } +} diff --git a/components/raftstore-v2/src/raft/mod.rs b/components/raftstore-v2/src/raft/mod.rs new file mode 100644 index 00000000000..495d7ad87ed --- /dev/null +++ b/components/raftstore-v2/src/raft/mod.rs @@ -0,0 +1,9 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +mod apply; +mod peer; +mod storage; + +pub use apply::Apply; +pub use peer::Peer; +pub use storage::Storage; diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs new file mode 100644 index 00000000000..e11c96922cd --- /dev/null +++ b/components/raftstore-v2/src/raft/peer.rs @@ -0,0 +1,938 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + cmp, mem, + sync::Arc, + time::{Duration, Instant}, +}; + +use collections::{HashMap, HashSet}; +use encryption_export::DataKeyManager; +use engine_traits::{ + CachedTablet, FlushState, KvEngine, RaftEngine, TabletContext, TabletRegistry, +}; +use kvproto::{ + metapb::{self, PeerRole}, + pdpb, + raft_serverpb::{RaftMessage, RegionLocalState}, +}; +use raft::{RawNode, StateRole}; +use raftstore::{ + coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason}, + store::{ + fsm::ApplyMetrics, + metrics::RAFT_PEER_PENDING_DURATION, + util::{Lease, RegionReadProgress}, + Config, EntryStorage, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, ReadProgress, + TabletSnapManager, WriteTask, + }, +}; +use slog::{debug, info, Logger}; +use tikv_util::{slog_panic, time::duration_to_sec}; + +use super::storage::Storage; +use crate::{ + fsm::ApplyScheduler, + operation::{ + AbnormalPeerContext, AsyncWriter, BucketStatsInfo, CompactLogContext, DestroyProgress, + GcPeerContext, MergeContext, ProposalControl, SimpleWriteReqEncoder, SplitFlowControl, + TxnContext, + }, + router::{ApplyTask, CmdResChannel, PeerTick, QueryResChannel}, + Result, +}; + +const REGION_READ_PROGRESS_CAP: usize = 
128; + +/// A peer that delegates commands between state machine and raft. +pub struct Peer { + raft_group: RawNode>, + tablet: CachedTablet, + tablet_being_flushed: bool, + + /// Statistics for self. + self_stat: PeerStat, + + /// We use a cache for looking up peers. Not all peers exist in region's + /// peer list, for example, an isolated peer may need to send/receive + /// messages with unknown peers after recovery. + peer_cache: Vec, + /// Statistics for other peers, only maintained when self is the leader. + peer_heartbeats: HashMap, + + /// For raft log compaction. + compact_log_context: CompactLogContext, + + merge_context: Option>, + last_sent_snapshot_index: u64, + + /// Encoder for batching proposals and encoding them in a more efficient way + /// than protobuf. + raw_write_encoder: Option, + proposals: ProposalQueue>, + apply_scheduler: Option, + + /// Set to true if any side effect needs to be handled. + has_ready: bool, + /// Sometimes there is no ready at all, but we need to trigger async write. + has_extra_write: bool, + pause_for_recovery: bool, + /// Writer for persisting side effects asynchronously. + pub(crate) async_writer: AsyncWriter, + + destroy_progress: DestroyProgress, + + pub(crate) logger: Logger, + pending_reads: ReadIndexQueue, + read_progress: Arc, + leader_lease: Lease, + + region_buckets_info: BucketStatsInfo, + + /// Transaction extensions related to this peer. + txn_context: TxnContext, + + pending_ticks: Vec, + + /// Check whether this proposal can be proposed based on its epoch. + proposal_control: ProposalControl, + + // Trace which peers have not finished split. + split_trace: Vec<(u64, HashSet)>, + split_flow_control: SplitFlowControl, + + /// Apply related State changes that needs to be persisted to raft engine. + /// + /// To make recovery correct, we need to persist all state changes before + /// advancing apply index. 
+ state_changes: Option>, + flush_state: Arc, + + /// lead_transferee if this peer(leader) is in a leadership transferring. + leader_transferee: u64, + + long_uncommitted_threshold: u64, + + /// Pending messages to be sent on handle ready. We should avoid sending + /// messages immediately otherwise it may break the persistence assumption. + pending_messages: Vec, + + gc_peer_context: GcPeerContext, + + abnormal_peer_context: AbnormalPeerContext, +} + +impl Peer { + /// Creates a new peer. + /// + /// If peer is destroyed, `None` is returned. + pub fn new( + cfg: &Config, + tablet_registry: &TabletRegistry, + key_manager: Option<&DataKeyManager>, + snap_mgr: &TabletSnapManager, + storage: Storage, + ) -> Result { + let logger = storage.logger().clone(); + + let applied_index = storage.apply_state().get_applied_index(); + let peer_id = storage.peer().get_id(); + let raft_cfg = cfg.new_raft_config(peer_id, applied_index); + + let region_id = storage.region().get_id(); + let tablet_index = storage.region_state().get_tablet_index(); + let merge_context = MergeContext::from_region_state(&logger, storage.region_state()); + + let raft_group = RawNode::new(&raft_cfg, storage, &logger)?; + let region = raft_group.store().region_state().get_region().clone(); + + let flush_state: Arc = Arc::new(FlushState::new(applied_index)); + // We can't create tablet if tablet index is 0. It can introduce race when gc + // old tablet and create new peer. We also can't get the correct range of the + // region, which is required for kv data gc. + if tablet_index != 0 { + raft_group + .store() + .recover_tablet(tablet_registry, key_manager, snap_mgr); + let mut ctx = TabletContext::new(®ion, Some(tablet_index)); + ctx.flush_state = Some(flush_state.clone()); + // TODO: Perhaps we should stop create the tablet automatically. 
+ tablet_registry.load(ctx, false)?; + } + let cached_tablet = tablet_registry.get_or_default(region_id); + + let tag = format!("[region {}] {}", region.get_id(), peer_id); + let mut peer = Peer { + tablet: cached_tablet, + tablet_being_flushed: false, + self_stat: PeerStat::default(), + peer_cache: vec![], + peer_heartbeats: HashMap::default(), + compact_log_context: CompactLogContext::new(applied_index), + merge_context: merge_context.map(|c| Box::new(c)), + last_sent_snapshot_index: 0, + raw_write_encoder: None, + proposals: ProposalQueue::new(region_id, raft_group.raft.id), + async_writer: AsyncWriter::new(region_id, peer_id), + apply_scheduler: None, + has_ready: false, + has_extra_write: false, + pause_for_recovery: false, + destroy_progress: DestroyProgress::None, + raft_group, + logger, + pending_reads: ReadIndexQueue::new(tag), + read_progress: Arc::new(RegionReadProgress::new( + ®ion, + applied_index, + REGION_READ_PROGRESS_CAP, + peer_id, + )), + leader_lease: Lease::new( + cfg.raft_store_max_leader_lease(), + cfg.renew_leader_lease_advance_duration(), + ), + region_buckets_info: BucketStatsInfo::default(), + txn_context: TxnContext::default(), + proposal_control: ProposalControl::new(0), + pending_ticks: Vec::new(), + split_trace: vec![], + state_changes: None, + flush_state, + split_flow_control: SplitFlowControl::default(), + leader_transferee: raft::INVALID_ID, + long_uncommitted_threshold: cmp::max( + cfg.long_uncommitted_base_threshold.0.as_secs(), + 1, + ), + pending_messages: vec![], + gc_peer_context: GcPeerContext::default(), + abnormal_peer_context: AbnormalPeerContext::default(), + }; + + // If this region has only one peer and I am the one, campaign directly. 
+ let region = peer.region(); + if region.get_peers().len() == 1 + && region.get_peers()[0] == *peer.peer() + && tablet_index != 0 + { + peer.raft_group.campaign()?; + peer.set_has_ready(); + } + let term = peer.term(); + peer.proposal_control.maybe_update_term(term); + + Ok(peer) + } + + pub fn region_buckets_info_mut(&mut self) -> &mut BucketStatsInfo { + &mut self.region_buckets_info + } + + pub fn region_buckets_info(&self) -> &BucketStatsInfo { + &self.region_buckets_info + } + + #[inline] + pub fn region(&self) -> &metapb::Region { + self.raft_group.store().region() + } + + #[inline] + pub fn region_id(&self) -> u64 { + self.region().get_id() + } + + /// Set the region of a peer. + /// + /// This will update the region of the peer, caller must ensure the region + /// has been preserved in a durable device. + pub fn set_region( + &mut self, + host: &CoprocessorHost, + reader: &mut ReadDelegate, + region: metapb::Region, + reason: RegionChangeReason, + tablet_index: u64, + ) { + if self.region().get_region_epoch().get_version() < region.get_region_epoch().get_version() + { + // Epoch version changed, disable read on the local reader for this region. + self.leader_lease.expire_remote_lease(); + } + + let mut region_state = RegionLocalState::default(); + region_state.set_region(region.clone()); + region_state.set_tablet_index(tablet_index); + region_state.set_state(self.storage().region_state().get_state()); + self.storage_mut().set_region_state(region_state); + + let progress = ReadProgress::region(region); + // Always update read delegate's region to avoid stale region info after a + // follower becoming a leader. + self.maybe_update_read_progress(reader, progress); + + if self.is_leader() { + // Unlike v1, we should renew remote lease if it's leader. This is because v2 + // only provides read in local reader which requires passing the lease check. If + // lease check fails, it sends query to raftstore to make it renew the remote + // lease. 
However, raftstore will answer immediately if the `bound` in + // `leader_lease` is valid, so the remote lease will not be updated. + if let Some(progress) = self + .leader_lease + .maybe_new_remote_lease(self.term()) + .map(ReadProgress::leader_lease) + { + self.maybe_update_read_progress(reader, progress); + } + } + + // Update leader info + self.read_progress + .update_leader_info(self.leader_id(), self.term(), self.region()); + + self.txn_context + .on_region_changed(self.term(), self.region()); + + if self.serving() { + host.on_region_changed( + self.region(), + RegionChangeEvent::Update(reason), + self.state_role(), + ); + } + } + + #[inline] + pub fn peer(&self) -> &metapb::Peer { + self.raft_group.store().peer() + } + + #[inline] + pub fn peer_id(&self) -> u64 { + self.peer().get_id() + } + + #[inline] + pub fn tablet_being_flushed(&self) -> bool { + self.tablet_being_flushed + } + + #[inline] + pub fn set_tablet_being_flushed(&mut self, v: bool) { + self.tablet_being_flushed = v; + } + + #[inline] + pub fn storage(&self) -> &Storage { + self.raft_group.store() + } + + #[inline] + pub fn read_progress(&self) -> &Arc { + &self.read_progress + } + + #[inline] + pub fn read_progress_mut(&mut self) -> &mut Arc { + &mut self.read_progress + } + + #[inline] + pub fn leader_lease(&self) -> &Lease { + &self.leader_lease + } + + #[inline] + pub fn leader_lease_mut(&mut self) -> &mut Lease { + &mut self.leader_lease + } + + #[inline] + pub fn storage_mut(&mut self) -> &mut Storage { + self.raft_group.mut_store() + } + + #[inline] + pub fn pending_reads(&self) -> &ReadIndexQueue { + &self.pending_reads + } + + #[inline] + pub fn pending_reads_mut(&mut self) -> &mut ReadIndexQueue { + &mut self.pending_reads + } + + #[inline] + pub fn entry_storage(&self) -> &EntryStorage { + self.raft_group.store().entry_storage() + } + + #[inline] + pub fn entry_storage_mut(&mut self) -> &mut EntryStorage { + self.raft_group.mut_store().entry_storage_mut() + } + + #[inline] + pub fn 
tablet(&mut self) -> Option<&EK> { + self.tablet.latest() + } + + #[inline] + pub fn set_tablet(&mut self, tablet: EK) -> Option { + self.tablet.set(tablet) + } + + #[inline] + pub fn compact_log_context_mut(&mut self) -> &mut CompactLogContext { + &mut self.compact_log_context + } + + #[inline] + pub fn compact_log_context(&self) -> &CompactLogContext { + &self.compact_log_context + } + + #[inline] + pub fn merge_context(&self) -> Option<&MergeContext> { + self.merge_context.as_deref() + } + + #[inline] + pub fn merge_context_mut(&mut self) -> &mut MergeContext { + self.merge_context.get_or_insert_default() + } + + #[inline] + pub fn take_merge_context(&mut self) -> Option> { + self.merge_context.take() + } + + #[inline] + pub fn raft_group(&self) -> &RawNode> { + &self.raft_group + } + + #[inline] + pub fn raft_group_mut(&mut self) -> &mut RawNode> { + &mut self.raft_group + } + + #[inline] + pub fn set_raft_group(&mut self, raft_group: RawNode>) { + self.raft_group = raft_group; + } + + #[inline] + pub fn persisted_index(&self) -> u64 { + self.raft_group.raft.raft_log.persisted + } + + #[inline] + pub fn self_stat(&self) -> &PeerStat { + &self.self_stat + } + + #[inline] + pub fn update_stat(&mut self, metrics: &ApplyMetrics) { + self.self_stat.written_bytes += metrics.written_bytes; + self.self_stat.written_keys += metrics.written_keys; + } + + /// Mark the peer has a ready so it will be checked at the end of every + /// processing round. + #[inline] + pub fn set_has_ready(&mut self) { + self.has_ready = true; + } + + /// Mark the peer has no ready and return its previous state. 
+    #[inline]
+    pub fn reset_has_ready(&mut self) -> bool {
+        // `mem::take` returns the current flag and leaves `false` in its place.
+        mem::take(&mut self.has_ready)
+    }
+
+    /// Marks that an extra write is pending.
+    ///
+    /// This also marks the peer as having a ready (via `set_has_ready`).
+    #[inline]
+    pub fn set_has_extra_write(&mut self) {
+        self.set_has_ready();
+        self.has_extra_write = true;
+    }
+
+    /// Clears the extra-write flag and returns the value it had before.
+    #[inline]
+    pub fn reset_has_extra_write(&mut self) -> bool {
+        mem::take(&mut self.has_extra_write)
+    }
+
+    /// Sets the `pause_for_recovery` flag.
+    #[inline]
+    pub fn set_pause_for_recovery(&mut self, pause: bool) {
+        self.pause_for_recovery = pause;
+    }
+
+    /// Returns the current value of the `pause_for_recovery` flag.
+    #[inline]
+    pub fn pause_for_recovery(&self) -> bool {
+        self.pause_for_recovery
+    }
+
+    #[inline]
+    // We may have skipped scheduling the raft tick at startup due to a noticeable
+    // gap between commit index and apply index. We should schedule it once raft
+    // log apply catches up.
+    //
+    // Does nothing unless the peer is paused for recovery and the commit index is
+    // not ahead of the applied index. Otherwise it logs, clears the pause flag,
+    // asks the apply scheduler (if one is set) for a manual flush and queues a
+    // `PeerTick::Raft` pending tick.
+    pub fn try_compelete_recovery(&mut self) {
+        if self.pause_for_recovery()
+            && self.storage().entry_storage().commit_index()
+                <= self.storage().entry_storage().applied_index()
+        {
+            info!(
+                self.logger,
+                "recovery completed";
+                "apply_index" => self.storage().entry_storage().applied_index()
+            );
+            self.set_pause_for_recovery(false);
+            // Flush to avoid recovering again and again.
+            if let Some(scheduler) = self.apply_scheduler() {
+                scheduler.send(ApplyTask::ManualFlush);
+            }
+            self.add_pending_tick(PeerTick::Raft);
+        }
+    }
+
+    /// Remembers `peer` in the peer cache.
+    ///
+    /// Nothing is stored when a peer with the same id is already part of the
+    /// region's peer list. An existing cache entry with the same id is
+    /// overwritten; otherwise the peer is appended.
+    #[inline]
+    pub fn insert_peer_cache(&mut self, peer: metapb::Peer) {
+        // Peers listed in the region are resolved from the region itself.
+        for p in self.raft_group.store().region().get_peers() {
+            if p.get_id() == peer.get_id() {
+                return;
+            }
+        }
+        // Refresh the cached entry in place when the id is already known.
+        for p in &mut self.peer_cache {
+            if p.get_id() == peer.get_id() {
+                *p = peer;
+                return;
+            }
+        }
+        self.peer_cache.push(peer);
+    }
+
+    /// Removes every entry from the peer cache.
+    #[inline]
+    pub fn clear_peer_cache(&mut self) {
+        self.peer_cache.clear();
+    }
+
+    /// Looks up a peer by id, checking the region's peer list first and the
+    /// peer cache second. Returns a clone of the match, or `None`.
+    // NOTE(review): the return type's generic argument is missing in this copy of
+    // the file; presumably `Option<metapb::Peer>` -- confirm against upstream.
+    #[inline]
+    pub fn peer_from_cache(&self, peer_id: u64) -> Option {
+        for p in self.raft_group.store().region().get_peers() {
+            if p.get_id() == peer_id {
+                return Some(p.clone());
+            }
+        }
+        self.peer_cache
+            .iter()
+            .find(|p| p.get_id() == peer_id)
+            .cloned()
+    }
+
+    /// Keeps `peer_heartbeats` in step with the region's peer list.
+    ///
+    /// When this peer is not the leader the map is cleared. When the map
+    /// already has as many entries as the region has peers nothing is done.
+    /// Otherwise every region peer without an entry gets one stamped with the
+    /// current time; existing entries are left untouched.
+    #[inline]
+    pub fn update_peer_statistics(&mut self) {
+        if !self.is_leader() {
+            self.peer_heartbeats.clear();
+            return;
+        }
+
+        // Cheap size comparison to skip the loop below.
+        if self.peer_heartbeats.len() == self.region().get_peers().len() {
+            return;
+        }
+
+        // Insert heartbeats in case some peers never respond to heartbeats.
+        let region = self.raft_group.store().region();
+        for peer in region.get_peers() {
+            self.peer_heartbeats
+                .entry(peer.get_id())
+                .or_insert_with(Instant::now);
+        }
+    }
+
+    /// Records `now` as the heartbeat time of `peer_id`, replacing any
+    /// previous value.
+    #[inline]
+    pub fn add_peer_heartbeat(&mut self, peer_id: u64, now: Instant) {
+        self.peer_heartbeats.insert(peer_id, now);
+    }
+
+    /// Drops the heartbeat record of `peer_id`, if there is one.
+    #[inline]
+    pub fn remove_peer_heartbeat(&mut self, peer_id: u64) {
+        self.peer_heartbeats.remove(&peer_id);
+    }
+
+    /// Returns whether or not the peer sent heartbeat after the provided
+    /// deadline time.
+ #[inline] + pub fn peer_heartbeat_is_fresh(&self, peer_id: u64, deadline: &Instant) -> bool { + matches!( + self.peer_heartbeats.get(&peer_id), + Some(last_heartbeat) if *last_heartbeat >= *deadline + ) + } + + pub fn collect_down_peers(&mut self, max_duration: Duration) -> Vec { + let mut down_peers = Vec::new(); + let mut down_peer_ids = Vec::new(); + let now = Instant::now(); + for p in self.region().get_peers() { + if p.get_id() == self.peer_id() { + continue; + } + if let Some(instant) = self.peer_heartbeats.get(&p.get_id()) { + let elapsed = instant.saturating_duration_since(now); + if elapsed >= max_duration { + let mut stats = pdpb::PeerStats::default(); + stats.set_peer(p.clone()); + stats.set_down_seconds(elapsed.as_secs()); + down_peers.push(stats); + down_peer_ids.push(p.get_id()); + } + } + } + *self.abnormal_peer_context_mut().down_peers_mut() = down_peer_ids; + // TODO: `refill_disk_full_peers` + down_peers + } + + #[inline] + pub fn state_role(&self) -> StateRole { + self.raft_group.raft.state + } + + #[inline] + pub fn is_leader(&self) -> bool { + self.raft_group.raft.state == StateRole::Leader + } + + #[inline] + pub fn leader_id(&self) -> u64 { + self.raft_group.raft.leader_id + } + + /// Get the leader peer meta. + /// + /// `None` is returned if there is no leader or the meta can't be found. + #[inline] + pub fn leader(&self) -> Option { + let leader_id = self.leader_id(); + if leader_id != 0 { + self.peer_from_cache(leader_id) + } else { + None + } + } + + /// Term of the state machine. 
+    #[inline]
+    pub fn term(&self) -> u64 {
+        self.raft_group.raft.term
+    }
+
+    #[inline]
+    // TODO: force leader is not handled yet, so this always reports `false`.
+    pub fn has_force_leader(&self) -> bool {
+        false
+    }
+
+    /// Returns `true` while the destroy progress is `DestroyProgress::None`.
+    pub fn serving(&self) -> bool {
+        matches!(self.destroy_progress, DestroyProgress::None)
+    }
+
+    /// Shared access to the destroy progress of this peer.
+    #[inline]
+    pub fn destroy_progress(&self) -> &DestroyProgress {
+        &self.destroy_progress
+    }
+
+    /// Mutable access to the destroy progress of this peer.
+    #[inline]
+    pub fn destroy_progress_mut(&mut self) -> &mut DestroyProgress {
+        &mut self.destroy_progress
+    }
+
+    /// Mutable access to the optional raw write encoder.
+    // NOTE(review): the generic argument of `Option` is missing in this copy of
+    // the file -- confirm the element type against upstream.
+    #[inline]
+    pub fn simple_write_encoder_mut(&mut self) -> &mut Option {
+        &mut self.raw_write_encoder
+    }
+
+    /// Shared access to the optional raw write encoder.
+    // NOTE(review): the generic argument of `Option` is missing in this copy of
+    // the file -- confirm the element type against upstream.
+    #[inline]
+    pub fn simple_write_encoder(&self) -> &Option {
+        &self.raw_write_encoder
+    }
+
+    /// Returns `true` when the applied term recorded by the entry storage
+    /// equals the current raft term.
+    #[inline]
+    pub fn applied_to_current_term(&self) -> bool {
+        self.storage().entry_storage().applied_term() == self.term()
+    }
+
+    /// Mutable access to the proposal queue.
+    // NOTE(review): the generic arguments of `ProposalQueue` are garbled in this
+    // copy of the file -- confirm against upstream.
+    #[inline]
+    pub fn proposals_mut(&mut self) -> &mut ProposalQueue> {
+        &mut self.proposals
+    }
+
+    /// Shared access to the proposal queue.
+    // NOTE(review): the generic arguments of `ProposalQueue` are garbled in this
+    // copy of the file -- confirm against upstream.
+    #[inline]
+    pub fn proposals(&self) -> &ProposalQueue> {
+        &self.proposals
+    }
+
+    /// Returns the apply scheduler, or `None` when none has been set.
+    pub fn apply_scheduler(&self) -> Option<&ApplyScheduler> {
+        self.apply_scheduler.as_ref()
+    }
+
+    /// Installs `apply_scheduler`, replacing any previous one.
+    #[inline]
+    pub fn set_apply_scheduler(&mut self, apply_scheduler: ApplyScheduler) {
+        self.apply_scheduler = Some(apply_scheduler);
+    }
+
+    /// Drops the apply scheduler, if there is one.
+    #[inline]
+    pub fn clear_apply_scheduler(&mut self) {
+        self.apply_scheduler.take();
+    }
+
+    /// Whether the snapshot is handling.
+    /// See the comments of `check_snap_status` for more details.
+    #[inline]
+    pub fn is_handling_snapshot(&self) -> bool {
+        // True while the persisted index is still behind the truncated index.
+        self.persisted_index() < self.entry_storage().truncated_index()
+    }
+
+    /// Returns `true` if the raft group has replicated a snapshot but not
+    /// committed it yet.
+    #[inline]
+    pub fn has_pending_snapshot(&self) -> bool {
+        self.raft_group().snap().is_some()
+    }
+
+    /// Queues `tick` in the list of pending ticks.
+    #[inline]
+    pub fn add_pending_tick(&mut self, tick: PeerTick) {
+        // Msg per batch is 4096/256 by default, the buffer won't grow too large.
+        self.pending_ticks.push(tick);
+    }
+
+    /// Returns all pending ticks and leaves the list empty.
+    // NOTE(review): the generic argument of `Vec` is missing in this copy of the
+    // file; the list is filled with `PeerTick` values by `add_pending_tick`.
+    #[inline]
+    pub fn take_pending_ticks(&mut self) -> Vec {
+        mem::take(&mut self.pending_ticks)
+    }
+
+    /// Clears the bucket stat of the region buckets info.
+    #[inline]
+    pub fn post_split(&mut self) {
+        self.region_buckets_info_mut().set_bucket_stat(None);
+    }
+
+    /// Starts a campaign unless the region has at most one peer.
+    ///
+    /// Returns `false` when no campaign was attempted and `true` otherwise.
+    /// The result of the campaign call itself is ignored.
+    pub fn maybe_campaign(&mut self) -> bool {
+        if self.region().get_peers().len() <= 1 {
+            // The peer campaigned when it was created, no need to do it again.
+            return false;
+        }
+
+        // If last peer is the leader of the region before split, it's intuitive for
+        // it to become the leader of new split region.
+        let _ = self.raft_group.campaign();
+        true
+    }
+
+    /// Shared access to the transaction context.
+    #[inline]
+    pub fn txn_context(&self) -> &TxnContext {
+        &self.txn_context
+    }
+
+    /// Mutable access to the transaction context.
+    #[inline]
+    pub fn txn_context_mut(&mut self) -> &mut TxnContext {
+        &mut self.txn_context
+    }
+
+    /// Builds a `ReadDelegate` from the current state of this peer: peer id,
+    /// term, a clone of the region, the applied term, the transaction
+    /// context's extra op and ext, the read progress and, when a bucket stat
+    /// is present, a clone of its meta.
+    pub fn generate_read_delegate(&self) -> ReadDelegate {
+        let peer_id = self.peer().get_id();
+
+        ReadDelegate::new(
+            peer_id,
+            self.term(),
+            self.region().clone(),
+            self.storage().entry_storage().applied_term(),
+            self.txn_context.extra_op().clone(),
+            self.txn_context.ext().clone(),
+            self.read_progress().clone(),
+            self.region_buckets_info()
+                .bucket_stat()
+                .as_ref()
+                .map(|b| b.meta.clone()),
+        )
+    }
+
+    /// Mutable access to the proposal control.
+    #[inline]
+    pub fn proposal_control_mut(&mut self) -> &mut ProposalControl {
+        &mut self.proposal_control
+    }
+
+    /// Shared access to the proposal control.
+    #[inline]
+    pub fn proposal_control(&self) -> &ProposalControl {
+        &self.proposal_control
+    }
+
+    /// Forwards `apply_index`, together with the current term and region, to
+    /// `ProposalControl::advance_apply`.
+    #[inline]
+    pub fn proposal_control_advance_apply(&mut self, apply_index: u64) {
+        // The region is read through `raft_group` directly so that the borrow stays
+        // disjoint from the mutable borrow of `proposal_control` below.
+        let region = self.raft_group.store().region();
+        let term = self.term();
+        self.proposal_control
+            .advance_apply(apply_index, term, region);
+    }
+
+    /// Returns `true` when any peer of the region has the role
+    /// `IncomingVoter` or `DemotingVoter`.
+    #[inline]
+    pub fn in_joint_state(&self) -> bool {
+        self.region().get_peers().iter().any(|p| {
+            p.get_role() == PeerRole::IncomingVoter || p.get_role() == PeerRole::DemotingVoter
+        })
+    }
+
+    /// Mutable access to the split trace.
+    // NOTE(review): the generic argument of `HashSet` is missing in this copy of
+    // the file -- confirm the element type against upstream.
+    #[inline]
+    pub fn split_trace_mut(&mut self) -> &mut Vec<(u64, HashSet)> {
+        &mut self.split_trace
+    }
+
+    /// Shared access to the flush state handle.
+    // NOTE(review): the generic argument of `Arc` is missing in this copy of the
+    // file; `reset_flush_state` stores an `Arc` of `FlushState` in this field.
+    #[inline]
+    pub fn flush_state(&self) -> &Arc {
+        &self.flush_state
+    }
+
+    /// Replaces the flush state with a fresh one created from `index`.
+    pub fn reset_flush_state(&mut self, index: u64) {
+        self.flush_state = Arc::new(FlushState::new(index));
+    }
+
+    // Note: Call `set_has_extra_write` after adding new state changes.
+    //
+    // Returns the batch that collects pending state changes, creating it from
+    // the raft engine on first use.
+    #[inline]
+    pub fn state_changes_mut(&mut self) -> &mut ER::LogBatch {
+        if self.state_changes.is_none() {
+            self.state_changes = Some(Box::new(self.entry_storage().raft_engine().log_batch(0)));
+        }
+        // Cannot fail: the option was filled just above when it was empty.
+        self.state_changes.as_mut().unwrap()
+    }
+
+    /// Moves the pending state changes, if any, into `task.extra_write`.
+    ///
+    /// Afterwards no pending state changes remain on the peer.
+    // NOTE(review): the generic arguments of `WriteTask` are missing in this copy
+    // of the file -- confirm against upstream.
+    #[inline]
+    pub fn merge_state_changes_to(&mut self, task: &mut WriteTask) {
+        if self.state_changes.is_none() {
+            return;
+        }
+        // Cannot fail: the `None` case returned early.
+        task.extra_write
+            .merge_v2(Box::into_inner(self.state_changes.take().unwrap()));
+    }
+
+    /// Mutable access to the split flow control.
+    #[inline]
+    pub fn split_flow_control_mut(&mut self) -> &mut SplitFlowControl {
+        &mut self.split_flow_control
+    }
+
+    /// Stores raft's current `lead_transferee` (or `raft::INVALID_ID` when
+    /// there is none) and returns the value that was stored before.
+    #[inline]
+    pub fn refresh_leader_transferee(&mut self) -> u64 {
+        mem::replace(
+            &mut self.leader_transferee,
+            self.raft_group
+                .raft
+                .lead_transferee
+                .unwrap_or(raft::INVALID_ID),
+        )
+    }
+
+    /// Returns the long-uncommitted threshold, which is stored in whole
+    /// seconds.
+    #[inline]
+    pub fn long_uncommitted_threshold(&self) -> Duration {
+        Duration::from_secs(self.long_uncommitted_threshold)
+    }
+
+    /// Sets the long-uncommitted threshold from `dur`.
+    ///
+    /// Only whole seconds are kept and the stored value is at least one
+    /// second.
+    #[inline]
+    pub fn set_long_uncommitted_threshold(&mut self, dur: Duration) {
+        self.long_uncommitted_threshold = cmp::max(dur.as_secs(), 1);
+    }
+
+    /// Queues `msg` as a pending message and marks the peer as having a
+    /// ready.
+    #[inline]
+    pub fn add_message(&mut self, msg: RaftMessage) {
+        self.pending_messages.push(msg);
+        self.set_has_ready();
+    }
+
+    /// Returns `true` when at least one pending message is queued.
+    // NOTE(review): this only reads `pending_messages`, so `&self` looks
+    // sufficient -- confirm no caller relies on the `&mut self` receiver.
+    #[inline]
+    pub fn has_pending_messages(&mut self) -> bool {
+        !self.pending_messages.is_empty()
+    }
+
+    /// Returns all pending messages and leaves the queue empty.
+    // NOTE(review): the generic argument of `Vec` is missing in this copy of the
+    // file; the queue is filled with `RaftMessage` values by `add_message`.
+    #[inline]
+    pub fn take_pending_messages(&mut self) -> Vec {
+        mem::take(&mut self.pending_messages)
+    }
+
+    /// Shared access to the GC peer context.
+    #[inline]
+    pub fn gc_peer_context(&self) -> &GcPeerContext {
+        &self.gc_peer_context
+    }
+
+    /// Mutable access to the GC peer context.
+    #[inline]
+    pub fn gc_peer_context_mut(&mut self) -> &mut GcPeerContext {
+        &mut self.gc_peer_context
+    }
+
+    /// Raises the last sent snapshot index to `i`; smaller or equal values
+    /// are ignored, so the stored index never moves backwards.
+    #[inline]
+    pub fn update_last_sent_snapshot_index(&mut self, i: u64) {
+        if i > self.last_sent_snapshot_index {
+            self.last_sent_snapshot_index = i;
+        }
+    }
+
+
#[inline] + pub fn last_sent_snapshot_index(&self) -> u64 { + self.last_sent_snapshot_index + } + + #[inline] + pub fn index_term(&self, idx: u64) -> u64 { + match self.raft_group.raft.raft_log.term(idx) { + Ok(t) => t, + Err(e) => slog_panic!(self.logger, "failed to load term"; "index" => idx, "err" => ?e), + } + } + + #[inline] + pub fn abnormal_peer_context_mut(&mut self) -> &mut AbnormalPeerContext { + &mut self.abnormal_peer_context + } + + #[inline] + pub fn abnormal_peer_context(&self) -> &AbnormalPeerContext { + &self.abnormal_peer_context + } + + pub fn any_new_peer_catch_up(&mut self, from_peer_id: u64) -> bool { + // no pending or down peers + if self.abnormal_peer_context.is_empty() { + return false; + } + if !self.is_leader() { + self.abnormal_peer_context.reset(); + return false; + } + + if self + .abnormal_peer_context + .down_peers() + .contains(&from_peer_id) + { + return true; + } + + let logger = self.logger.clone(); + self.abnormal_peer_context + .retain_pending_peers(|(peer_id, pending_after)| { + // TODO check wait data peers here + let truncated_idx = self.raft_group.store().entry_storage().truncated_index(); + if let Some(progress) = self.raft_group.raft.prs().get(*peer_id) { + if progress.matched >= truncated_idx { + let elapsed = duration_to_sec(pending_after.saturating_elapsed()); + RAFT_PEER_PENDING_DURATION.observe(elapsed); + debug!( + logger, + "peer has caught up logs"; + "from_peer_id" => %from_peer_id, + "takes" => %elapsed, + ); + return false; + } + } + true + }) + } +} diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs new file mode 100644 index 00000000000..7edf8c02f09 --- /dev/null +++ b/components/raftstore-v2/src/raft/storage.rs @@ -0,0 +1,578 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + cell::{RefCell, RefMut}, + fmt::{self, Debug, Formatter}, +}; + +use collections::HashMap; +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::{ + metapb, + raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState}, +}; +use raft::{ + eraftpb::{ConfState, Entry, Snapshot}, + GetEntriesContext, RaftState, INVALID_ID, +}; +use raftstore::store::{util, EntryStorage, ReadTask}; +use slog::{o, Logger}; +use tikv_util::{box_err, store::find_peer, worker::Scheduler}; + +use crate::{ + operation::{ApplyTrace, GenSnapTask, SnapState, SplitInit}, + Result, +}; + +/// A storage for raft. +/// +/// It's similar to `PeerStorage` in v1. +pub struct Storage { + entry_storage: EntryStorage, + peer: metapb::Peer, + region_state: RegionLocalState, + /// Whether states has been persisted before. If a peer is just created by + /// by messages, it has not persisted any states, we need to persist them + /// at least once dispite whether the state changes since create. + ever_persisted: bool, + /// It may have dirty data after split. Use a flag to indicate whether it + /// has finished clean up. + has_dirty_data: bool, + logger: Logger, + + /// Snapshot part. + pub snap_states: RefCell>, + pub gen_snap_task: RefCell>>, + split_init: Option>, + /// The flushed index of all CFs. 
+ apply_trace: ApplyTrace, +} + +impl Debug for Storage { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!( + f, + "Storage of [region {}] {}", + self.region().get_id(), + self.peer.get_id() + ) + } +} + +impl Storage { + #[inline] + pub fn entry_storage(&self) -> &EntryStorage { + &self.entry_storage + } + + #[inline] + pub fn entry_storage_mut(&mut self) -> &mut EntryStorage { + &mut self.entry_storage + } + + #[inline] + pub fn region_state(&self) -> &RegionLocalState { + &self.region_state + } + + #[inline] + pub fn region(&self) -> &metapb::Region { + self.region_state.get_region() + } + + #[inline] + pub fn peer(&self) -> &metapb::Peer { + &self.peer + } + + #[inline] + pub fn logger(&self) -> &Logger { + &self.logger + } + + #[inline] + pub fn gen_snap_task_mut(&self) -> RefMut<'_, Box>> { + self.gen_snap_task.borrow_mut() + } + + #[inline] + pub fn cancel_snap_task(&self, to_peer_id: Option) { + if to_peer_id.is_none() { + self.gen_snap_task.borrow_mut().take(); + return; + } + let to = to_peer_id.unwrap(); + let mut task = self.gen_snap_task.borrow_mut(); + if let Some(t) = &**task { + if to == t.to_peer() { + *task = Box::new(None); + }; + } + } + + #[inline] + pub fn apply_trace_mut(&mut self) -> &mut ApplyTrace { + &mut self.apply_trace + } + + #[inline] + pub fn apply_trace(&self) -> &ApplyTrace { + &self.apply_trace + } + + #[inline] + pub fn set_has_dirty_data(&mut self, has_dirty_data: bool) { + self.has_dirty_data = has_dirty_data; + } + + #[inline] + pub fn has_dirty_data(&self) -> bool { + self.has_dirty_data + } +} + +impl Storage { + pub(crate) fn create( + store_id: u64, + region_state: RegionLocalState, + raft_state: RaftLocalState, + apply_state: RaftApplyState, + engine: ER, + read_scheduler: Scheduler>, + persisted: bool, + apply_trace: ApplyTrace, + logger: &Logger, + ) -> Result { + let peer = find_peer(region_state.get_region(), store_id); + let peer = match peer { + Some(p) if p.get_id() != INVALID_ID => p, + _ => { + 
return Err(box_err!("no valid peer found in {:?}", region_state)); + } + }; + let region = region_state.get_region(); + let logger = logger.new(o!("region_id" => region.id, "peer_id" => peer.get_id())); + let has_dirty_data = + match engine.get_dirty_mark(region.get_id(), region_state.get_tablet_index()) { + Ok(b) => b, + Err(e) => { + return Err(box_err!( + "failed to get dirty mark for {}: {:?}", + region.get_id(), + e + )); + } + }; + let entry_storage = EntryStorage::new( + peer.get_id(), + engine, + raft_state, + apply_state, + region, + read_scheduler, + )?; + + Ok(Storage { + entry_storage, + peer: peer.clone(), + region_state, + ever_persisted: persisted, + has_dirty_data, + logger, + snap_states: RefCell::new(HashMap::default()), + gen_snap_task: RefCell::new(Box::new(None)), + split_init: None, + apply_trace, + }) + } + + #[inline] + pub fn region_state_mut(&mut self) -> &mut RegionLocalState { + &mut self.region_state + } + + #[inline] + pub fn split_init_mut(&mut self) -> &mut Option> { + &mut self.split_init + } + + #[inline] + pub fn raft_state(&self) -> &RaftLocalState { + self.entry_storage.raft_state() + } + + #[inline] + pub fn read_scheduler(&self) -> Scheduler> { + self.entry_storage.read_scheduler() + } + + #[inline] + pub fn apply_state(&self) -> &RaftApplyState { + self.entry_storage.apply_state() + } + + /// Check if the storage is initialized. + /// + /// The storage is considered initialized when data is applied in memory. 
+ #[inline] + pub fn is_initialized(&self) -> bool { + self.region_state.get_tablet_index() != 0 + } + + pub fn ever_persisted(&self) -> bool { + self.ever_persisted + } + + pub fn set_ever_persisted(&mut self) { + self.ever_persisted = true; + } + + #[inline] + pub fn take_gen_snap_task(&mut self) -> Option { + self.gen_snap_task.get_mut().take() + } + + #[inline] + pub fn tablet_index(&self) -> u64 { + self.region_state.get_tablet_index() + } + + #[inline] + pub fn set_region_state(&mut self, state: RegionLocalState) { + self.region_state = state; + for peer in self.region_state.get_region().get_peers() { + if peer.get_id() == self.peer.get_id() { + self.peer = peer.clone(); + break; + } + } + } +} + +impl raft::Storage for Storage { + fn initial_state(&self) -> raft::Result { + let hard_state = self.raft_state().get_hard_state().clone(); + // We will persist hard state no matter if it's initialized or not in + // v2, So hard state may not be empty. But when it becomes initialized, + // commit must be changed. + assert_eq!( + hard_state.commit == 0, + !self.is_initialized(), + "region state doesn't match raft state {:?} vs {:?}", + self.region_state(), + self.raft_state() + ); + + if hard_state.commit == 0 { + // If it's uninitialized, return empty state as we consider every + // states are empty at the very beginning. 
+ return Ok(RaftState::new(hard_state, ConfState::default())); + } + Ok(RaftState::new( + hard_state, + util::conf_state_from_region(self.region()), + )) + } + + #[inline] + fn entries( + &self, + low: u64, + high: u64, + max_size: impl Into>, + context: GetEntriesContext, + ) -> raft::Result> { + self.entry_storage + .entries(low, high, max_size.into().unwrap_or(u64::MAX), context) + } + + #[inline] + fn term(&self, idx: u64) -> raft::Result { + self.entry_storage.term(idx) + } + + #[inline] + fn first_index(&self) -> raft::Result { + Ok(self.entry_storage.first_index()) + } + + #[inline] + fn last_index(&self) -> raft::Result { + Ok(self.entry_storage.last_index()) + } + + fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { + self.snapshot(request_index, to) + } +} + +#[cfg(test)] +mod tests { + use std::{ + sync::{ + mpsc::{sync_channel, Receiver, SyncSender}, + Arc, + }, + time::Duration, + }; + + use engine_test::{ + ctor::{CfOptions, DbOptions}, + kv::{KvTestEngine, TestTabletFactory}, + }; + use engine_traits::{ + FlushState, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, DATA_CFS, + }; + use kvproto::{ + metapb::{Peer, Region}, + raft_serverpb::PeerState, + }; + use raft::{Error as RaftError, StorageError}; + use raftstore::{ + coprocessor::CoprocessorHost, + store::{ + util::new_empty_snapshot, write_to_db_for_test, AsyncReadNotifier, Config, FetchedLogs, + GenSnapRes, ReadRunner, TabletSnapKey, TabletSnapManager, WriteTask, + RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, + }, + }; + use slog::o; + use tempfile::TempDir; + use tikv_util::worker::Worker; + + use super::*; + use crate::{ + fsm::ApplyResReporter, + operation::{test_util::create_tmp_importer, write_initial_states, CatchUpLogs}, + raft::Apply, + router::ApplyRes, + }; + + #[derive(Clone)] + pub struct TestRouter { + ch: SyncSender, + } + + impl TestRouter { + pub fn new() -> (Self, Receiver) { + let (tx, rx) = sync_channel(1); + (Self { ch: tx }, rx) + } + } + + impl 
AsyncReadNotifier for TestRouter { + fn notify_logs_fetched(&self, _region_id: u64, _fetched_logs: FetchedLogs) { + unreachable!(); + } + + fn notify_snapshot_generated(&self, _region_id: u64, res: GenSnapRes) { + self.ch.send(res).unwrap(); + } + } + + impl ApplyResReporter for TestRouter { + fn report(&self, _res: ApplyRes) {} + fn redirect_catch_up_logs(&self, _c: CatchUpLogs) {} + } + + fn new_region() -> Region { + let mut region = Region::default(); + region.set_id(4); + let mut p = Peer::default(); + p.set_id(5); + p.set_store_id(6); + region.mut_peers().push(p); + region.mut_region_epoch().set_version(2); + region.mut_region_epoch().set_conf_ver(4); + region + } + + fn new_entry(index: u64, term: u64) -> Entry { + let mut e = Entry::default(); + e.set_index(index); + e.set_term(term); + e + } + + #[test] + fn test_apply_snapshot() { + let region = new_region(); + let path = TempDir::new().unwrap(); + let mgr = + TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap(), None).unwrap(); + let engines = engine_test::new_temp_engine(&path); + let raft_engine = engines.raft.clone(); + let mut wb = raft_engine.log_batch(10); + write_initial_states(&mut wb, region.clone()).unwrap(); + assert!(!wb.is_empty()); + raft_engine.consume(&mut wb, true).unwrap(); + // building a tablet factory + let ops = DbOptions::default(); + let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); + let reg = TabletRegistry::new(factory, path.path().join("tablets")).unwrap(); + let worker = Worker::new("test-read-worker").lazy_build("test-read-worker"); + let sched = worker.scheduler(); + let logger = slog_global::borrow_global().new(o!()); + let mut s = Storage::new(4, 6, raft_engine.clone(), sched, &logger.clone()) + .unwrap() + .unwrap(); + + let mut task = WriteTask::new(region.get_id(), 5, 1); + let entries = (RAFT_INIT_LOG_INDEX + 1..RAFT_INIT_LOG_INDEX + 10) + .map(|i| new_entry(i, 
RAFT_INIT_LOG_TERM)) + .collect(); + s.entry_storage_mut().append(entries, &mut task); + write_to_db_for_test(&engines, task); + + let snap_index = RAFT_INIT_LOG_INDEX + 20; + let snap_term = 9; + let path = mgr.final_recv_path(&TabletSnapKey::new( + region.get_id(), + 5, + snap_term, + snap_index, + )); + reg.tablet_factory() + .open_tablet(TabletContext::new(®ion, Some(snap_index)), &path) + .unwrap(); + let snapshot = new_empty_snapshot(region.clone(), snap_index, snap_term, false); + let mut task = WriteTask::new(region.get_id(), 5, 1); + s.apply_snapshot(&snapshot, &mut task, &mgr, ®, None) + .unwrap(); + // Add more entries to check if old entries are cleared. If not, it should panic + // with memtable hole when using raft engine. + let entries = (snap_index + 1..=snap_index + 10) + .map(|i| new_entry(i, snap_term)) + .collect(); + s.entry_storage_mut().append(entries, &mut task); + + assert!(!reg.tablet_path(region.get_id(), snap_index).exists()); + assert!(!task.persisted_cbs.is_empty()); + + write_to_db_for_test(&engines, task); + + assert!(reg.tablet_path(region.get_id(), snap_index).exists()); + + // It can be set before load tablet. + assert_eq!(PeerState::Normal, s.region_state().get_state()); + assert_eq!(snap_index, s.entry_storage().truncated_index()); + assert_eq!(snap_term, s.entry_storage().truncated_term()); + assert_eq!(snap_term, s.entry_storage().last_term()); + assert_eq!(snap_index + 10, s.entry_storage().raft_state().last_index); + // This index can't be set before load tablet. 
+ assert_ne!(snap_index, s.entry_storage().applied_index()); + assert_ne!(snap_term, s.entry_storage().applied_term()); + assert_eq!(snap_index, s.region_state().get_tablet_index()); + + s.on_applied_snapshot(); + assert_eq!(snap_index, s.entry_storage().applied_index()); + assert_eq!(snap_term, s.entry_storage().applied_term()); + assert_eq!(snap_index, s.region_state().get_tablet_index()); + } + + #[test] + fn test_storage_create_snapshot() { + let region = new_region(); + let path = TempDir::new().unwrap(); + let raft_engine = + engine_test::raft::new_engine(&format!("{}", path.path().join("raft").display()), None) + .unwrap(); + let mut wb = raft_engine.log_batch(10); + write_initial_states(&mut wb, region.clone()).unwrap(); + assert!(!wb.is_empty()); + raft_engine.consume(&mut wb, true).unwrap(); + let mgr = + TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap(), None).unwrap(); + // building a tablet factory + let ops = DbOptions::default(); + let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); + let reg = TabletRegistry::new(factory, path.path().join("tablets")).unwrap(); + let tablet_ctx = TabletContext::new(®ion, Some(10)); + reg.load(tablet_ctx, true).unwrap(); + // setup read runner worker and peer storage + let mut worker = Worker::new("test-read-worker").lazy_build("test-read-worker"); + let sched = worker.scheduler(); + let logger = slog_global::borrow_global().new(o!()); + let s = Storage::new(4, 6, raft_engine.clone(), sched.clone(), &logger.clone()) + .unwrap() + .unwrap(); + let (router, rx) = TestRouter::new(); + let mut read_runner = ReadRunner::new(router.clone(), raft_engine); + read_runner.set_snap_mgr(mgr.clone()); + worker.start(read_runner); + let mut state = RegionLocalState::default(); + state.set_region(region.clone()); + let (_tmp_dir, importer) = create_tmp_importer(); + let host = CoprocessorHost::::default(); + // setup peer 
applyer + let mut apply = Apply::new( + &Config::default(), + region.get_peers()[0].clone(), + state, + router, + reg, + sched, + Arc::new(FlushState::new(5)), + None, + 5, + None, + importer, + host, + logger, + ); + + // Test get snapshot + let to_peer_id = 7; + let snap = s.snapshot(0, to_peer_id); + let unavailable = RaftError::Store(StorageError::SnapshotTemporarilyUnavailable); + assert_eq!(snap.unwrap_err(), unavailable); + let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); + apply.schedule_gen_snapshot(gen_task); + let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + s.on_snapshot_generated(res); + assert_eq!(s.snapshot(0, 8).unwrap_err(), unavailable); + assert!(s.snap_states.borrow().get(&8).is_some()); + let snap = match *s.snap_states.borrow().get(&to_peer_id).unwrap() { + SnapState::Generated(ref snap) => *snap.clone(), + ref s => panic!("unexpected state: {:?}", s), + }; + assert_eq!(snap.get_metadata().get_index(), 5); + assert_eq!(snap.get_metadata().get_term(), 5); + assert_eq!(snap.get_data().is_empty(), false); + let snap_key = TabletSnapKey::from_region_snap(4, 7, &snap); + let checkpointer_path = mgr.tablet_gen_path(&snap_key); + assert!(checkpointer_path.exists()); + s.snapshot(0, to_peer_id).unwrap(); + + // Test cancel snapshot + let snap = s.snapshot(0, 7); + assert_eq!(snap.unwrap_err(), unavailable); + let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); + apply.schedule_gen_snapshot(gen_task); + let _res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + s.cancel_generating_snap(None); + assert!(s.snap_states.borrow().get(&to_peer_id).is_none()); + + // Test get twice snapshot and cancel once. 
+ // get snapshot a + let snap = s.snapshot(0, 0); + assert_eq!(snap.unwrap_err(), unavailable); + let gen_task_a = s.gen_snap_task.borrow_mut().take().unwrap(); + apply.set_apply_progress(1, 5); + apply.schedule_gen_snapshot(gen_task_a); + let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + s.cancel_generating_snap(None); + // cancel get snapshot a, try get snaphsot b + let snap = s.snapshot(0, 0); + assert_eq!(snap.unwrap_err(), unavailable); + let gen_task_b = s.gen_snap_task.borrow_mut().take().unwrap(); + apply.set_apply_progress(10, 5); + apply.schedule_gen_snapshot(gen_task_b); + // on snapshot a and b + assert_eq!(s.on_snapshot_generated(res), false); + let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + assert_eq!(s.on_snapshot_generated(res), true); + } +} diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs new file mode 100644 index 00000000000..b28dc95aa35 --- /dev/null +++ b/components/raftstore-v2/src/router/imp.rs @@ -0,0 +1,251 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + borrow::Cow, + sync::{Arc, Mutex}, +}; + +use crossbeam::channel::{SendError, TrySendError}; +use engine_traits::{KvEngine, RaftEngine}; +use futures::Future; +use kvproto::{ + kvrpcpb::ExtraOp, + metapb::RegionEpoch, + raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, + raft_serverpb::RaftMessage, +}; +use raftstore::{ + router::CdcHandle, + store::{ + fsm::ChangeObserver, AsyncReadNotifier, Callback, FetchedLogs, GenSnapRes, RegionSnapshot, + }, +}; +use slog::warn; + +use super::{build_any_channel, message::CaptureChange, PeerMsg, QueryResChannel, QueryResult}; +use crate::{batch::StoreRouter, operation::LocalReader, StoreMeta}; + +impl AsyncReadNotifier for StoreRouter { + fn notify_logs_fetched(&self, region_id: u64, fetched_logs: FetchedLogs) { + let _ = self.force_send(region_id, PeerMsg::LogsFetched(fetched_logs)); + } + + fn notify_snapshot_generated(&self, region_id: u64, snapshot: GenSnapRes) { + let _ = self.force_send(region_id, PeerMsg::SnapshotGenerated(snapshot)); + } +} + +impl raftstore::coprocessor::StoreHandle for StoreRouter { + fn update_approximate_size(&self, region_id: u64, size: u64) { + let _ = self.send(region_id, PeerMsg::UpdateRegionSize { size }); + } + + fn update_approximate_keys(&self, region_id: u64, keys: u64) { + let _ = self.send(region_id, PeerMsg::UpdateRegionKeys { keys }); + } + + fn ask_split( + &self, + region_id: u64, + region_epoch: kvproto::metapb::RegionEpoch, + split_keys: Vec>, + source: Cow<'static, str>, + ) { + let (msg, _) = PeerMsg::request_split(region_epoch, split_keys, source.to_string()); + let res = self.send(region_id, msg); + if let Err(e) = res { + warn!( + self.logger(), + "failed to send ask split"; + "region_id" => region_id, + "err" => %e, + ); + } + } + + fn refresh_region_buckets( + &self, + region_id: u64, + region_epoch: kvproto::metapb::RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + ) { + let res = self.send( + region_id, + PeerMsg::RefreshRegionBuckets { + region_epoch, 
+ buckets, + bucket_ranges, + }, + ); + if let Err(e) = res { + warn!( + self.logger(), + "failed to refresh region buckets"; + "err" => %e, + ); + } + } + + fn update_compute_hash_result( + &self, + _region_id: u64, + _index: u64, + _context: Vec, + _hash: Vec, + ) { + // TODO + } +} + +/// A router that routes messages to the raftstore +pub struct RaftRouter +where + EK: KvEngine, + ER: RaftEngine, +{ + router: StoreRouter, + local_reader: LocalReader>, +} + +impl Clone for RaftRouter +where + EK: KvEngine, + ER: RaftEngine, +{ + fn clone(&self) -> Self { + RaftRouter { + router: self.router.clone(), + local_reader: self.local_reader.clone(), + } + } +} + +impl RaftRouter { + pub fn new(store_id: u64, router: StoreRouter) -> Self { + let store_meta = Arc::new(Mutex::new(StoreMeta::new(store_id))); + + let logger = router.logger().clone(); + RaftRouter { + router: router.clone(), + local_reader: LocalReader::new(store_meta, router, logger), + } + } + + pub fn store_router(&self) -> &StoreRouter { + &self.router + } + + pub fn send(&self, addr: u64, msg: PeerMsg) -> Result<(), TrySendError> { + self.router.send(addr, msg) + } + + #[inline] + pub fn check_send(&self, addr: u64, msg: PeerMsg) -> crate::Result<()> { + self.router.check_send(addr, msg) + } + + pub fn store_meta(&self) -> &Arc>> { + self.local_reader.store_meta() + } + + pub fn send_raft_message( + &self, + msg: Box, + ) -> std::result::Result<(), TrySendError>> { + self.router.send_raft_message(msg) + } + + pub fn snapshot( + &mut self, + req: RaftCmdRequest, + ) -> impl Future, RaftCmdResponse>> + Send + { + self.local_reader.snapshot(req) + } + + #[cfg(any(test, feature = "testexport"))] + pub fn new_with_store_meta( + router: StoreRouter, + store_meta: Arc>>, + ) -> Self { + let logger = router.logger().clone(); + RaftRouter { + router: router.clone(), + local_reader: LocalReader::new(store_meta, router, logger), + } + } +} + +impl CdcHandle for RaftRouter { + fn capture_change( + &self, + 
region_id: u64, + region_epoch: RegionEpoch, + observer: ChangeObserver, + callback: Callback, + ) -> crate::Result<()> { + let (snap_cb, _) = build_any_channel(Box::new(move |args| { + let (resp, snap) = (&args.0, args.1.take()); + if let Some(snap) = snap { + let snapshot: RegionSnapshot = match snap.downcast() { + Ok(s) => *s, + Err(t) => unreachable!("snapshot type should be the same: {:?}", t), + }; + callback.invoke_read(raftstore::store::ReadResponse { + response: Default::default(), + snapshot: Some(snapshot), + txn_extra_op: ExtraOp::Noop, + }) + } else { + callback.invoke_read(raftstore::store::ReadResponse { + response: resp.clone(), + snapshot: None, + txn_extra_op: ExtraOp::Noop, + }); + } + })); + if let Err(SendError(msg)) = self.router.force_send( + region_id, + PeerMsg::CaptureChange(CaptureChange { + observer, + region_epoch, + snap_cb, + }), + ) { + warn!(self.router.logger(), "failed to send capture change msg"; "msg" => ?msg); + return Err(crate::Error::RegionNotFound(region_id)); + } + Ok(()) + } + + fn check_leadership( + &self, + region_id: u64, + callback: Callback, + ) -> crate::Result<()> { + let (ch, _) = QueryResChannel::with_callback(Box::new(|res| { + let resp = match res { + QueryResult::Read(_) => raftstore::store::ReadResponse { + response: Default::default(), + snapshot: None, + txn_extra_op: ExtraOp::Noop, + }, + QueryResult::Response(resp) => raftstore::store::ReadResponse { + response: resp.clone(), + snapshot: None, + txn_extra_op: ExtraOp::Noop, + }, + }; + callback.invoke_read(resp); + })); + if let Err(SendError(msg)) = self + .router + .force_send(region_id, PeerMsg::LeaderCallback(ch)) + { + warn!(self.router.logger(), "failed to send capture change msg"; "msg" => ?msg); + return Err(crate::Error::RegionNotFound(region_id)); + } + Ok(()) + } +} diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs new file mode 100644 index 00000000000..6c8d1136b3a --- 
/dev/null +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -0,0 +1,28 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use pd_client::{BucketMeta, BucketStat}; +use raftstore::store::fsm::ApplyMetrics; + +use super::message::CaptureChange; +use crate::operation::{AdminCmdResult, CommittedEntries, DataTrace, GenSnapTask}; + +/// Crate-internal task kinds: committed entries, snapshot generation, raw writes, manual flush, bucket-stat refresh and change capture. +#[derive(Debug)] +pub enum ApplyTask { + CommittedEntries(CommittedEntries), + Snapshot(GenSnapTask), + /// Writes that don't care about consistency. + UnsafeWrite(Box<[u8]>), + ManualFlush, + RefreshBucketStat(std::sync::Arc<BucketMeta>), + CaptureApply(CaptureChange), +} + +/// Outcome of an apply round: the applied index and term plus the admin results, modifications, metrics and optional bucket statistics that go with it. +#[derive(Debug, Default)] +pub struct ApplyRes { + pub applied_index: u64, + pub applied_term: u64, + pub admin_result: Box<[AdminCmdResult]>, + pub modifications: DataTrace, + pub metrics: ApplyMetrics, + pub bucket_stat: Option<BucketStat>, +} diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs new file mode 100644 index 00000000000..3f761c74f94 --- /dev/null +++ b/components/raftstore-v2/src/router/message.rs @@ -0,0 +1,317 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0.
+ +// #[PerformanceCriticalPath] + +use kvproto::{ + import_sstpb::SstMeta, + metapb, + metapb::RegionEpoch, + raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}, + raft_serverpb::RaftMessage, +}; +use raftstore::store::{ + fsm::ChangeObserver, metrics::RaftEventDurationType, simple_write::SimpleWriteBinary, + FetchedLogs, GenSnapRes, +}; +use resource_control::ResourceMetered; +use tikv_util::time::Instant; + +use super::response_channel::{ + AnyResChannel, CmdResChannel, CmdResSubscriber, DebugInfoChannel, QueryResChannel, + QueryResSubscriber, +}; +use crate::{ + operation::{CatchUpLogs, RequestHalfSplit, RequestSplit, SplitInit}, + router::ApplyRes, +}; + +/// Ticks that can be delivered to a peer through `PeerMsg::Tick`. +#[derive(Debug, Clone, Copy, PartialEq, Hash)] +#[repr(u8)] +pub enum PeerTick { + Raft = 0, + CompactLog = 1, + SplitRegionCheck = 2, + PdHeartbeat = 3, + CheckMerge = 4, + CheckPeerStaleState = 5, + EntryCacheEvict = 6, + CheckLeaderLease = 7, + ReactivateMemoryLock = 8, + ReportBuckets = 9, + CheckLongUncommitted = 10, + GcPeer = 11, +} + +impl PeerTick { + /// Number of variants, derived from `all_ticks`. + pub const VARIANT_COUNT: usize = Self::all_ticks().len(); + + /// Returns the snake_case name of the tick. + #[inline] + pub fn tag(self) -> &'static str { + match self { + PeerTick::Raft => "raft", + PeerTick::CompactLog => "compact_log", + PeerTick::SplitRegionCheck => "split_region_check", + PeerTick::PdHeartbeat => "pd_heartbeat", + PeerTick::CheckMerge => "check_merge", + PeerTick::CheckPeerStaleState => "check_peer_stale_state", + PeerTick::EntryCacheEvict => "entry_cache_evict", + PeerTick::CheckLeaderLease => "check_leader_lease", + PeerTick::ReactivateMemoryLock => "reactivate_memory_lock", + PeerTick::ReportBuckets => "report_buckets", + PeerTick::CheckLongUncommitted => "check_long_uncommitted", + PeerTick::GcPeer => "gc_peer", + } + } + + /// Returns every variant, in discriminant order. + pub const fn all_ticks() -> &'static [PeerTick] { + const TICKS: &[PeerTick] = &[ + PeerTick::Raft, + PeerTick::CompactLog, + PeerTick::SplitRegionCheck, + PeerTick::PdHeartbeat, + PeerTick::CheckMerge, + PeerTick::CheckPeerStaleState, +
PeerTick::EntryCacheEvict, + PeerTick::CheckLeaderLease, + PeerTick::ReactivateMemoryLock, + PeerTick::ReportBuckets, + PeerTick::CheckLongUncommitted, + PeerTick::GcPeer, + ]; + TICKS + } +} + +#[derive(Debug, Clone, Copy)] +pub enum StoreTick { + // No CompactLock and CompactCheck as they should be implemented by peer itself. + PdStoreHeartbeat, + SnapGc, + ConsistencyCheck, + CleanupImportSst, +} + +impl StoreTick { + /// Maps the tick to its `RaftEventDurationType` label. + #[inline] + pub fn tag(self) -> RaftEventDurationType { + match self { + StoreTick::PdStoreHeartbeat => RaftEventDurationType::pd_store_heartbeat, + StoreTick::SnapGc => RaftEventDurationType::snap_gc, + StoreTick::ConsistencyCheck => RaftEventDurationType::consistency_check, + StoreTick::CleanupImportSst => RaftEventDurationType::cleanup_import_sst, + } + } +} + +/// Command that can be handled by raftstore. +#[derive(Debug)] +pub struct RaftRequest { + pub send_time: Instant, + pub request: RaftCmdRequest, + pub ch: C, +} + +impl RaftRequest { + /// Wraps `request` with its channel and records the current time as `send_time`. + pub fn new(request: RaftCmdRequest, ch: C) -> Self { + RaftRequest { + send_time: Instant::now(), + request, + ch, + } + } +} + +#[derive(Debug)] +pub struct SimpleWrite { + pub send_time: Instant, + pub header: Box, + pub data: SimpleWriteBinary, + pub ch: CmdResChannel, +} + +#[derive(Debug)] +pub struct UnsafeWrite { + pub send_time: Instant, + pub data: SimpleWriteBinary, +} + +#[derive(Debug)] +pub struct CaptureChange { + pub observer: ChangeObserver, + pub region_epoch: RegionEpoch, + // A callback that accepts a snapshot. + pub snap_cb: AnyResChannel, +} + +/// Message that can be sent to a peer. +#[derive(Debug)] +pub enum PeerMsg { + /// Raft message is the message sent between raft nodes in the same + /// raft group. Messages need to be redirected to raftstore if target + /// peer doesn't exist. + RaftMessage(Box), + /// Query won't change any state. A typical query is KV read. In most cases, + /// it will be processed using lease or read index.
+ RaftQuery(RaftRequest), + /// Command changes the internal states. It will be transformed into logs and + /// applied on all replicas. + SimpleWrite(SimpleWrite), + UnsafeWrite(UnsafeWrite), + /// Command that contains admin requests. + AdminCommand(RaftRequest), + /// Tick is a periodical task. If target peer doesn't exist there is a + /// potential that the raft node will not work anymore. + Tick(PeerTick), + /// Result of applying committed entries. The message can't be lost. + ApplyRes(ApplyRes), + LogsFetched(FetchedLogs), + SnapshotGenerated(GenSnapRes), + /// Start the FSM. + Start, + /// Messages from peer to peer in the same store + SplitInit(Box), + SplitInitFinish(u64), + /// A message only used to notify a peer. + Noop, + /// A message that indicates an asynchronous write has finished. + Persisted { + peer_id: u64, + ready_number: u64, + }, + QueryDebugInfo(DebugInfoChannel), + DataFlushed { + cf: &'static str, + tablet_index: u64, + flushed_index: u64, + }, + PeerUnreachable { + to_peer_id: u64, + }, + StoreUnreachable { + to_store_id: u64, + }, + /// Reports whether the snapshot sending is successful or not. + SnapshotSent { + to_peer_id: u64, + status: raft::SnapshotStatus, + }, + RequestSplit { + request: RequestSplit, + ch: CmdResChannel, + }, + RefreshRegionBuckets { + region_epoch: RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + }, + RequestHalfSplit { + request: RequestHalfSplit, + ch: CmdResChannel, + }, + UpdateRegionSize { + size: u64, + }, + UpdateRegionKeys { + keys: u64, + }, + ClearRegionSize, + ForceCompactLog, + TabletTrimmed { + tablet_index: u64, + }, + CleanupImportSst(Box<[SstMeta]>), + AskCommitMerge(RaftCmdRequest), + AckCommitMerge { + index: u64, + target_id: u64, + }, + RejectCommitMerge { + index: u64, + }, + // From target [`Apply`] to target [`Peer`]. + RedirectCatchUpLogs(CatchUpLogs), + // From target [`Peer`] to source [`Peer`]. + CatchUpLogs(CatchUpLogs), + /// Capture changes of a region.
+ CaptureChange(CaptureChange), + LeaderCallback(QueryResChannel), + /// A message that is used to check whether a flush has happened. + #[cfg(feature = "testexport")] + WaitFlush(super::FlushChannel), +} + +impl ResourceMetered for PeerMsg {} + +impl PeerMsg { + /// Creates a `RaftQuery` message and the subscriber of its result. + pub fn raft_query(req: RaftCmdRequest) -> (Self, QueryResSubscriber) { + let (ch, sub) = QueryResChannel::pair(); + (PeerMsg::RaftQuery(RaftRequest::new(req, ch)), sub) + } + + /// Creates an `AdminCommand` message and the subscriber of its result. + pub fn admin_command(req: RaftCmdRequest) -> (Self, CmdResSubscriber) { + let (ch, sub) = CmdResChannel::pair(); + (PeerMsg::AdminCommand(RaftRequest::new(req, ch)), sub) + } + + /// Creates a `SimpleWrite` message stamped with the current time, and the subscriber of its result. + pub fn simple_write( + header: Box, + data: SimpleWriteBinary, + ) -> (Self, CmdResSubscriber) { + let (ch, sub) = CmdResChannel::pair(); + ( + PeerMsg::SimpleWrite(SimpleWrite { + send_time: Instant::now(), + header, + data, + ch, + }), + sub, + ) + } + + /// Creates an `UnsafeWrite` message stamped with the current time; it carries no result channel. + pub fn unsafe_write(data: SimpleWriteBinary) -> Self { + PeerMsg::UnsafeWrite(UnsafeWrite { + send_time: Instant::now(), + data, + }) + } + + /// Creates a `RequestSplit` message and the subscriber of its result. + pub fn request_split( + epoch: metapb::RegionEpoch, + split_keys: Vec>, + source: String, + ) -> (Self, CmdResSubscriber) { + let (ch, sub) = CmdResChannel::pair(); + ( + PeerMsg::RequestSplit { + request: RequestSplit { + epoch, + split_keys, + source: source.into(), + }, + ch, + }, + sub, + ) + } +} + +#[derive(Debug)] +pub enum StoreMsg { + RaftMessage(Box), + SplitInit(Box), + Tick(StoreTick), + Start, + StoreUnreachable { + to_store_id: u64, + }, + AskCommitMerge(RaftCmdRequest), + /// A message that is used to check whether a flush has happened. + #[cfg(feature = "testexport")] + WaitFlush { + region_id: u64, + ch: super::FlushChannel, + }, +} + +impl ResourceMetered for StoreMsg {} diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs new file mode 100644 index 00000000000..2d0011c1ef0 --- /dev/null +++ b/components/raftstore-v2/src/router/mod.rs @@ -0,0 +1,22 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0.
+ +mod imp; +mod internal_message; +pub mod message; +mod response_channel; + +pub(crate) use self::internal_message::ApplyTask; +#[cfg(feature = "testexport")] +pub use self::response_channel::FlushChannel; +#[cfg(feature = "testexport")] +pub use self::response_channel::FlushSubscriber; +pub use self::{ + imp::RaftRouter, + internal_message::ApplyRes, + message::{PeerMsg, PeerTick, RaftRequest, StoreMsg, StoreTick}, + response_channel::{ + build_any_channel, AnyResChannel, AnyResSubscriber, BaseSubscriber, CmdResChannel, + CmdResChannelBuilder, CmdResEvent, CmdResStream, CmdResSubscriber, DebugInfoChannel, + DebugInfoSubscriber, QueryResChannel, QueryResult, ReadResponse, + }, +}; diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs new file mode 100644 index 00000000000..c300b6d8726 --- /dev/null +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -0,0 +1,797 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! Variants of channels for `Msg`. +//! - `Read`: a channel for read-only requests including `StatusRequest`, +//! `GetRequest` and `SnapRequest`. +//! - `Write`: a channel for write-only requests including `AdminRequest`, +//! `PutRequest`, `DeleteRequest` and `DeleteRangeRequest`. +//! +//! Prefer channels over callbacks because: +//! 1. a channel can be reused, hence reducing allocations (not yet implemented). +//! 2. a channel may not need dynamic dispatch. +//! 3. callers can use it in an async fashion. +//! 4. there will be no callback leak.
+ +use std::{ + any::Any, + cell::UnsafeCell, + fmt::{self, Debug, Formatter}, + future::Future, + pin::Pin, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + task::{Context, Poll}, +}; + +use futures::{task::AtomicWaker, FutureExt, Stream}; +use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, raft_cmdpb::RaftCmdResponse}; +use raftstore::store::{ + local_metrics::TimeTracker, msg::ErrorCallback, region_meta::RegionMeta, ReadCallback, + WriteCallback, +}; +use tracker::{get_tls_tracker_token, TrackerToken}; + +union Tracker { + read: TrackerToken, + write: TimeTracker, +} + +/// A struct allows to watch and notify specific events. +/// +/// There are two different events: state and payload. Obviously, state events +/// have no payload. At most 30 states can be defined. There can be only one +/// type of payload. +struct EventCore { + /// Every event will have two bits. + /// - 0b00 means the event is not fired and not subscribed. + /// - 0b01 means the event is fired and not subscribed. + /// - 0b10 means the event is not fired and subscribed. + /// - 0b11 means the event is fired and subscribed. + /// Event 0 and Event 31 is reserved as payload and cancel respectively. + /// Other events should be defined within [1, 30]. + event: AtomicU64, + /// Even a channel supports multiple events, it's not necessary to trigger + /// all of them. `event_mask` is used to filter unnecessary events. + event_mask: u32, + res: UnsafeCell>, + before_set: UnsafeCell>>, + // Waker can be changed, need to use `AtomicWaker` to guarantee no data race. 
+ waker: AtomicWaker, + tracker: UnsafeCell, +} + +unsafe impl Send for EventCore {} + +const PAYLOAD_EVENT: u64 = 0; +const CANCEL_EVENT: u64 = 31; + +const fn event_mask_bit_of(event: u64) -> u32 { + 1 << event +} + +#[inline] +const fn subscribed_bit_of(event: u64) -> u64 { + 1 << (event * 2) +} + +#[inline] +const fn fired_bit_of(event: u64) -> u64 { + 1 << (event * 2 + 1) +} + +impl EventCore { + #[inline] + fn notify_event(&self, event: u64) { + if self.event_mask & event_mask_bit_of(event) != 0 { + let previous = self.event.fetch_or(fired_bit_of(event), Ordering::AcqRel); + if previous & subscribed_bit_of(event) != 0 { + self.waker.wake() + } + } + } + + /// Set the result. + /// + /// After this call, no events should be notified. + #[inline] + fn set_result(&self, mut result: Res) { + unsafe { + if let Some(cb) = (*self.before_set.get()).take() { + cb(&mut result); + } + *self.res.get() = Some(result); + } + let previous = self.event.fetch_or( + fired_bit_of(PAYLOAD_EVENT) | fired_bit_of(CANCEL_EVENT), + Ordering::AcqRel, + ); + if previous & subscribed_bit_of(PAYLOAD_EVENT) != 0 { + self.waker.wake() + } + } + + /// Cancel all subscribers. + /// + /// After this call, no events should be notified and no result should be + /// set. + #[inline] + fn cancel(&self) { + let mut previous = self + .event + .fetch_or(fired_bit_of(CANCEL_EVENT), Ordering::AcqRel); + let subscribed_bit = subscribed_bit_of(0); + while previous != 0 { + // Not notified yet. 
+ if previous & 0b11 == subscribed_bit { + self.waker.wake(); + return; + } + previous >>= 2; + } + } +} + +struct WaitEvent<'a, Res> { + event: u64, + core: &'a EventCore, +} + +#[inline] +fn check_bit(e: u64, fired_bit: u64) -> Option { + if e & fired_bit != 0 { + return Some(true); + } + let cancel_bit = fired_bit_of(CANCEL_EVENT); + if e & cancel_bit != 0 { + return Some(false); + } + None +} + +impl<'a, Res> Future for WaitEvent<'a, Res> { + type Output = bool; + + #[inline] + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let event = &self.core.event; + let mut e = event.load(Ordering::Relaxed); + let fired_bit = fired_bit_of(self.event); + if let Some(b) = check_bit(e, fired_bit) { + return Poll::Ready(b); + } + self.core.waker.register(cx.waker()); + let subscribed_bit = subscribed_bit_of(self.event); + loop { + match event.compare_exchange_weak( + e, + e | subscribed_bit, + Ordering::AcqRel, + Ordering::Relaxed, + ) { + Ok(_) => return Poll::Pending, + Err(v) => e = v, + }; + if let Some(b) = check_bit(e, fired_bit) { + return Poll::Ready(b); + } + } + } +} + +struct WaitResult<'a, Res> { + sub: &'a BaseSubscriber, +} + +impl<'a, Res> Future for WaitResult<'a, Res> { + type Output = Option; + + #[inline] + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let event = &self.sub.core.event; + let fired_bit = fired_bit_of(PAYLOAD_EVENT); + let mut e = event.load(Ordering::Relaxed); + if check_bit(e, fired_bit).is_some() { + unsafe { + return Poll::Ready((*self.sub.core.res.get()).take()); + } + } + let subscribed_bit = subscribed_bit_of(PAYLOAD_EVENT); + self.sub.core.waker.register(cx.waker()); + loop { + match event.compare_exchange_weak( + e, + e | subscribed_bit, + Ordering::AcqRel, + Ordering::Relaxed, + ) { + Ok(_) => return Poll::Pending, + Err(v) => e = v, + }; + if check_bit(e, fired_bit).is_some() { + unsafe { + return Poll::Ready((*self.sub.core.res.get()).take()); + } + } + } + } +} + +/// A base subscriber that 
contains most common implementation of subscribers. +pub struct BaseSubscriber { + core: Arc>, +} + +impl BaseSubscriber { + /// Wait for the result. + #[inline] + pub async fn result(self) -> Option { + WaitResult { sub: &self }.await + } + + /// Test if the result is ready without any polling. + #[inline] + pub fn has_result(&self) -> bool { + let e = self.core.event.load(Ordering::Relaxed); + check_bit(e, fired_bit_of(PAYLOAD_EVENT)).is_some() + } +} + +unsafe impl Send for BaseSubscriber {} +unsafe impl Sync for BaseSubscriber {} + +/// A base channel that contains most common implementation of channels. +pub struct BaseChannel { + core: Arc>, +} + +#[inline] +fn pair() -> (BaseChannel, BaseSubscriber) { + let tracker = Tracker { + read: get_tls_tracker_token(), + }; + BaseChannel::::with_mask(u32::MAX, tracker) +} + +impl BaseChannel { + #[inline] + fn with_mask(mask: u32, tracker: Tracker) -> (Self, BaseSubscriber) { + let core: Arc> = Arc::new(EventCore { + event: AtomicU64::new(0), + res: UnsafeCell::new(None), + event_mask: mask, + before_set: UnsafeCell::new(None), + waker: AtomicWaker::new(), + tracker: UnsafeCell::new(tracker), + }); + (Self { core: core.clone() }, BaseSubscriber { core }) + } + + /// Sets the final result. 
+ #[inline] + pub fn set_result(self, res: Res) { + self.core.set_result(res); + } + + pub fn with_callback(f: Box) -> (Self, BaseSubscriber) { + let (c, s) = pair(); + unsafe { + *c.core.before_set.get() = Some(f); + } + (c, s) + } +} + +impl Drop for BaseChannel { + #[inline] + fn drop(&mut self) { + self.core.cancel(); + } +} + +unsafe impl Send for BaseChannel {} +unsafe impl Sync for BaseChannel {} + +pub type CmdResSubscriber = BaseSubscriber; + +impl CmdResSubscriber { + pub async fn wait_proposed(&mut self) -> bool { + WaitEvent { + event: CmdResChannel::PROPOSED_EVENT, + core: &self.core, + } + .await + } + + pub async fn wait_committed(&mut self) -> bool { + WaitEvent { + event: CmdResChannel::COMMITTED_EVENT, + core: &self.core, + } + .await + } +} + +#[derive(Clone, Copy, Debug)] +enum CmdResPollStage { + ExpectProposed, + ExpectCommitted, + ExpectResult, + Drained, +} + +impl CmdResPollStage { + #[inline] + fn init(event_mask: u32) -> CmdResPollStage { + if event_mask & event_mask_bit_of(CmdResChannel::PROPOSED_EVENT) != 0 { + CmdResPollStage::ExpectProposed + } else if event_mask & event_mask_bit_of(CmdResChannel::COMMITTED_EVENT) != 0 { + CmdResPollStage::ExpectCommitted + } else { + CmdResPollStage::ExpectResult + } + } + + #[inline] + fn next(&mut self, event_mask: u32) { + *self = match self { + CmdResPollStage::ExpectProposed => { + if event_mask & event_mask_bit_of(CmdResChannel::COMMITTED_EVENT) == 0 { + CmdResPollStage::ExpectResult + } else { + CmdResPollStage::ExpectCommitted + } + } + CmdResPollStage::ExpectCommitted => CmdResPollStage::ExpectResult, + CmdResPollStage::ExpectResult => CmdResPollStage::Drained, + CmdResPollStage::Drained => CmdResPollStage::Drained, + } + } +} + +#[derive(Debug)] +pub enum CmdResEvent { + Proposed, + Committed, + Finished(RaftCmdResponse), +} + +pub struct CmdResStream { + sub: CmdResSubscriber, + stage: CmdResPollStage, +} + +impl CmdResStream { + #[inline] + pub fn new(sub: CmdResSubscriber) -> Self { + 
Self { + stage: CmdResPollStage::init(sub.core.event_mask), + sub, + } + } +} + +impl Stream for CmdResStream { + type Item = CmdResEvent; + + #[inline] + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let stream = self.get_mut(); + loop { + match stream.stage { + CmdResPollStage::ExpectProposed => { + match (WaitEvent { + event: CmdResChannel::PROPOSED_EVENT, + core: &stream.sub.core, + }) + .poll_unpin(cx) + { + Poll::Pending => return Poll::Pending, + Poll::Ready(b) => { + stream.stage.next(stream.sub.core.event_mask); + if b { + return Poll::Ready(Some(CmdResEvent::Proposed)); + } + } + } + } + CmdResPollStage::ExpectCommitted => { + match (WaitEvent { + event: CmdResChannel::COMMITTED_EVENT, + core: &stream.sub.core, + }) + .poll_unpin(cx) + { + Poll::Pending => return Poll::Pending, + Poll::Ready(b) => { + stream.stage.next(stream.sub.core.event_mask); + if b { + return Poll::Ready(Some(CmdResEvent::Committed)); + } + } + } + } + CmdResPollStage::ExpectResult => { + match (WaitResult { sub: &stream.sub }).poll_unpin(cx) { + Poll::Pending => return Poll::Pending, + Poll::Ready(res) => { + stream.stage.next(stream.sub.core.event_mask); + if let Some(res) = res { + return Poll::Ready(Some(CmdResEvent::Finished(res))); + } + } + } + } + CmdResPollStage::Drained => return Poll::Ready(None), + } + } + } +} + +pub type CmdResChannel = BaseChannel; + +impl Debug for CmdResChannel { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "CmdResChannel") + } +} + +#[derive(Default)] +pub struct CmdResChannelBuilder { + event_mask: u32, + before_set: Option>, +} + +impl CmdResChannelBuilder { + #[inline] + pub fn subscribe_proposed(&mut self) -> &mut Self { + self.event_mask |= event_mask_bit_of(CmdResChannel::PROPOSED_EVENT); + self + } + + #[inline] + pub fn subscribe_committed(&mut self) -> &mut Self { + self.event_mask |= event_mask_bit_of(CmdResChannel::COMMITTED_EVENT); + self + } + + #[inline] + pub fn before_set( + &mut self, 
+ f: impl FnOnce(&mut RaftCmdResponse) + Send + 'static, + ) -> &mut Self { + self.before_set = Some(Box::new(f)); + self + } + + #[inline] + pub fn build(self) -> (CmdResChannel, CmdResSubscriber) { + let tracker = Tracker { + write: TimeTracker::default(), + }; + let (c, s) = CmdResChannel::with_mask(self.event_mask, tracker); + if let Some(f) = self.before_set { + unsafe { + *c.core.before_set.get() = Some(f); + } + } + (c, s) + } +} + +pub type AnyResChannel = BaseChannel<(RaftCmdResponse, Option>)>; + +impl Debug for AnyResChannel { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "AnyResChannel") + } +} + +impl ErrorCallback for AnyResChannel { + fn report_error(self, err: RaftCmdResponse) { + self.set_result((err, None)); + } + + fn is_none(&self) -> bool { + false + } +} + +pub type AnyResSubscriber = BaseSubscriber<(RaftCmdResponse, Option>)>; + +pub fn build_any_channel( + f: Box>)) + Send>, +) -> (AnyResChannel, AnyResSubscriber) { + let (c, s) = pair(); + unsafe { + *c.core.before_set.get() = Some(f); + } + (c, s) +} + +impl CmdResChannel { + // Valid range is [1, 30] + const PROPOSED_EVENT: u64 = 1; + const COMMITTED_EVENT: u64 = 2; + + /// Creates a pair of channel and subscriber. + #[inline] + pub fn pair() -> (Self, CmdResSubscriber) { + let tracker = Tracker { + write: TimeTracker::default(), + }; + Self::with_mask(u32::MAX, tracker) + } +} + +impl ErrorCallback for CmdResChannel { + fn report_error(self, err: RaftCmdResponse) { + self.set_result(err); + } + + fn is_none(&self) -> bool { + false + } +} + +impl WriteCallback for CmdResChannel { + type Response = RaftCmdResponse; + + /// Called after a request is proposed to the raft group successfully. It's + /// used to notify the caller to move on early because it's very likely the + /// request will be applied to the raftstore. 
+ #[inline] + fn notify_proposed(&mut self) { + self.core.notify_event(Self::PROPOSED_EVENT); + } + + /// Called after a request is committed and before it's being applied, and + /// it's guaranteed that the request will be successfully applied soon. + #[inline] + fn notify_committed(&mut self) { + self.core.notify_event(Self::COMMITTED_EVENT); + } + + type TimeTrackerListRef<'a> = &'a [TimeTracker]; + #[inline] + fn write_trackers(&self) -> Self::TimeTrackerListRef<'_> { + std::slice::from_ref(unsafe { &(*self.core.tracker.get()).write }) + } + + type TimeTrackerListMut<'a> = &'a mut [TimeTracker]; + fn write_trackers_mut(&mut self) -> Self::TimeTrackerListMut<'_> { + std::slice::from_mut(unsafe { &mut (*self.core.tracker.get()).write }) + } + + // TODO: support executing hooks inside setting result. + #[inline] + fn set_result(self, res: RaftCmdResponse) { + self.set_result(res); + } +} + +/// Response for Read. +/// +/// Unlike v1, snapshot are always taken in LocalReader, hence snapshot doesn't +/// need to be a field of the struct. +#[derive(Clone, PartialEq, Debug)] +pub struct ReadResponse { + pub read_index: u64, + pub txn_extra_op: TxnExtraOp, +} + +impl ReadResponse { + pub fn new(read_index: u64) -> Self { + ReadResponse { + read_index, + txn_extra_op: TxnExtraOp::Noop, + } + } +} + +/// Possible result of a raft query. +#[derive(Clone, Debug, PartialEq)] +pub enum QueryResult { + /// If it's a read like get or snapshot, `ReadResponse` is returned on + /// success. + Read(ReadResponse), + /// If it's a status query, `RaftCmdResponse` is returned. If it's a read + /// like query, `RaftCmdResponse` is returned on error. 
+ Response(RaftCmdResponse), +} + +impl QueryResult { + pub fn read(&self) -> Option<&ReadResponse> { + match self { + QueryResult::Read(r) => Some(r), + _ => None, + } + } + + pub fn response(&self) -> Option<&RaftCmdResponse> { + match self { + QueryResult::Response(r) => Some(r), + _ => None, + } + } +} + +pub type QueryResChannel = BaseChannel; + +impl QueryResChannel { + #[inline] + pub fn pair() -> (Self, QueryResSubscriber) { + pair() + } +} + +impl ErrorCallback for QueryResChannel { + #[inline] + fn report_error(self, err: RaftCmdResponse) { + self.set_result(QueryResult::Response(err)); + } + + #[inline] + fn is_none(&self) -> bool { + false + } +} + +impl ReadCallback for QueryResChannel { + type Response = QueryResult; + + #[inline] + fn set_result(self, res: QueryResult) { + self.set_result(res); + } + + fn read_tracker(&self) -> Option { + Some(unsafe { (*self.core.tracker.get()).read }) + } +} + +pub type QueryResSubscriber = BaseSubscriber; + +impl fmt::Debug for QueryResChannel { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(fmt, "QueryResChannel") + } +} + +pub type DebugInfoChannel = BaseChannel; +pub type DebugInfoSubscriber = BaseSubscriber; + +impl DebugInfoChannel { + #[inline] + pub fn pair() -> (Self, DebugInfoSubscriber) { + pair() + } +} + +impl Debug for DebugInfoChannel { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "DebugInfoChannel") + } +} + +#[cfg(feature = "testexport")] +mod flush_channel { + use super::*; + + pub type FlushChannel = BaseChannel<()>; + pub type FlushSubscriber = BaseSubscriber<()>; + + impl FlushChannel { + #[inline] + pub fn pair() -> (Self, FlushSubscriber) { + pair() + } + } + + impl Debug for FlushChannel { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "FlushChannel") + } + } +} + +#[cfg(feature = "testexport")] +pub use flush_channel::{FlushChannel, FlushSubscriber}; + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + 
+ use futures::{executor::block_on, StreamExt}; + + use super::*; + + #[test] + fn test_cancel() { + let (chan, mut sub) = CmdResChannel::pair(); + drop(chan); + assert!(!block_on(sub.wait_proposed())); + assert!(!block_on(sub.wait_committed())); + assert!(block_on(sub.result()).is_none()); + + let (mut chan, mut sub) = CmdResChannel::pair(); + chan.notify_proposed(); + let mut result = RaftCmdResponse::default(); + result.mut_header().set_current_term(4); + chan.set_result(result.clone()); + assert!(block_on(sub.wait_proposed())); + assert!(!block_on(sub.wait_committed())); + assert_eq!(block_on(sub.result()), Some(result)); + + let (chan, sub) = QueryResChannel::pair(); + drop(chan); + assert!(block_on(sub.result()).is_none()); + } + + #[test] + fn test_channel() { + let (mut chan, mut sub) = CmdResChannel::pair(); + chan.notify_proposed(); + chan.notify_committed(); + let mut result = RaftCmdResponse::default(); + result.mut_header().set_current_term(2); + chan.set_result(result.clone()); + assert!(block_on(sub.wait_proposed())); + assert!(block_on(sub.wait_committed())); + assert_eq!(block_on(sub.result()), Some(result.clone())); + + let (chan, sub) = QueryResChannel::pair(); + let resp = QueryResult::Response(result.clone()); + chan.set_result(resp.clone()); + assert_eq!(block_on(sub.result()).unwrap(), resp); + + let (chan, sub) = QueryResChannel::pair(); + let read = QueryResult::Read(ReadResponse { + read_index: 0, + txn_extra_op: TxnExtraOp::ReadOldValue, + }); + chan.set_result(read.clone()); + assert_eq!(block_on(sub.result()).unwrap(), read); + } + + #[test] + fn test_cmd_res_stream() { + let mut builder = CmdResChannelBuilder::default(); + builder.before_set(|res| { + res.mut_header().set_current_term(6); + }); + let (chan, sub) = builder.build(); + let mut stream = CmdResStream::new(sub); + chan.set_result(RaftCmdResponse::default()); + assert_matches!(block_on(stream.next()), Some(CmdResEvent::Finished(res)) if res.get_header().get_current_term() == 
6); + + // When using builder, no event is subscribed by default. + let (mut chan, sub) = CmdResChannelBuilder::default().build(); + let mut stream = CmdResStream::new(sub); + chan.notify_proposed(); + chan.notify_committed(); + drop(chan); + assert_matches!(block_on(stream.next()), None); + + let mut builder = CmdResChannelBuilder::default(); + builder.subscribe_proposed(); + let (mut chan, sub) = builder.build(); + let mut stream = CmdResStream::new(sub); + chan.notify_proposed(); + chan.notify_committed(); + assert_matches!(block_on(stream.next()), Some(CmdResEvent::Proposed)); + drop(chan); + assert_matches!(block_on(stream.next()), None); + + let mut builder = CmdResChannelBuilder::default(); + builder.subscribe_committed(); + let (mut chan, sub) = builder.build(); + let mut stream = CmdResStream::new(sub); + chan.notify_proposed(); + chan.notify_committed(); + assert_matches!(block_on(stream.next()), Some(CmdResEvent::Committed)); + drop(chan); + assert_matches!(block_on(stream.next()), None); + } +} diff --git a/components/raftstore-v2/src/worker/mod.rs b/components/raftstore-v2/src/worker/mod.rs new file mode 100644 index 00000000000..2fa7255afd3 --- /dev/null +++ b/components/raftstore-v2/src/worker/mod.rs @@ -0,0 +1,4 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +pub mod pd; +pub mod tablet; diff --git a/components/raftstore-v2/src/worker/pd/misc.rs b/components/raftstore-v2/src/worker/pd/misc.rs new file mode 100644 index 00000000000..68c624b089a --- /dev/null +++ b/components/raftstore-v2/src/worker/pd/misc.rs @@ -0,0 +1,123 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + sync::{atomic::Ordering, Arc}, + time::{Duration, Instant}, +}; + +use causal_ts::CausalTsProvider; +use engine_traits::{KvEngine, RaftEngine}; +use futures::{compat::Future01CompatExt, FutureExt}; +use pd_client::PdClient; +use raftstore::{store::TxnExt, Result}; +use slog::{info, warn}; +use tikv_util::{box_err, timer::GLOBAL_TIMER_HANDLE}; +use txn_types::TimeStamp; + +use super::Runner; + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + pub fn handle_update_max_timestamp( + &mut self, + region_id: u64, + initial_status: u64, + txn_ext: Arc, + ) { + let pd_client = self.pd_client.clone(); + let concurrency_manager = self.concurrency_manager.clone(); + let causal_ts_provider = self.causal_ts_provider.clone(); + let logger = self.logger.clone(); + let shutdown = self.shutdown.clone(); + + let f = async move { + let mut success = false; + while txn_ext.max_ts_sync_status.load(Ordering::SeqCst) == initial_status + && !shutdown.load(Ordering::Relaxed) + { + // On leader transfer / region merge, RawKV API v2 need to + // invoke causal_ts_provider.flush() to renew + // cached TSO, to ensure that the next TSO + // returned by causal_ts_provider.get_ts() on current + // store must be larger than the store where the leader is on + // before. + // + // And it won't break correctness of transaction commands, as + // causal_ts_provider.flush() is implemented as + // pd_client.get_tso() + renew TSO cached. 
+ let res: Result = if let Some(causal_ts_provider) = &causal_ts_provider { + causal_ts_provider + .async_flush() + .await + .map_err(|e| box_err!(e)) + } else { + pd_client.get_tso().await.map_err(Into::into) + }; + + match res { + Ok(ts) => { + concurrency_manager.update_max_ts(ts); + success = txn_ext + .max_ts_sync_status + .compare_exchange( + initial_status, + initial_status | 1, + Ordering::SeqCst, + Ordering::SeqCst, + ) + .is_ok(); + break; + } + Err(e) => { + warn!( + logger, + "failed to update max timestamp for region {}: {:?}", region_id, e + ); + } + } + } + + if success { + info!(logger, "succeed to update max timestamp"; "region_id" => region_id); + } else { + info!( + logger, + "updating max timestamp is stale"; + "region_id" => region_id, + "initial_status" => initial_status, + ); + } + }; + + let delay = (|| { + fail::fail_point!("delay_update_max_ts", |_| true); + false + })(); + + if delay { + info!(self.logger, "[failpoint] delay update max ts for 1s"; "region_id" => region_id); + let deadline = Instant::now() + Duration::from_secs(1); + self.remote + .spawn(GLOBAL_TIMER_HANDLE.delay(deadline).compat().then(|_| f)); + } else { + self.remote.spawn(f); + } + } + + pub fn handle_report_min_resolved_ts(&mut self, store_id: u64, min_resolved_ts: u64) { + let resp = self + .pd_client + .report_min_resolved_ts(store_id, min_resolved_ts); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + warn!(logger, "report min resolved_ts failed"; "err" => ?e); + } + }; + self.remote.spawn(f); + } +} diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs new file mode 100644 index 00000000000..e06d161fe08 --- /dev/null +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -0,0 +1,466 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + fmt::{self, Display, Formatter}, + sync::{atomic::AtomicBool, Arc}, +}; + +use causal_ts::CausalTsProviderImpl; +use collections::HashMap; +use concurrency_manager::ConcurrencyManager; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; +use kvproto::{metapb, pdpb}; +use pd_client::{BucketStat, PdClient}; +use raftstore::store::{ + util::KeysInfoFormatter, AutoSplitController, Config, FlowStatsReporter, PdStatsMonitor, + ReadStats, RegionReadProgressRegistry, SplitInfo, StoreStatsReporter, TabletSnapManager, + TxnExt, WriteStats, NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, +}; +use resource_metering::{Collector, CollectorRegHandle, RawRecords}; +use slog::{error, Logger}; +use tikv_util::{ + config::VersionTrack, + time::UnixSecs, + worker::{Runnable, Scheduler}, +}; +use yatp::{task::future::TaskCell, Remote}; + +use crate::{ + batch::StoreRouter, + router::{CmdResChannel, PeerMsg}, +}; + +mod misc; +mod region; +mod split; +mod store; + +pub use region::RegionHeartbeatTask; + +type RecordPairVec = Vec; + +pub enum Task { + // In store.rs. + StoreHeartbeat { + stats: pdpb::StoreStats, + // TODO: StoreReport, StoreDrAutoSyncStatus + }, + UpdateStoreInfos { + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + }, + // In region.rs. + RegionHeartbeat(RegionHeartbeatTask), + ReportRegionBuckets(BucketStat), + UpdateReadStats(ReadStats), + UpdateWriteStats(WriteStats), + UpdateRegionCpuRecords(Arc), + DestroyPeer { + region_id: u64, + }, + // In split.rs. + AskBatchSplit { + region: metapb::Region, + split_keys: Vec>, + peer: metapb::Peer, + right_derive: bool, + ch: CmdResChannel, + }, + ReportBatchSplit { + regions: Vec, + }, + AutoSplit { + split_infos: Vec, + }, + // In misc.rs. 
+ UpdateMaxTimestamp { + region_id: u64, + initial_status: u64, + txn_ext: Arc, + }, + ReportBuckets(BucketStat), + ReportMinResolvedTs { + store_id: u64, + min_resolved_ts: u64, + }, +} + +impl Display for Task { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match *self { + Task::StoreHeartbeat { ref stats, .. } => { + write!(f, "store heartbeat stats: {stats:?}") + } + Task::UpdateStoreInfos { + ref cpu_usages, + ref read_io_rates, + ref write_io_rates, + } => write!( + f, + "get store's information: cpu_usages {:?}, read_io_rates {:?}, write_io_rates {:?}", + cpu_usages, read_io_rates, write_io_rates, + ), + Task::RegionHeartbeat(ref hb_task) => write!( + f, + "region heartbeat for region {:?}, leader {}", + hb_task.region, + hb_task.peer.get_id(), + ), + Task::ReportRegionBuckets(ref buckets) => write!(f, "report buckets: {:?}", buckets), + Task::UpdateReadStats(ref stats) => { + write!(f, "update read stats: {stats:?}") + } + Task::UpdateWriteStats(ref stats) => { + write!(f, "update write stats: {stats:?}") + } + Task::UpdateRegionCpuRecords(ref cpu_records) => { + write!(f, "get region cpu records: {:?}", cpu_records) + } + Task::DestroyPeer { ref region_id } => { + write!(f, "destroy peer of region {}", region_id) + } + Task::AskBatchSplit { + ref region, + ref split_keys, + .. + } => write!( + f, + "ask split region {} with {}", + region.get_id(), + KeysInfoFormatter(split_keys.iter()) + ), + Task::ReportBatchSplit { ref regions } => write!(f, "report split {:?}", regions), + Task::AutoSplit { ref split_infos } => { + write!(f, "auto split split regions, num is {}", split_infos.len()) + } + Task::UpdateMaxTimestamp { region_id, .. 
} => write!( + f, + "update the max timestamp for region {} in the concurrency manager", + region_id + ), + Task::ReportBuckets(ref buckets) => write!(f, "report buckets: {:?}", buckets), + Task::ReportMinResolvedTs { + store_id, + min_resolved_ts, + } => write!( + f, + "report min resolved ts: store {}, resolved ts {}", + store_id, min_resolved_ts, + ), + } + } +} + +pub struct Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + store_id: u64, + pd_client: Arc, + raft_engine: ER, + tablet_registry: TabletRegistry, + snap_mgr: TabletSnapManager, + router: StoreRouter, + stats_monitor: PdStatsMonitor, + + remote: Remote, + + // For store. + start_ts: UnixSecs, + store_stat: store::StoreStat, + + // For region. + region_peers: HashMap, + region_buckets: HashMap, + // region_id -> total_cpu_time_ms (since last region heartbeat) + region_cpu_records: HashMap, + is_hb_receiver_scheduled: bool, + + // For update_max_timestamp. + concurrency_manager: ConcurrencyManager, + causal_ts_provider: Option>, + + logger: Logger, + shutdown: Arc, + cfg: Arc>, +} + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + pub fn new( + store_id: u64, + pd_client: Arc, + raft_engine: ER, + tablet_registry: TabletRegistry, + snap_mgr: TabletSnapManager, + router: StoreRouter, + remote: Remote, + concurrency_manager: ConcurrencyManager, + causal_ts_provider: Option>, // used for rawkv apiv2 + pd_scheduler: Scheduler, + auto_split_controller: AutoSplitController, + region_read_progress: RegionReadProgressRegistry, + collector_reg_handle: CollectorRegHandle, + logger: Logger, + shutdown: Arc, + cfg: Arc>, + ) -> Result { + let mut stats_monitor = PdStatsMonitor::new( + cfg.value().pd_store_heartbeat_tick_interval.0 / NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, + cfg.value().report_min_resolved_ts_interval.0, + PdReporter::new(pd_scheduler, logger.clone()), + ); + stats_monitor.start( + auto_split_controller, + region_read_progress, + 
collector_reg_handle, + store_id, + )?; + Ok(Self { + store_id, + pd_client, + raft_engine, + tablet_registry, + snap_mgr, + router, + stats_monitor, + remote, + start_ts: UnixSecs::zero(), + store_stat: store::StoreStat::default(), + region_peers: HashMap::default(), + region_buckets: HashMap::default(), + region_cpu_records: HashMap::default(), + is_hb_receiver_scheduled: false, + concurrency_manager, + causal_ts_provider, + logger, + shutdown, + cfg, + }) + } +} + +impl Runnable for Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + type Task = Task; + + fn run(&mut self, task: Task) { + self.maybe_schedule_heartbeat_receiver(); + match task { + Task::StoreHeartbeat { stats } => self.handle_store_heartbeat(stats), + Task::UpdateStoreInfos { + cpu_usages, + read_io_rates, + write_io_rates, + } => self.handle_update_store_infos(cpu_usages, read_io_rates, write_io_rates), + Task::RegionHeartbeat(task) => self.handle_region_heartbeat(task), + Task::ReportRegionBuckets(buckets) => self.handle_report_region_buckets(buckets), + Task::UpdateReadStats(stats) => self.handle_update_read_stats(stats), + Task::UpdateWriteStats(stats) => self.handle_update_write_stats(stats), + Task::UpdateRegionCpuRecords(records) => self.handle_update_region_cpu_records(records), + Task::DestroyPeer { region_id } => self.handle_destroy_peer(region_id), + Task::AskBatchSplit { + region, + split_keys, + peer, + right_derive, + ch, + } => self.handle_ask_batch_split(region, split_keys, peer, right_derive, ch), + Task::ReportBatchSplit { regions } => self.handle_report_batch_split(regions), + Task::AutoSplit { split_infos } => self.handle_auto_split(split_infos), + Task::UpdateMaxTimestamp { + region_id, + initial_status, + txn_ext, + } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), + Task::ReportBuckets(buckets) => self.handle_report_region_buckets(buckets), + Task::ReportMinResolvedTs { + store_id, + min_resolved_ts, + } => 
self.handle_report_min_resolved_ts(store_id, min_resolved_ts), + } + } +} + +#[derive(Clone)] +pub struct PdReporter { + scheduler: Scheduler, + logger: Logger, +} + +impl PdReporter { + pub fn new(scheduler: Scheduler, logger: Logger) -> Self { + PdReporter { scheduler, logger } + } +} + +impl FlowStatsReporter for PdReporter { + fn report_read_stats(&self, stats: ReadStats) { + if let Err(e) = self.scheduler.schedule(Task::UpdateReadStats(stats)) { + error!(self.logger, "Failed to send read flow statistics"; "err" => ?e); + } + } + + fn report_write_stats(&self, stats: WriteStats) { + if let Err(e) = self.scheduler.schedule(Task::UpdateWriteStats(stats)) { + error!(self.logger, "Failed to send write flow statistics"; "err" => ?e); + } + } +} + +impl Collector for PdReporter { + fn collect(&self, records: Arc) { + self.scheduler + .schedule(Task::UpdateRegionCpuRecords(records)) + .ok(); + } +} + +impl StoreStatsReporter for PdReporter { + fn report_store_infos( + &self, + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + ) { + let task = Task::UpdateStoreInfos { + cpu_usages, + read_io_rates, + write_io_rates, + }; + if let Err(e) = self.scheduler.schedule(task) { + error!( + self.logger, + "failed to send store infos to pd worker"; + "err" => ?e, + ); + } + } + + fn report_min_resolved_ts(&self, store_id: u64, min_resolved_ts: u64) { + let task = Task::ReportMinResolvedTs { + store_id, + min_resolved_ts, + }; + if let Err(e) = self.scheduler.schedule(task) { + error!( + self.logger, + "failed to send min resolved ts to pd worker"; + "err" => ?e, + ); + } + } + + fn auto_split(&self, split_infos: Vec) { + let task = Task::AutoSplit { split_infos }; + if let Err(e) = self.scheduler.schedule(task) { + error!( + self.logger, + "failed to send split infos to pd worker"; + "err" => ?e, + ); + } + } +} + +mod requests { + use kvproto::raft_cmdpb::{ + AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, 
RaftCmdRequest, + }; + use raft::eraftpb::ConfChangeType; + + use super::*; + use crate::router::RaftRequest; + + pub fn send_admin_request( + logger: &Logger, + router: &StoreRouter, + region_id: u64, + epoch: metapb::RegionEpoch, + peer: metapb::Peer, + request: AdminRequest, + ch: Option, + ) where + EK: KvEngine, + ER: RaftEngine, + { + let cmd_type = request.get_cmd_type(); + + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header().set_region_epoch(epoch); + req.mut_header().set_peer(peer); + req.set_admin_request(request); + + let msg = match ch { + Some(ch) => PeerMsg::AdminCommand(RaftRequest::new(req, ch)), + None => PeerMsg::admin_command(req).0, + }; + if let Err(e) = router.send(region_id, msg) { + error!( + logger, + "send request failed"; + "region_id" => region_id, "cmd_type" => ?cmd_type, "err" => ?e, + ); + } + } + + pub fn new_change_peer_request( + change_type: ConfChangeType, + peer: metapb::Peer, + ) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::ChangePeer); + req.mut_change_peer().set_change_type(change_type); + req.mut_change_peer().set_peer(peer); + req + } + + pub fn new_change_peer_v2_request(changes: Vec) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::ChangePeerV2); + let change_peer_reqs = changes + .into_iter() + .map(|mut c| { + let mut cp = ChangePeerRequest::default(); + cp.set_change_type(c.get_change_type()); + cp.set_peer(c.take_peer()); + cp + }) + .collect(); + let mut cp = ChangePeerV2Request::default(); + cp.set_changes(change_peer_reqs); + req.set_change_peer_v2(cp); + req + } + + pub fn new_transfer_leader_request( + peer: metapb::Peer, + peers: Vec, + ) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::TransferLeader); + req.mut_transfer_leader().set_peer(peer); + req.mut_transfer_leader().set_peers(peers.into()); + req + } + + pub fn 
new_merge_request(merge: pdpb::Merge) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::PrepareMerge); + req.mut_prepare_merge() + .set_target(merge.get_target().to_owned()); + req + } +} diff --git a/components/raftstore-v2/src/worker/pd/region.rs b/components/raftstore-v2/src/worker/pd/region.rs new file mode 100644 index 00000000000..e825dd54c32 --- /dev/null +++ b/components/raftstore-v2/src/worker/pd/region.rs @@ -0,0 +1,454 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{sync::Arc, time::Duration}; + +use collections::HashMap; +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::{metapb, pdpb}; +use pd_client::{metrics::PD_HEARTBEAT_COUNTER_VEC, BucketStat, PdClient, RegionStat}; +use raftstore::store::{ReadStats, WriteStats}; +use resource_metering::RawRecords; +use slog::{debug, error, info}; +use tikv_util::{store::QueryStats, time::UnixSecs}; + +use super::{requests::*, Runner}; +use crate::{ + operation::{RequestHalfSplit, RequestSplit}, + router::{CmdResChannel, PeerMsg}, +}; + +pub struct RegionHeartbeatTask { + pub term: u64, + pub region: metapb::Region, + pub peer: metapb::Peer, + pub down_peers: Vec, + pub pending_peers: Vec, + pub written_bytes: u64, + pub written_keys: u64, + pub approximate_size: Option, + pub approximate_keys: Option, + pub wait_data_peers: Vec, + // TODO: RegionReplicationStatus +} + +#[derive(Default)] +pub struct PeerStat { + pub read_bytes: u64, + pub read_keys: u64, + pub query_stats: QueryStats, + // last_region_report_attributes records the state of the last region heartbeat + pub last_region_report_read_bytes: u64, + pub last_region_report_read_keys: u64, + pub last_region_report_query_stats: QueryStats, + pub last_region_report_written_bytes: u64, + pub last_region_report_written_keys: u64, + pub last_region_report_ts: UnixSecs, + // last_store_report_attributes records the state of the last store heartbeat + pub 
last_store_report_read_bytes: u64, + pub last_store_report_read_keys: u64, + pub last_store_report_query_stats: QueryStats, + pub approximate_keys: u64, + pub approximate_size: u64, +} + +#[derive(Default)] +pub struct ReportBucket { + current_stat: BucketStat, + last_report_stat: Option, + last_report_ts: UnixSecs, +} + +impl ReportBucket { + fn new(current_stat: BucketStat) -> Self { + Self { + current_stat, + ..Default::default() + } + } + + fn report(&mut self) -> BucketStat { + match self.last_report_stat.replace(self.current_stat.clone()) { + Some(last) => { + let mut delta = BucketStat::from_meta(self.current_stat.meta.clone()); + // Buckets may be changed, recalculate last stats according to current meta. + delta.merge(&last); + for i in 0..delta.meta.keys.len() - 1 { + delta.stats.write_bytes[i] = + self.current_stat.stats.write_bytes[i] - delta.stats.write_bytes[i]; + delta.stats.write_keys[i] = + self.current_stat.stats.write_keys[i] - delta.stats.write_keys[i]; + delta.stats.write_qps[i] = + self.current_stat.stats.write_qps[i] - delta.stats.write_qps[i]; + + delta.stats.read_bytes[i] = + self.current_stat.stats.read_bytes[i] - delta.stats.read_bytes[i]; + delta.stats.read_keys[i] = + self.current_stat.stats.read_keys[i] - delta.stats.read_keys[i]; + delta.stats.read_qps[i] = + self.current_stat.stats.read_qps[i] - delta.stats.read_qps[i]; + } + delta + } + None => self.current_stat.clone(), + } + } +} + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + pub fn handle_region_heartbeat(&mut self, task: RegionHeartbeatTask) { + // HACK! In order to keep the compatible of protos, we use 0 to identify + // the size uninitialized regions, and use 1 to identify the empty regions. + // + // See tikv/tikv#11114 for details. 
+ let approximate_size = match task.approximate_size { + Some(0) => 1, + Some(v) => v, + None => 0, // size uninitialized + }; + let approximate_keys = task.approximate_keys.unwrap_or_default(); + let region_id = task.region.get_id(); + + let peer_stat = self + .region_peers + .entry(region_id) + .or_insert_with(PeerStat::default); + peer_stat.approximate_size = approximate_size; + peer_stat.approximate_keys = approximate_keys; + + let read_bytes_delta = peer_stat.read_bytes - peer_stat.last_region_report_read_bytes; + let read_keys_delta = peer_stat.read_keys - peer_stat.last_region_report_read_keys; + let written_bytes_delta = task.written_bytes - peer_stat.last_region_report_written_bytes; + let written_keys_delta = task.written_keys - peer_stat.last_region_report_written_keys; + let query_stats = peer_stat + .query_stats + .sub_query_stats(&peer_stat.last_region_report_query_stats); + let mut last_report_ts = peer_stat.last_region_report_ts; + if last_report_ts.is_zero() { + last_report_ts = self.start_ts; + } + peer_stat.last_region_report_written_bytes = task.written_bytes; + peer_stat.last_region_report_written_keys = task.written_keys; + peer_stat.last_region_report_read_bytes = peer_stat.read_bytes; + peer_stat.last_region_report_read_keys = peer_stat.read_keys; + peer_stat.last_region_report_query_stats = peer_stat.query_stats.clone(); + let unix_secs_now = UnixSecs::now(); + peer_stat.last_region_report_ts = unix_secs_now; + + // Calculate the CPU usage since the last region heartbeat. + let cpu_usage = { + // Take out the region CPU record. + let cpu_time_duration = Duration::from_millis( + self.region_cpu_records.remove(®ion_id).unwrap_or(0) as u64, + ); + let interval_second = unix_secs_now.into_inner() - last_report_ts.into_inner(); + // Keep consistent with the calculation of cpu_usages in a store heartbeat. + // See components/tikv_util/src/metrics/threads_linux.rs for more details. 
+ if interval_second > 0 { + ((cpu_time_duration.as_secs_f64() * 100.0) / interval_second as f64) as u64 + } else { + 0 + } + }; + + let region_stat = RegionStat { + down_peers: task.down_peers, + pending_peers: task.pending_peers, + written_bytes: written_bytes_delta, + written_keys: written_keys_delta, + read_bytes: read_bytes_delta, + read_keys: read_keys_delta, + query_stats: query_stats.0, + approximate_size, + approximate_keys, + last_report_ts, + cpu_usage, + }; + self.store_stat + .region_bytes_written + .observe(region_stat.written_bytes as f64); + self.store_stat + .region_keys_written + .observe(region_stat.written_keys as f64); + self.store_stat + .region_bytes_read + .observe(region_stat.read_bytes as f64); + self.store_stat + .region_keys_read + .observe(region_stat.read_keys as f64); + + let resp = self.pd_client.region_heartbeat( + task.term, + task.region.clone(), + task.peer, + region_stat, + None, + ); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + debug!( + logger, + "failed to send heartbeat"; + "region_id" => task.region.get_id(), + "err" => ?e + ); + } + }; + self.remote.spawn(f); + } + + pub fn maybe_schedule_heartbeat_receiver(&mut self) { + if self.is_hb_receiver_scheduled { + return; + } + let router = self.router.clone(); + let store_id = self.store_id; + let logger = self.logger.clone(); + + let fut = + self.pd_client + .handle_region_heartbeat_response(self.store_id, move |mut resp| { + let region_id = resp.get_region_id(); + let epoch = resp.take_region_epoch(); + let peer = resp.take_target_peer(); + + if resp.has_change_peer() { + PD_HEARTBEAT_COUNTER_VEC + .with_label_values(&["change peer"]) + .inc(); + + let mut change_peer = resp.take_change_peer(); + info!( + logger, + "try to change peer"; + "region_id" => region_id, + "change_type" => ?change_peer.get_change_type(), + "peer" => ?change_peer.get_peer() + ); + let req = new_change_peer_request( + change_peer.get_change_type(), + 
change_peer.take_peer(), + ); + send_admin_request(&logger, &router, region_id, epoch, peer, req, None); + } else if resp.has_change_peer_v2() { + PD_HEARTBEAT_COUNTER_VEC + .with_label_values(&["change peer"]) + .inc(); + + let mut change_peer_v2 = resp.take_change_peer_v2(); + info!( + logger, + "try to change peer"; + "region_id" => region_id, + "changes" => ?change_peer_v2.get_changes(), + ); + let req = new_change_peer_v2_request(change_peer_v2.take_changes().into()); + send_admin_request(&logger, &router, region_id, epoch, peer, req, None); + } else if resp.has_transfer_leader() { + PD_HEARTBEAT_COUNTER_VEC + .with_label_values(&["transfer leader"]) + .inc(); + + let mut transfer_leader = resp.take_transfer_leader(); + info!( + logger, + "try to transfer leader"; + "region_id" => region_id, + "from_peer" => ?peer, + "to_peer" => ?transfer_leader.get_peer(), + "to_peers" => ?transfer_leader.get_peers(), + ); + let req = new_transfer_leader_request( + transfer_leader.take_peer(), + transfer_leader.take_peers().into(), + ); + send_admin_request(&logger, &router, region_id, epoch, peer, req, None); + } else if resp.has_split_region() { + PD_HEARTBEAT_COUNTER_VEC + .with_label_values(&["split region"]) + .inc(); + + let mut split_region = resp.take_split_region(); + info!( + logger, + "try to split"; + "region_id" => region_id, + "region_epoch" => ?epoch, + ); + + let (ch, _) = CmdResChannel::pair(); + let msg = if split_region.get_policy() == pdpb::CheckPolicy::Usekey { + PeerMsg::RequestSplit { + request: RequestSplit { + epoch, + split_keys: split_region.take_keys().into(), + source: "pd".into(), + }, + ch, + } + } else { + PeerMsg::RequestHalfSplit { + request: RequestHalfSplit { + epoch, + start_key: None, + end_key: None, + policy: split_region.get_policy(), + source: "pd".into(), + }, + ch, + } + }; + if let Err(e) = router.send(region_id, msg) { + error!(logger, + "send split request failed"; + "region_id" => region_id, + "err" => ?e + ); + } + } else if 
resp.has_merge() { + PD_HEARTBEAT_COUNTER_VEC.with_label_values(&["merge"]).inc(); + + let merge = resp.take_merge(); + info!(logger, "try to merge"; "region_id" => region_id, "merge" => ?merge); + let req = new_merge_request(merge); + send_admin_request(&logger, &router, region_id, epoch, peer, req, None); + } else { + PD_HEARTBEAT_COUNTER_VEC.with_label_values(&["noop"]).inc(); + } + }); + let logger = self.logger.clone(); + let f = async move { + match fut.await { + Ok(_) => { + info!( + logger, + "region heartbeat response handler exit"; + "store_id" => store_id, + ); + } + Err(e) => panic!("unexpected error: {:?}", e), + } + }; + self.remote.spawn(f); + self.is_hb_receiver_scheduled = true; + } + + pub fn handle_report_region_buckets(&mut self, region_buckets: BucketStat) { + let region_id = region_buckets.meta.region_id; + self.merge_buckets(region_buckets); + let report_buckets = self.region_buckets.get_mut(®ion_id).unwrap(); + let last_report_ts = if report_buckets.last_report_ts.is_zero() { + self.start_ts + } else { + report_buckets.last_report_ts + }; + let now = UnixSecs::now(); + let interval_second = now.into_inner() - last_report_ts.into_inner(); + report_buckets.last_report_ts = now; + let delta = report_buckets.report(); + let resp = self + .pd_client + .report_region_buckets(&delta, Duration::from_secs(interval_second)); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + debug!( + logger, + "failed to send buckets"; + "region_id" => region_id, + "version" => delta.meta.version, + "region_epoch" => ?delta.meta.region_epoch, + "err" => ?e + ); + } + }; + self.remote.spawn(f); + } + + pub fn handle_update_read_stats(&mut self, mut stats: ReadStats) { + for (region_id, region_info) in stats.region_infos.iter_mut() { + let peer_stat = self + .region_peers + .entry(*region_id) + .or_insert_with(PeerStat::default); + peer_stat.read_bytes += region_info.flow.read_bytes as u64; + peer_stat.read_keys += 
region_info.flow.read_keys as u64; + self.store_stat.engine_total_bytes_read += region_info.flow.read_bytes as u64; + self.store_stat.engine_total_keys_read += region_info.flow.read_keys as u64; + peer_stat + .query_stats + .add_query_stats(®ion_info.query_stats.0); + self.store_stat + .engine_total_query_num + .add_query_stats(®ion_info.query_stats.0); + } + for (_, region_buckets) in std::mem::take(&mut stats.region_buckets) { + self.merge_buckets(region_buckets); + } + if !stats.region_infos.is_empty() { + self.stats_monitor.maybe_send_read_stats(stats); + } + } + + pub fn handle_update_write_stats(&mut self, mut stats: WriteStats) { + for (region_id, region_info) in stats.region_infos.iter_mut() { + let peer_stat = self + .region_peers + .entry(*region_id) + .or_insert_with(PeerStat::default); + peer_stat.query_stats.add_query_stats(®ion_info.0); + self.store_stat + .engine_total_query_num + .add_query_stats(®ion_info.0); + } + } + + pub fn handle_update_region_cpu_records(&mut self, records: Arc) { + // Send Region CPU info to AutoSplitController inside the stats_monitor. 
+ self.stats_monitor.maybe_send_cpu_stats(&records); + Self::calculate_region_cpu_records(self.store_id, records, &mut self.region_cpu_records); + } + + pub fn handle_destroy_peer(&mut self, region_id: u64) { + match self.region_peers.remove(®ion_id) { + None => {} + Some(_) => { + info!(self.logger, "remove peer statistic record in pd"; "region_id" => region_id) + } + } + } + + fn merge_buckets(&mut self, mut buckets: BucketStat) { + let region_id = buckets.meta.region_id; + self.region_buckets + .entry(region_id) + .and_modify(|report_bucket| { + let current = &mut report_bucket.current_stat; + if current.meta < buckets.meta { + std::mem::swap(current, &mut buckets); + } + current.merge(&buckets); + }) + .or_insert_with(|| ReportBucket::new(buckets)); + } + + fn calculate_region_cpu_records( + store_id: u64, + records: Arc, + region_cpu_records: &mut HashMap, + ) { + for (tag, record) in &records.records { + let record_store_id = tag.store_id; + if record_store_id != store_id { + continue; + } + // Reporting a region heartbeat later will clear the corresponding record. + *region_cpu_records.entry(tag.region_id).or_insert(0) += record.cpu_time; + } + } +} diff --git a/components/raftstore-v2/src/worker/pd/split.rs b/components/raftstore-v2/src/worker/pd/split.rs new file mode 100644 index 00000000000..bf13e01120a --- /dev/null +++ b/components/raftstore-v2/src/worker/pd/split.rs @@ -0,0 +1,162 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::{ + metapb, pdpb, + raft_cmdpb::{AdminCmdType, AdminRequest, SplitRequest}, +}; +use pd_client::PdClient; +use raftstore::store::SplitInfo; +use slog::{info, warn, Logger}; +use yatp::{task::future::TaskCell, Remote}; + +use super::{requests::*, Runner}; +use crate::{batch::StoreRouter, router::CmdResChannel}; + +fn new_batch_split_region_request( + split_keys: Vec>, + ids: Vec, + right_derive: bool, +) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSplit); + req.mut_splits().set_right_derive(right_derive); + let mut requests = Vec::with_capacity(ids.len()); + for (mut id, key) in ids.into_iter().zip(split_keys) { + let mut split = SplitRequest::default(); + split.set_split_key(key); + split.set_new_region_id(id.get_new_region_id()); + split.set_new_peer_ids(id.take_new_peer_ids()); + requests.push(split); + } + req.mut_splits().set_requests(requests.into()); + req +} + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + #[inline] + pub fn handle_ask_batch_split( + &mut self, + region: metapb::Region, + split_keys: Vec>, + peer: metapb::Peer, + right_derive: bool, + ch: CmdResChannel, + ) { + Self::ask_batch_split_imp( + &self.pd_client, + &self.logger, + &self.router, + &self.remote, + region, + split_keys, + peer, + right_derive, + Some(ch), + ); + } + + fn ask_batch_split_imp( + pd_client: &T, + logger: &Logger, + router: &StoreRouter, + remote: &Remote, + mut region: metapb::Region, + split_keys: Vec>, + peer: metapb::Peer, + right_derive: bool, + ch: Option, + ) { + if split_keys.is_empty() { + info!( + logger, + "empty split key, skip ask batch split"; + "region_id" => region.get_id() + ); + return; + } + let resp = pd_client.ask_batch_split(region.clone(), split_keys.len()); + let router = router.clone(); + let logger = logger.clone(); + let f = async move { + match resp.await { + Ok(mut resp) => { + info!( + logger, + 
"try to batch split region"; + "region_id" => region.get_id(), + "new_region_ids" => ?resp.get_ids(), + "region" => ?region, + ); + + let req = new_batch_split_region_request( + split_keys, + resp.take_ids().into(), + right_derive, + ); + let region_id = region.get_id(); + let epoch = region.take_region_epoch(); + send_admin_request(&logger, &router, region_id, epoch, peer, req, ch); + } + Err(e) => { + warn!( + logger, + "ask batch split failed"; + "region_id" => region.get_id(), + "err" => ?e, + ); + } + } + }; + remote.spawn(f); + } + + pub fn handle_report_batch_split(&mut self, regions: Vec) { + let resp = self.pd_client.report_batch_split(regions); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + warn!(logger, "report split failed"; "err" => ?e); + } + }; + self.remote.spawn(f); + } + + pub fn handle_auto_split(&mut self, split_infos: Vec) { + let pd_client = self.pd_client.clone(); + let logger = self.logger.clone(); + let router = self.router.clone(); + let remote = self.remote.clone(); + + let f = async move { + for split_info in split_infos { + let Ok(Some(region)) = + pd_client.get_region_by_id(split_info.region_id).await else { continue }; + // Try to split the region with the given split key. + if let Some(split_key) = split_info.split_key { + Self::ask_batch_split_imp( + &pd_client, + &logger, + &router, + &remote, + region, + vec![split_key], + split_info.peer, + true, + None, + ); + // Try to split the region on half within the given key + // range if there is no `split_key` been given. 
/// Returns the number of read keys below which a peer's key metric does not,
/// on its own, qualify the peer for the store heartbeat's hot-peer report.
///
/// When the `mock_hotspot_threshold` fail point is active, the function
/// returns 0 instead.
fn hotspot_key_report_threshold() -> u64 {
    const HOTSPOT_KEY_RATE_THRESHOLD: u64 = 128;
    fail_point!("mock_hotspot_threshold", |_| { 0 });
    // NOTE(review): the factor 10 presumably converts the per-second rate
    // into a per-heartbeat-interval amount — confirm against the configured
    // store heartbeat interval.
    HOTSPOT_KEY_RATE_THRESHOLD * 10
}
/// A region id tagged with one read metric, ordered by that metric alone.
///
/// Equality and ordering deliberately ignore `region_id`: two entries with
/// the same `report_stat` compare equal regardless of the region they belong
/// to.
#[derive(Default, Clone)]
struct PeerCmpReadStat {
    pub region_id: u64,
    pub report_stat: u64,
}

impl Ord for PeerCmpReadStat {
    fn cmp(&self, other: &Self) -> cmp::Ordering {
        self.report_stat.cmp(&other.report_stat)
    }
}

impl Eq for PeerCmpReadStat {}

impl PartialEq for PeerCmpReadStat {
    fn eq(&self, other: &Self) -> bool {
        self.report_stat == other.report_stat
    }
}

impl PartialOrd for PeerCmpReadStat {
    // Delegate to `Ord::cmp` so the two orderings cannot drift apart if the
    // comparison key is ever changed.
    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
        Some(self.cmp(other))
    }
}
PeerCmpReadStat::default(); + cmp_stat.region_id = read_stat.region_id; + let mut key_cmp_stat = cmp_stat.clone(); + key_cmp_stat.report_stat = read_stat.read_keys; + keys_topn_report.push(key_cmp_stat); + let mut byte_cmp_stat = cmp_stat.clone(); + byte_cmp_stat.report_stat = read_stat.read_bytes; + bytes_topn_report.push(byte_cmp_stat); + let mut query_cmp_stat = cmp_stat.clone(); + query_cmp_stat.report_stat = get_read_query_num(read_stat.get_query_stats()); + stats_topn_report.push(query_cmp_stat); + } + + for x in keys_topn_report { + if let Some(report_stat) = report_read_stats.remove(&x.region_id) { + stats.peer_stats.push(report_stat); + } + } + + for x in bytes_topn_report { + if let Some(report_stat) = report_read_stats.remove(&x.region_id) { + stats.peer_stats.push(report_stat); + } + } + + for x in stats_topn_report { + if let Some(report_stat) = report_read_stats.remove(&x.region_id) { + stats.peer_stats.push(report_stat); + } + } + stats +} + +fn get_read_query_num(stat: &pdpb::QueryStats) -> u64 { + stat.get_get() + stat.get_coprocessor() + stat.get_scan() +} + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + pub fn handle_store_heartbeat(&mut self, mut stats: pdpb::StoreStats) { + let mut report_peers = HashMap::default(); + for (region_id, region_peer) in &mut self.region_peers { + let read_bytes = region_peer.read_bytes - region_peer.last_store_report_read_bytes; + let read_keys = region_peer.read_keys - region_peer.last_store_report_read_keys; + let query_stats = region_peer + .query_stats + .sub_query_stats(®ion_peer.last_store_report_query_stats); + region_peer.last_store_report_read_bytes = region_peer.read_bytes; + region_peer.last_store_report_read_keys = region_peer.read_keys; + region_peer + .last_store_report_query_stats + .fill_query_stats(®ion_peer.query_stats); + if read_bytes < hotspot_byte_report_threshold() + && read_keys < hotspot_key_report_threshold() + && query_stats.get_read_query_num() < 
hotspot_query_num_report_threshold() + { + continue; + } + let mut read_stat = pdpb::PeerStat::default(); + read_stat.set_region_id(*region_id); + read_stat.set_read_keys(read_keys); + read_stat.set_read_bytes(read_bytes); + read_stat.set_query_stats(query_stats.0); + report_peers.insert(*region_id, read_stat); + } + + stats = collect_report_read_peer_stats(HOTSPOT_REPORT_CAPACITY, report_peers, stats); + let (capacity, used_size, available) = self.collect_engine_size().unwrap_or_default(); + if available == 0 { + warn!(self.logger, "no available space"); + } + + stats.set_capacity(capacity); + stats.set_used_size(used_size); + stats.set_available(available); + stats.set_bytes_read( + self.store_stat.engine_total_bytes_read - self.store_stat.engine_last_total_bytes_read, + ); + stats.set_keys_read( + self.store_stat.engine_total_keys_read - self.store_stat.engine_last_total_keys_read, + ); + + self.store_stat + .engine_total_query_num + .add_query_stats(stats.get_query_stats()); // add write query stat + let res = self + .store_stat + .engine_total_query_num + .sub_query_stats(&self.store_stat.engine_last_query_num); + stats.set_query_stats(res.0); + + stats.set_cpu_usages(self.store_stat.store_cpu_usages.clone().into()); + stats.set_read_io_rates(self.store_stat.store_read_io_rates.clone().into()); + stats.set_write_io_rates(self.store_stat.store_write_io_rates.clone().into()); + + let mut interval = pdpb::TimeInterval::default(); + interval.set_start_timestamp(self.store_stat.last_report_ts.into_inner()); + stats.set_interval(interval); + self.store_stat.engine_last_total_bytes_read = self.store_stat.engine_total_bytes_read; + self.store_stat.engine_last_total_keys_read = self.store_stat.engine_total_keys_read; + self.store_stat + .engine_last_query_num + .fill_query_stats(&self.store_stat.engine_total_query_num); + self.store_stat.last_report_ts = UnixSecs::now(); + self.store_stat.region_bytes_written.flush(); + self.store_stat.region_keys_written.flush(); + 
self.store_stat.region_bytes_read.flush(); + self.store_stat.region_keys_read.flush(); + + STORE_SIZE_GAUGE_VEC + .with_label_values(&["capacity"]) + .set(capacity as i64); + STORE_SIZE_GAUGE_VEC + .with_label_values(&["available"]) + .set(available as i64); + STORE_SIZE_GAUGE_VEC + .with_label_values(&["used"]) + .set(used_size as i64); + + // TODO: slow score + + let resp = self.pd_client.store_heartbeat(stats, None, None); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + error!(logger, "store heartbeat failed"; "err" => ?e); + } + }; + self.remote.spawn(f); + } + + pub fn handle_update_store_infos( + &mut self, + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + ) { + self.store_stat.store_cpu_usages = cpu_usages; + self.store_stat.store_read_io_rates = read_io_rates; + self.store_stat.store_write_io_rates = write_io_rates; + } + + /// Returns (capacity, used, available). + fn collect_engine_size(&self) -> Option<(u64, u64, u64)> { + let disk_stats = match fs2::statvfs(self.tablet_registry.tablet_root()) { + Err(e) => { + error!( + self.logger, + "get disk stat for rocksdb failed"; + "engine_path" => self.tablet_registry.tablet_root().display(), + "err" => ?e + ); + return None; + } + Ok(stats) => stats, + }; + let disk_cap = disk_stats.total_space(); + let capacity = if self.cfg.value().capacity.0 == 0 { + disk_cap + } else { + std::cmp::min(disk_cap, self.cfg.value().capacity.0) + }; + let mut kv_size = 0; + self.tablet_registry.for_each_opened_tablet(|_, cached| { + if let Some(tablet) = cached.latest() { + kv_size += tablet.get_engine_used_size().unwrap_or(0); + } + true + }); + let snap_size = self.snap_mgr.total_snap_size().unwrap(); + let used_size = snap_size + + kv_size + + self + .raft_engine + .get_engine_size() + .expect("raft engine used size"); + let mut available = capacity.checked_sub(used_size).unwrap_or_default(); + // We only care about rocksdb SST file 
size, so we should check disk available + // here. + available = cmp::min(available, disk_stats.available_space()); + Some((capacity, used_size, available)) + } +} diff --git a/components/raftstore-v2/src/worker/tablet.rs b/components/raftstore-v2/src/worker/tablet.rs new file mode 100644 index 00000000000..db09c4ba3be --- /dev/null +++ b/components/raftstore-v2/src/worker/tablet.rs @@ -0,0 +1,484 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + fmt::{self, Display, Formatter}, + path::{Path, PathBuf}, + sync::Arc, + time::Duration, +}; + +use collections::HashMap; +use engine_traits::{DeleteStrategy, KvEngine, Range, TabletContext, TabletRegistry, DATA_CFS}; +use kvproto::{import_sstpb::SstMeta, metapb::Region}; +use slog::{debug, error, info, warn, Logger}; +use sst_importer::SstImporter; +use tikv_util::{ + time::Instant, + worker::{Runnable, RunnableWithTimer}, + yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}, + Either, +}; + +const DEFAULT_BACKGROUND_POOL_SIZE: usize = 6; + +pub enum Task { + Trim { + tablet: EK, + start_key: Box<[u8]>, + end_key: Box<[u8]>, + cb: Box, + }, + PrepareDestroy { + // A path is passed only when the db is never opened. + tablet: Either, + region_id: u64, + wait_for_persisted: u64, + }, + Destroy { + region_id: u64, + persisted_index: u64, + }, + /// Sometimes we know for sure a tablet can be destroyed directly. + DirectDestroy { tablet: Either }, + /// Cleanup ssts. + CleanupImportSst(Box<[SstMeta]>), + /// Flush memtable before split + /// + /// cb is some iff the task is sent from leader, it is used to real propose + /// split when flush finishes + Flush { + region_id: u64, + cb: Option>, + }, +} + +impl Display for Task { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + Task::Trim { + start_key, end_key, .. 
+ } => write!( + f, + "trim tablet for start_key {}, end_key {}", + log_wrappers::Value::key(start_key), + log_wrappers::Value::key(end_key), + ), + Task::PrepareDestroy { + region_id, + wait_for_persisted, + .. + } => write!( + f, + "prepare destroy tablet for region_id {}, wait_for_persisted {}", + region_id, wait_for_persisted, + ), + Task::Destroy { + region_id, + persisted_index, + } => write!( + f, + "destroy tablet for region_id {} persisted_index {}", + region_id, persisted_index, + ), + Task::DirectDestroy { .. } => { + write!(f, "direct destroy tablet") + } + Task::CleanupImportSst(ssts) => { + write!(f, "cleanup import ssts {:?}", ssts) + } + Task::Flush { + region_id, + cb: on_flush_finish, + } => { + write!( + f, + "flush tablet for region_id {}, is leader {}", + region_id, + on_flush_finish.is_some() + ) + } + } + } +} + +impl Task { + #[inline] + pub fn trim(tablet: EK, region: &Region, cb: impl FnOnce() + Send + 'static) -> Self { + Task::Trim { + tablet, + start_key: region.get_start_key().into(), + end_key: region.get_end_key().into(), + cb: Box::new(cb), + } + } + + #[inline] + pub fn prepare_destroy(tablet: EK, region_id: u64, wait_for_persisted: u64) -> Self { + Task::PrepareDestroy { + tablet: Either::Left(tablet), + region_id, + wait_for_persisted, + } + } + + #[inline] + pub fn prepare_destroy_path(path: PathBuf, region_id: u64, wait_for_persisted: u64) -> Self { + Task::PrepareDestroy { + tablet: Either::Right(path), + region_id, + wait_for_persisted, + } + } + + #[inline] + pub fn destroy(region_id: u64, persisted_index: u64) -> Self { + Task::Destroy { + region_id, + persisted_index, + } + } + + #[inline] + pub fn direct_destroy(tablet: EK) -> Self { + Task::DirectDestroy { + tablet: Either::Left(tablet), + } + } + + #[inline] + pub fn direct_destroy_path(path: PathBuf) -> Self { + Task::DirectDestroy { + tablet: Either::Right(path), + } + } +} + +pub struct Runner { + tablet_registry: TabletRegistry, + sst_importer: Arc, + logger: 
Logger, + + // region_id -> [(tablet_path, wait_for_persisted)]. + waiting_destroy_tasks: HashMap>, + pending_destroy_tasks: Vec, + + // An independent pool to run tasks that are time-consuming but doesn't take CPU resources, + // such as waiting for RocksDB compaction. + background_pool: FuturePool, +} + +impl Runner { + pub fn new( + tablet_registry: TabletRegistry, + sst_importer: Arc, + logger: Logger, + ) -> Self { + Self { + tablet_registry, + sst_importer, + logger, + waiting_destroy_tasks: HashMap::default(), + pending_destroy_tasks: Vec::new(), + background_pool: YatpPoolBuilder::new(DefaultTicker::default()) + .name_prefix("tablet-bg") + .thread_count( + 0, + DEFAULT_BACKGROUND_POOL_SIZE, + DEFAULT_BACKGROUND_POOL_SIZE, + ) + .build_future_pool(), + } + } + + fn trim(&self, tablet: EK, start: Box<[u8]>, end: Box<[u8]>, cb: Box) { + let start_key = keys::data_key(&start); + let end_key = keys::data_end_key(&end); + let range1 = Range::new(&[], &start_key); + let range2 = Range::new(&end_key, keys::DATA_MAX_KEY); + if let Err(e) = tablet.delete_ranges_cfs(DeleteStrategy::DeleteFiles, &[range1, range2]) { + error!( + self.logger, + "failed to trim tablet"; + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + "err" => %e, + ); + return; + } + let logger = self.logger.clone(); + self.background_pool + .spawn(async move { + let range1 = Range::new(&[], &start_key); + let range2 = Range::new(&end_key, keys::DATA_MAX_KEY); + for r in [range1, range2] { + // When compaction filter is present, trivial move is disallowed. 
+ if let Err(e) = + tablet.compact_range(Some(r.start_key), Some(r.end_key), false, 1) + { + if e.to_string().contains("Manual compaction paused") { + info!( + logger, + "tablet manual compaction is paused, skip trim"; + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + "err" => %e, + ); + } else { + error!( + logger, + "failed to trim tablet"; + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + "err" => %e, + ); + } + return; + } + } + // drop before callback. + drop(tablet); + cb(); + }) + .unwrap(); + } + + fn pause_background_work(&mut self, tablet: Either) -> PathBuf { + match tablet { + Either::Left(tablet) => { + // The tablet is about to be deleted, flush is a waste and will block destroy. + let _ = tablet.set_db_options(&[("avoid_flush_during_shutdown", "true")]); + let _ = tablet.pause_background_work(); + PathBuf::from(tablet.path()) + } + Either::Right(path) => path, + } + } + + fn prepare_destroy( + &mut self, + region_id: u64, + tablet: Either, + wait_for_persisted: u64, + ) { + let path = self.pause_background_work(tablet); + self.waiting_destroy_tasks + .entry(region_id) + .or_default() + .push((path, wait_for_persisted)); + } + + fn destroy(&mut self, region_id: u64, persisted: u64) { + if let Some(v) = self.waiting_destroy_tasks.get_mut(®ion_id) { + v.retain(|(path, wait)| { + if *wait <= persisted { + if !Self::process_destroy_task(&self.logger, &self.tablet_registry, path) { + self.pending_destroy_tasks.push(path.clone()); + } + return false; + } + true + }); + } + } + + fn direct_destroy(&mut self, tablet: Either) { + let path = self.pause_background_work(tablet); + if !Self::process_destroy_task(&self.logger, &self.tablet_registry, &path) { + self.pending_destroy_tasks.push(path); + } + } + + /// Returns true if task is consumed. Failure is considered consumed. 
/// Flushes the memtables of the latest tablet of `region_id`.
///
/// Returns without doing anything when the registry has no entry for the
/// region or the entry has no latest tablet.
///
/// * With `cb` present, the flush runs on the background pool; once it has
///   finished, the tablet handle is dropped and `cb` is invoked.
/// * With `cb` absent, the flush is issued directly on the calling thread.
///
/// # Panics
///
/// Panics if flushing fails or if the task cannot be spawned on the
/// background pool.
fn flush_tablet(&self, region_id: u64, cb: Option<Box<dyn FnOnce() + Send>>) {
    let Some(Some(tablet)) = self
        .tablet_registry
        .get(region_id)
        .map(|mut cache| cache.latest().cloned()) else {return};

    // The callback `cb` being some means it's the task sent from
    // leader, we should sync flush memtables and call it after the flush complete
    // where the split will be proposed again with extra flag.
    if let Some(cb) = cb {
        let logger = self.logger.clone();
        let now = Instant::now();
        self.background_pool
            .spawn(async move {
                // Sync flush for the leader so that the flush happens before
                // the later checkpoint.
                tablet.flush_cfs(DATA_CFS, true).unwrap();
                let elapsed = now.saturating_elapsed();
                // To be removed once this is stable.
                info!(
                    logger,
                    "flush memtable for leader";
                    "region_id" => region_id,
                    "duration" => ?elapsed,
                );

                // Release the tablet handle before running the callback.
                drop(tablet);
                cb();
            })
            .unwrap();
    } else {
        info!(
            self.logger,
            "flush memtable for follower";
            "region_id" => region_id,
        );

        // NOTE(review): the `false` flag presumably requests a flush without
        // waiting for it to complete — confirm against `flush_cfs`.
        tablet.flush_cfs(DATA_CFS, false).unwrap();
    }
}
TabletRegistry::new(factory, dir.path()).unwrap(); + let logger = slog_global::borrow_global().new(slog::o!()); + let (_dir, importer) = create_tmp_importer(); + let mut runner = Runner::new(registry.clone(), importer, logger); + + let mut region = Region::default(); + let rid = 1; + region.set_id(rid); + region.set_start_key(b"a".to_vec()); + region.set_end_key(b"b".to_vec()); + let tablet = registry + .load(TabletContext::new(®ion, Some(1)), true) + .unwrap() + .latest() + .unwrap() + .clone(); + runner.run(Task::prepare_destroy(tablet.clone(), rid, 10)); + let (tx, rx) = std::sync::mpsc::channel(); + runner.run(Task::trim(tablet, ®ion, move || tx.send(()).unwrap())); + rx.recv().unwrap(); + + let rid = 2; + region.set_id(rid); + region.set_start_key(b"c".to_vec()); + region.set_end_key(b"d".to_vec()); + let tablet = registry + .load(TabletContext::new(®ion, Some(1)), true) + .unwrap() + .latest() + .unwrap() + .clone(); + registry.remove(rid); + runner.run(Task::prepare_destroy(tablet.clone(), rid, 10)); + runner.run(Task::destroy(rid, 100)); + let path = PathBuf::from(tablet.path()); + assert!(path.exists()); + let (tx, rx) = std::sync::mpsc::channel(); + runner.run(Task::trim(tablet, ®ion, move || tx.send(()).unwrap())); + rx.recv().unwrap(); + runner.on_timeout(); + assert!(!path.exists()); + } +} diff --git a/components/raftstore-v2/tests/failpoints/mod.rs b/components/raftstore-v2/tests/failpoints/mod.rs new file mode 100644 index 00000000000..6148cb4eae1 --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/mod.rs @@ -0,0 +1,17 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +#![feature(test)] +#![feature(assert_matches)] +#![feature(custom_test_frameworks)] +#![test_runner(test_util::run_failpoint_tests)] + +#[allow(dead_code)] +#[path = "../integrations/cluster.rs"] +mod cluster; +mod test_basic_write; +mod test_bootstrap; +mod test_bucket; +mod test_life; +mod test_merge; +mod test_split; +mod test_trace_apply; diff --git a/components/raftstore-v2/tests/failpoints/test_basic_write.rs b/components/raftstore-v2/tests/failpoints/test_basic_write.rs new file mode 100644 index 00000000000..55d85b90fa4 --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/test_basic_write.rs @@ -0,0 +1,95 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{assert_matches::assert_matches, time::Duration}; + +use engine_traits::{Peekable, CF_DEFAULT}; +use futures::executor::block_on; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; + +use crate::cluster::Cluster; + +/// Check if write batch is correctly maintained during apply. +#[test] +fn test_write_batch_rollback() { + let mut cluster = Cluster::default(); + let router = &mut cluster.routers[0]; + let header = Box::new(router.new_request_for(2).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); + + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + // Make several entries to batch in apply thread. + fail::cfg("APPLY_COMMITTED_ENTRIES", "pause").unwrap(); + + // Good proposal should be committed. + let (msg, mut sub0) = PeerMsg::simple_write(header.clone(), put.encode()); + router.send(2, msg).unwrap(); + assert!(block_on(sub0.wait_proposed())); + assert!(block_on(sub0.wait_committed())); + + // If the write batch is correctly initialized, next write should not contain + // last result. 
+ put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key1", b"value"); + let (msg, mut sub1) = PeerMsg::simple_write(header.clone(), put.encode()); + router.send(2, msg).unwrap(); + assert!(block_on(sub1.wait_proposed())); + assert!(block_on(sub1.wait_committed())); + + fail::cfg("APPLY_PUT", "1*return()").unwrap(); + // Wake up and sleep in next committed entry. + fail::remove("APPLY_COMMITTED_ENTRIES"); + // First apply will fail due to aborted. If write batch is initialized + // correctly, correct response can be returned. + let resp = block_on(sub0.result()).unwrap(); + assert!( + resp.get_header() + .get_error() + .get_message() + .contains("aborted"), + "{:?}", + resp + ); + let resp = block_on(sub1.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + let snap = router.stale_snapshot(2); + assert_matches!(snap.get_value(b"key"), Ok(None)); + assert_eq!(snap.get_value(b"key1").unwrap().unwrap(), b"value"); + + fail::cfg("APPLY_COMMITTED_ENTRIES", "pause").unwrap(); + + // Trigger error again, so an initialized write batch should be rolled back. + put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key2", b"value"); + let (msg, mut sub0) = PeerMsg::simple_write(header.clone(), put.encode()); + router.send(2, msg).unwrap(); + assert!(block_on(sub0.wait_proposed())); + assert!(block_on(sub0.wait_committed())); + + // If the write batch is correctly rollbacked, next write should not contain + // last result. 
+ put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key3", b"value"); + let (msg, mut sub1) = PeerMsg::simple_write(header, put.encode()); + router.send(2, msg).unwrap(); + assert!(block_on(sub1.wait_proposed())); + assert!(block_on(sub1.wait_committed())); + + fail::cfg("APPLY_PUT", "1*return()").unwrap(); + fail::remove("APPLY_COMMITTED_ENTRIES"); + let resp = block_on(sub0.result()).unwrap(); + assert!( + resp.get_header() + .get_error() + .get_message() + .contains("aborted"), + "{:?}", + resp + ); + let resp = block_on(sub1.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let snap = router.stale_snapshot(2); + assert_matches!(snap.get_value(b"key2"), Ok(None)); + assert_eq!(snap.get_value(b"key3").unwrap().unwrap(), b"value"); +} diff --git a/components/raftstore-v2/tests/failpoints/test_bootstrap.rs b/components/raftstore-v2/tests/failpoints/test_bootstrap.rs new file mode 100644 index 00000000000..f56078a59f5 --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/test_bootstrap.rs @@ -0,0 +1,61 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::assert_matches::assert_matches; + +use engine_traits::RaftEngineReadOnly; +use kvproto::metapb::Store; +use raftstore_v2::Bootstrap; +use slog::o; +use tempfile::TempDir; + +#[test] +fn test_bootstrap_half_way_failure() { + let server = test_pd::Server::new(1); + let eps = server.bind_addrs(); + let pd_client = test_pd::util::new_client(eps, None); + let path = TempDir::new().unwrap(); + let engines = engine_test::new_temp_engine(&path); + let bootstrap = || { + let logger = slog_global::borrow_global().new(o!()); + let mut bootstrap = Bootstrap::new(&engines.raft, 0, &pd_client, logger); + match bootstrap.bootstrap_store() { + Ok(store_id) => { + let mut store = Store::default(); + store.set_id(store_id); + bootstrap.bootstrap_first_region(&store, store_id) + } + Err(e) => Err(e), + } + }; + + // Try to start this node, return after persisted some keys. + fail::cfg("node_after_bootstrap_store", "return").unwrap(); + let s = format!("{}", bootstrap().unwrap_err()); + assert!(s.contains("node_after_bootstrap_store"), "{}", s); + assert_matches!(engines.raft.get_prepare_bootstrap_region(), Ok(None)); + + let ident = engines.raft.get_store_ident().unwrap().unwrap(); + assert_ne!(ident.get_store_id(), 0); + + // Check whether it can bootstrap cluster successfully. + fail::remove("node_after_bootstrap_store"); + fail::cfg("node_after_prepare_bootstrap_cluster", "return").unwrap(); + let s = format!("{}", bootstrap().unwrap_err()); + assert!(s.contains("node_after_prepare_bootstrap_cluster"), "{}", s); + assert_matches!(engines.raft.get_prepare_bootstrap_region(), Ok(Some(_))); + + fail::remove("node_after_prepare_bootstrap_cluster"); + fail::cfg("node_after_bootstrap_cluster", "return").unwrap(); + let s = format!("{}", bootstrap().unwrap_err()); + assert!(s.contains("node_after_bootstrap_cluster"), "{}", s); + assert_matches!(engines.raft.get_prepare_bootstrap_region(), Ok(Some(_))); + + // Although aborted by error, rebootstrap should continue. 
+ bootstrap().unwrap().unwrap(); + assert_matches!(engines.raft.get_prepare_bootstrap_region(), Ok(None)); + + // Second bootstrap should be noop. + assert_eq!(bootstrap().unwrap(), None); + + assert_matches!(engines.raft.get_prepare_bootstrap_region(), Ok(None)); +} diff --git a/components/raftstore-v2/tests/failpoints/test_bucket.rs b/components/raftstore-v2/tests/failpoints/test_bucket.rs new file mode 100644 index 00000000000..f136cf6dc53 --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/test_bucket.rs @@ -0,0 +1,58 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::time::Duration; + +use engine_traits::RaftEngineReadOnly; +use raftstore::store::RAFT_INIT_LOG_INDEX; +use tikv_util::store::new_peer; + +use crate::cluster::{split_helper::split_region_and_refresh_bucket, Cluster}; + +/// Test refresh bucket. +#[test] +fn test_refresh_bucket() { + let mut cluster = Cluster::default(); + let store_id = cluster.node(0).id(); + let raft_engine = cluster.node(0).running_state().unwrap().raft_engine.clone(); + let router = &mut cluster.routers[0]; + + let region_2 = 2; + let region = router.region_detail(region_2); + let peer = region.get_peers()[0].clone(); + router.wait_applied_to_current_term(region_2, Duration::from_secs(3)); + + // Region 2 ["", ""] + // -> Region 2 ["", "k22"] + // Region 1000 ["k22", ""] peer(1, 10) + let region_state = raft_engine + .get_region_state(region_2, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + + // to simulate the delay of set_apply_scheduler + fail::cfg("delay_set_apply_scheduler", "sleep(1000)").unwrap(); + split_region_and_refresh_bucket( + router, + region, + peer, + 1000, + new_peer(store_id, 10), + b"k22", + false, + ); + + for _i in 1..100 { + std::thread::sleep(Duration::from_millis(50)); + let meta = router + .must_query_debug_info(1000, Duration::from_secs(1)) + .unwrap(); + if !meta.bucket_keys.is_empty() { + 
assert_eq!(meta.bucket_keys.len(), 4); // include region start/end keys + assert_eq!(meta.bucket_keys[1], b"1".to_vec()); + assert_eq!(meta.bucket_keys[2], b"2".to_vec()); + return; + } + } + panic!("timeout for updating buckets"); // timeout +} diff --git a/components/raftstore-v2/tests/failpoints/test_life.rs b/components/raftstore-v2/tests/failpoints/test_life.rs new file mode 100644 index 00000000000..ed05c1c6fad --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/test_life.rs @@ -0,0 +1,67 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::time::Duration; + +use engine_traits::CF_DEFAULT; +use futures::executor::block_on; +use kvproto::raft_serverpb::RaftMessage; +use raft::prelude::MessageType; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; +use tikv_util::store::new_peer; + +use crate::cluster::{life_helper::assert_peer_not_exist, Cluster}; + +/// Test if a peer can be destroyed when it's applying entries +#[test] +fn test_destroy_by_larger_id_while_applying() { + let fp = "APPLY_COMMITTED_ENTRIES"; + let mut cluster = Cluster::default(); + let router = &cluster.routers[0]; + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + + fail::cfg(fp, "pause").unwrap(); + + let header = Box::new(router.new_request_for(2).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); + let (msg, mut sub) = PeerMsg::simple_write(header.clone(), put.clone().encode()); + router.send(2, msg).unwrap(); + assert!(block_on(sub.wait_committed())); + + let mut larger_id_msg = Box::::default(); + larger_id_msg.set_region_id(2); + let mut target_peer = header.get_peer().clone(); + target_peer.set_id(target_peer.get_id() + 1); + larger_id_msg.set_to_peer(target_peer.clone()); + larger_id_msg.set_region_epoch(header.get_region_epoch().clone()); + larger_id_msg + .mut_region_epoch() + .set_conf_ver(header.get_region_epoch().get_conf_ver() + 1); + 
larger_id_msg.set_from_peer(new_peer(2, 8)); + let raft_message = larger_id_msg.mut_message(); + raft_message.set_msg_type(MessageType::MsgHeartbeat); + raft_message.set_from(8); + raft_message.set_to(target_peer.get_id()); + raft_message.set_term(10); + + // Larger ID should trigger destroy. + router.send_raft_message(larger_id_msg).unwrap(); + fail::remove(fp); + assert_peer_not_exist(2, header.get_peer().get_id(), router); + let meta = router + .must_query_debug_info(2, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.raft_status.id, target_peer.get_id()); + assert_eq!(meta.raft_status.hard_state.term, 10); + + std::thread::sleep(Duration::from_millis(10)); + + // New peer should survive restart. + cluster.restart(0); + let router = &cluster.routers[0]; + let meta = router + .must_query_debug_info(2, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.raft_status.id, target_peer.get_id()); + assert_eq!(meta.raft_status.hard_state.term, 10); +} diff --git a/components/raftstore-v2/tests/failpoints/test_merge.rs b/components/raftstore-v2/tests/failpoints/test_merge.rs new file mode 100644 index 00000000000..d660221d5ee --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/test_merge.rs @@ -0,0 +1,109 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::time::Duration; + +use engine_traits::Peekable; +use tikv_util::store::new_peer; + +use crate::cluster::{ + life_helper::assert_peer_not_exist, merge_helper::merge_region, split_helper::split_region, + Cluster, +}; + +#[test] +fn test_source_and_target_both_replay() { + let mut cluster = Cluster::default(); + let store_id = cluster.node(0).id(); + let router = &mut cluster.routers[0]; + + let region_1 = router.region_detail(2); + let peer_1 = region_1.get_peers()[0].clone(); + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + let peer_2 = new_peer(store_id, peer_1.get_id() + 1); + let region_1_id = region_1.get_id(); + let region_2_id = region_1_id + 1; + let (region_1, region_2) = split_region( + router, + region_1, + peer_1.clone(), + region_2_id, + peer_2, + Some(format!("k{}k", region_1_id).as_bytes()), + Some(format!("k{}k", region_2_id).as_bytes()), + format!("k{}", region_2_id).as_bytes(), + format!("k{}", region_2_id).as_bytes(), + false, + ); + + { + let _fp = fail::FailGuard::new("after_acquire_source_checkpoint", "1*return->off"); + merge_region(&cluster, 0, region_1, peer_1, region_2, false); + } + + cluster.restart(0); + let router = &mut cluster.routers[0]; + // Wait for replay. + let mut retry = 0; + while retry < 50 { + // Read region 1 data from region 2. 
+ let snapshot = router.stale_snapshot(region_2_id); + let key = format!("k{region_1_id}k"); + if let Ok(Some(_)) = snapshot.get_value(key.as_bytes()) { + return; + } + retry += 1; + std::thread::sleep(Duration::from_millis(100)); + } + panic!("merge not replayed after 5s"); +} + +#[test] +fn test_source_destroy_before_target_apply() { + let mut cluster = Cluster::default(); + let store_id = cluster.node(0).id(); + let router = &mut cluster.routers[0]; + + let region_1 = router.region_detail(2); + let peer_1 = region_1.get_peers()[0].clone(); + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + let peer_2 = new_peer(store_id, peer_1.get_id() + 1); + let region_1_id = region_1.get_id(); + let region_2_id = region_1_id + 1; + let (region_1, region_2) = split_region( + router, + region_1, + peer_1.clone(), + region_2_id, + peer_2, + Some(format!("k{}k", region_1_id).as_bytes()), + Some(format!("k{}k", region_2_id).as_bytes()), + format!("k{}", region_2_id).as_bytes(), + format!("k{}", region_2_id).as_bytes(), + false, + ); + + { + // Sending CatchUpLogs will make source destroy early (without waiting for + // AckCommitMerge). + let _fp1 = fail::FailGuard::new("force_send_catch_up_logs", "1*return->off"); + let _fp2 = fail::FailGuard::new("after_acquire_source_checkpoint", "1*return->off"); + merge_region(&cluster, 0, region_1, peer_1.clone(), region_2, false); + } + assert_peer_not_exist(region_1_id, peer_1.get_id(), &cluster.routers[0]); + + cluster.restart(0); + let router = &mut cluster.routers[0]; + // Wait for replay. + let mut retry = 0; + while retry < 50 { + // Read region 1 data from region 2. 
+ let snapshot = router.stale_snapshot(region_2_id); + let key = format!("k{region_1_id}k"); + if let Ok(Some(_)) = snapshot.get_value(key.as_bytes()) { + return; + } + retry += 1; + std::thread::sleep(Duration::from_millis(100)); + } + panic!("merge not replayed after 5s"); +} diff --git a/components/raftstore-v2/tests/failpoints/test_split.rs b/components/raftstore-v2/tests/failpoints/test_split.rs new file mode 100644 index 00000000000..e67041ab181 --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/test_split.rs @@ -0,0 +1,109 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + thread, + time::{Duration, Instant}, +}; + +use engine_traits::{RaftEngineReadOnly, CF_DEFAULT}; +use futures::executor::block_on; +use raftstore::store::RAFT_INIT_LOG_INDEX; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; + +use crate::cluster::{split_helper::split_region, Cluster}; + +/// If a node is restarted after metadata is persisted before tablet is not +/// installed, it should resume install the tablet. 
+#[test] +fn test_restart_resume() { + let mut cluster = Cluster::default(); + let raft_engine = cluster.node(0).running_state().unwrap().raft_engine.clone(); + let router = &mut cluster.routers[0]; + + let region_id = 2; + let region = router.region_detail(region_id); + let peer = region.get_peers()[0].clone(); + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + + let fp = "async_write_before_cb"; + fail::cfg(fp, "return").unwrap(); + + let split_region_id = 1000; + let mut new_peer = peer.clone(); + new_peer.set_id(1001); + split_region( + router, + region, + peer, + split_region_id, + new_peer, + None, + None, + b"k11", + b"k11", + true, + ); + + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"k22", b"value"); + let header = Box::new(router.new_request_for(region_id).take_header()); + let (msg, mut sub) = PeerMsg::simple_write(header, put.encode()); + router.send(region_id, msg).unwrap(); + // Send a command to ensure split init is triggered. + block_on(sub.wait_proposed()); + + let region_state = raft_engine + .get_region_state(split_region_id, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + let path = cluster + .node(0) + .tablet_registry() + .tablet_path(split_region_id, RAFT_INIT_LOG_INDEX); + assert!(!path.exists(), "{} should not exist", path.display()); + drop(raft_engine); + + cluster.restart(0); + // If split is resumed, the tablet should be installed. + assert!( + path.exists(), + "{} should exist after restart", + path.display() + ); + + // Both region should be recovered correctly. + let cases = vec![ + (split_region_id, b"k01", b"v01"), + (region_id, b"k21", b"v21"), + ]; + let router = &mut cluster.routers[0]; + let new_epoch = router + .new_request_for(split_region_id) + .take_header() + .take_region_epoch(); + // Split will be resumed for region 2, not removing the fp will make write block + // forever. 
+ fail::remove(fp); + let timer = Instant::now(); + for (region_id, key, val) in cases { + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, val); + let mut header = Box::new(router.new_request_for(region_id).take_header()); + while timer.elapsed() < Duration::from_secs(3) { + // We need to wait till source peer replay split. + if *header.get_region_epoch() != new_epoch { + thread::sleep(Duration::from_millis(100)); + header = Box::new(router.new_request_for(region_id).take_header()); + continue; + } + break; + } + assert_eq!(*header.get_region_epoch(), new_epoch, "{:?}", header); + let (msg, sub) = PeerMsg::simple_write(header, put.encode()); + router.send(region_id, msg).unwrap(); + // Send a command to ensure split init is triggered. + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + } +} diff --git a/components/raftstore-v2/tests/failpoints/test_trace_apply.rs b/components/raftstore-v2/tests/failpoints/test_trace_apply.rs new file mode 100644 index 00000000000..15bf39d17ba --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/test_trace_apply.rs @@ -0,0 +1,7 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +// TODO: check if it can recover from: +// - split not start +// - split not finish +// - two pending split the second one finished before the first one +// - all split finish diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs new file mode 100644 index 00000000000..83cf3646b9b --- /dev/null +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -0,0 +1,972 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + ops::{Deref, DerefMut}, + path::Path, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, + thread, + time::{Duration, Instant}, +}; + +use causal_ts::CausalTsProviderImpl; +use collections::HashSet; +use concurrency_manager::ConcurrencyManager; +use crossbeam::channel::{self, Receiver, Sender, TrySendError}; +use engine_test::{ + ctor::{CfOptions, DbOptions}, + kv::{KvTestEngine, KvTestSnapshot, TestTabletFactory}, + raft::RaftTestEngine, +}; +use engine_traits::{TabletContext, TabletRegistry, DATA_CFS}; +use futures::executor::block_on; +use kvproto::{ + kvrpcpb::ApiVersion, + metapb::{self, RegionEpoch, Store}, + raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader, Request}, + raft_serverpb::RaftMessage, +}; +use pd_client::RpcClient; +use raft::eraftpb::MessageType; +use raftstore::{ + coprocessor::{Config as CopConfig, CoprocessorHost, StoreHandle}, + store::{ + region_meta::{RegionLocalState, RegionMeta}, + AutoSplitController, Bucket, Config, RegionSnapshot, TabletSnapKey, TabletSnapManager, + Transport, RAFT_INIT_LOG_INDEX, + }, +}; +use raftstore_v2::{ + create_store_batch_system, + router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter, StoreMsg}, + Bootstrap, SimpleWriteEncoder, StateStorage, StoreSystem, +}; +use resource_metering::CollectorRegHandle; +use slog::{debug, o, Logger}; +use sst_importer::SstImporter; +use tempfile::TempDir; +use test_pd::mocker::Service; +use tikv_util::{ + config::{ReadableDuration, ReadableSize, VersionTrack}, + store::new_peer, + worker::{LazyWorker, Worker}, +}; +use txn_types::WriteBatchFlags; + +pub fn check_skip_wal(path: &str) { + let mut found = false; + for f in std::fs::read_dir(path).unwrap() { + let e = f.unwrap(); + if e.path().extension().map_or(false, |ext| ext == "log") { + found = true; + assert_eq!(e.metadata().unwrap().len(), 0, "{}", e.path().display()); + } + } + assert!(found, "no WAL found in {}", path); +} + +#[derive(Clone)] +pub struct 
TestRouter(RaftRouter); + +impl Deref for TestRouter { + type Target = RaftRouter; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for TestRouter { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl TestRouter { + pub fn query(&self, region_id: u64, req: RaftCmdRequest) -> Option { + let (msg, sub) = PeerMsg::raft_query(req); + self.send(region_id, msg).unwrap(); + block_on(sub.result()) + } + + pub fn must_query_debug_info(&self, region_id: u64, timeout: Duration) -> Option { + let timer = Instant::now(); + while timer.elapsed() < timeout { + let (ch, sub) = DebugInfoChannel::pair(); + let msg = PeerMsg::QueryDebugInfo(ch); + let res = self.send(region_id, msg); + if res.is_err() { + thread::sleep(Duration::from_millis(10)); + continue; + } + let res = block_on(sub.result()); + if res.is_some() { + return res; + } + } + None + } + + pub fn simple_write( + &self, + region_id: u64, + header: Box, + write: SimpleWriteEncoder, + ) -> Option { + let (msg, sub) = PeerMsg::simple_write(header, write.encode()); + self.send(region_id, msg).unwrap(); + block_on(sub.result()) + } + + pub fn admin_command(&self, region_id: u64, req: RaftCmdRequest) -> Option { + let (msg, sub) = PeerMsg::admin_command(req); + self.send(region_id, msg).unwrap(); + block_on(sub.result()) + } + + pub fn wait_flush(&self, region_id: u64, timeout: Duration) -> bool { + let timer = Instant::now(); + while timer.elapsed() < timeout { + let (ch, sub) = FlushChannel::pair(); + let res = self.send(region_id, PeerMsg::WaitFlush(ch)); + match res { + Ok(_) => return block_on(sub.result()).is_some(), + Err(TrySendError::Disconnected(m)) => { + let PeerMsg::WaitFlush(ch) = m else { unreachable!() }; + match self + .store_router() + .send_control(StoreMsg::WaitFlush { region_id, ch }) + { + Ok(_) => return block_on(sub.result()).is_some(), + Err(_) => return false, + } + } + Err(TrySendError::Full(_)) => thread::sleep(Duration::from_millis(10)), + } + } + 
panic!("unable to flush {}", region_id); + } + + pub fn wait_applied_to_current_term(&self, region_id: u64, timeout: Duration) { + let mut now = Instant::now(); + let deadline = now + timeout; + let mut res = None; + while now < deadline { + res = self.must_query_debug_info(region_id, deadline - now); + if let Some(info) = &res { + // If term matches and apply to commit index, then it must apply to current + // term. + if info.raft_apply.applied_index == info.raft_apply.commit_index + && info.raft_apply.commit_term == info.raft_status.hard_state.term + { + return; + } + } + thread::sleep(Duration::from_millis(10)); + now = Instant::now(); + } + panic!( + "region {} is not applied to current term, {:?}", + region_id, res + ); + } + + pub fn new_request_for(&self, region_id: u64) -> RaftCmdRequest { + let meta = self + .must_query_debug_info(region_id, Duration::from_secs(1)) + .unwrap(); + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + let epoch = req.mut_header().mut_region_epoch(); + let epoch_meta = &meta.region_state.epoch; + epoch.set_version(epoch_meta.version); + epoch.set_conf_ver(epoch_meta.conf_ver); + let target_peer = *meta + .region_state + .peers + .iter() + .find(|p| p.id == meta.raft_status.id) + .unwrap(); + let mut peer = new_peer(target_peer.store_id, target_peer.id); + peer.role = target_peer.role.into(); + req.mut_header().set_peer(peer); + req.mut_header().set_term(meta.raft_status.hard_state.term); + req + } + + pub fn stale_snapshot(&mut self, region_id: u64) -> RegionSnapshot { + let mut req = self.new_request_for(region_id); + let header = req.mut_header(); + header.set_flags(WriteBatchFlags::STALE_READ.bits()); + header.set_flag_data(vec![0; 8]); + let mut snap_req = Request::default(); + snap_req.set_cmd_type(CmdType::Snap); + req.mut_requests().push(snap_req); + block_on(self.snapshot(req)).unwrap() + } + + pub fn region_detail(&self, region_id: u64) -> metapb::Region { + let RegionLocalState { + 
id, + start_key, + end_key, + epoch, + peers, + .. + } = self + .must_query_debug_info(region_id, Duration::from_secs(1)) + .unwrap() + .region_state; + let mut region = metapb::Region::default(); + region.set_id(id); + region.set_start_key(start_key); + region.set_end_key(end_key); + let mut region_epoch = RegionEpoch::default(); + region_epoch.set_conf_ver(epoch.conf_ver); + region_epoch.set_version(epoch.version); + region.set_region_epoch(region_epoch); + for peer in peers { + region.mut_peers().push(new_peer(peer.store_id, peer.id)); + } + region + } + + pub fn refresh_bucket(&self, region_id: u64, region_epoch: RegionEpoch, buckets: Vec) { + self.store_router() + .refresh_region_buckets(region_id, region_epoch, buckets, None); + } +} + +pub struct RunningState { + store_id: u64, + pub raft_engine: RaftTestEngine, + pub registry: TabletRegistry, + pub system: StoreSystem, + pub cfg: Arc>, + pub cop_cfg: Arc>, + pub transport: TestTransport, + snap_mgr: TabletSnapManager, + background: Worker, +} + +impl RunningState { + fn new( + pd_client: &Arc, + path: &Path, + cfg: Arc>, + cop_cfg: Arc>, + transport: TestTransport, + concurrency_manager: ConcurrencyManager, + causal_ts_provider: Option>, + logger: &Logger, + ) -> (TestRouter, Self) { + // TODO(tabokie): Enable encryption by default. 
(after snapshot encryption) + // let encryption_cfg = test_util::new_file_security_config(path); + // let key_manager = Some(Arc::new( + // data_key_manager_from_config(&encryption_cfg, path.to_str().unwrap()) + // .unwrap() + // .unwrap(), + // )); + let key_manager = None; + + let mut opts = engine_test::ctor::RaftDbOptions::default(); + opts.set_key_manager(key_manager.clone()); + let raft_engine = + engine_test::raft::new_engine(&format!("{}", path.join("raft").display()), Some(opts)) + .unwrap(); + + let mut bootstrap = Bootstrap::new(&raft_engine, 0, pd_client.as_ref(), logger.clone()); + let store_id = bootstrap.bootstrap_store().unwrap(); + let mut store = Store::default(); + store.set_id(store_id); + + let (router, mut system) = create_store_batch_system::( + &cfg.value(), + store_id, + logger.clone(), + ); + let cf_opts = DATA_CFS + .iter() + .copied() + .map(|cf| (cf, CfOptions::default())) + .collect(); + let mut db_opt = DbOptions::default(); + db_opt.set_state_storage(Arc::new(StateStorage::new( + raft_engine.clone(), + router.clone(), + ))); + db_opt.set_key_manager(key_manager.clone()); + let factory = Box::new(TestTabletFactory::new(db_opt, cf_opts)); + let registry = TabletRegistry::new(factory, path.join("tablets")).unwrap(); + if let Some(region) = bootstrap.bootstrap_first_region(&store, store_id).unwrap() { + let factory = registry.tablet_factory(); + let path = registry.tablet_path(region.get_id(), RAFT_INIT_LOG_INDEX); + let ctx = TabletContext::new(®ion, Some(RAFT_INIT_LOG_INDEX)); + if factory.exists(&path) { + registry.remove(region.get_id()); + factory.destroy_tablet(ctx.clone(), &path).unwrap(); + } + // Create the tablet without loading it in cache. 
+ factory.open_tablet(ctx, &path).unwrap(); + } + + let router = RaftRouter::new(store_id, router); + let store_meta = router.store_meta().clone(); + let snap_mgr = TabletSnapManager::new( + path.join("tablets_snap").to_str().unwrap(), + key_manager.clone(), + ) + .unwrap(); + let coprocessor_host = + CoprocessorHost::new(router.store_router().clone(), cop_cfg.value().clone()); + let importer = Arc::new( + SstImporter::new( + &Default::default(), + path.join("importer"), + key_manager.clone(), + ApiVersion::V1, + ) + .unwrap(), + ); + + let background = Worker::new("background"); + let pd_worker = LazyWorker::new("pd-worker"); + system + .start( + store_id, + cfg.clone(), + raft_engine.clone(), + registry.clone(), + transport.clone(), + pd_client.clone(), + router.store_router(), + store_meta, + snap_mgr.clone(), + concurrency_manager, + causal_ts_provider, + coprocessor_host, + AutoSplitController::default(), + CollectorRegHandle::new_for_test(), + background.clone(), + pd_worker, + importer, + key_manager, + ) + .unwrap(); + + let state = Self { + store_id, + raft_engine, + registry, + system, + cfg, + transport, + snap_mgr, + background, + cop_cfg, + }; + (TestRouter(router), state) + } +} + +impl Drop for RunningState { + fn drop(&mut self) { + self.system.shutdown(); + self.background.stop(); + } +} + +pub struct TestNode { + pd_client: Arc, + path: TempDir, + running_state: Option, + logger: Logger, +} + +impl TestNode { + fn with_pd(pd_server: &test_pd::Server, logger: Logger) -> TestNode { + let pd_client = Arc::new(test_pd::util::new_client(pd_server.bind_addrs(), None)); + let path = TempDir::new().unwrap(); + TestNode { + pd_client, + path, + running_state: None, + logger, + } + } + + fn start( + &mut self, + cfg: Arc>, + cop_cfg: Arc>, + trans: TestTransport, + ) -> TestRouter { + let (router, state) = RunningState::new( + &self.pd_client, + self.path.path(), + cfg, + cop_cfg, + trans, + ConcurrencyManager::new(1.into()), + None, + &self.logger, + ); + 
self.running_state = Some(state); + router + } + + #[allow(dead_code)] + pub fn tablet_registry(&self) -> &TabletRegistry { + &self.running_state().unwrap().registry + } + + pub fn pd_client(&self) -> &Arc { + &self.pd_client + } + + fn stop(&mut self) { + self.running_state.take(); + } + + fn restart(&mut self) -> TestRouter { + let state = self.running_state().unwrap(); + let prev_transport = state.transport.clone(); + let cfg = state.cfg.clone(); + let cop_cfg = state.cop_cfg.clone(); + self.stop(); + self.start(cfg, cop_cfg, prev_transport) + } + + pub fn running_state(&self) -> Option<&RunningState> { + self.running_state.as_ref() + } + + pub fn id(&self) -> u64 { + self.running_state().unwrap().store_id + } +} + +impl Drop for TestNode { + fn drop(&mut self) { + self.stop(); + } +} + +#[derive(Clone)] +pub struct TestTransport { + tx: Sender, + flush_cnt: Arc, +} + +pub fn new_test_transport() -> (TestTransport, Receiver) { + let (tx, rx) = channel::unbounded(); + let flush_cnt = Default::default(); + (TestTransport { tx, flush_cnt }, rx) +} + +impl Transport for TestTransport { + fn send(&mut self, msg: RaftMessage) -> raftstore_v2::Result<()> { + let _ = self.tx.send(msg); + Ok(()) + } + + fn set_store_allowlist(&mut self, _stores: Vec) {} + + fn need_flush(&self) -> bool { + !self.tx.is_empty() + } + + fn flush(&mut self) { + self.flush_cnt.fetch_add(1, Ordering::SeqCst); + } +} + +// TODO: remove following when we finally integrate it in tikv-server binary. +pub fn v2_default_config() -> Config { + let mut config = Config::default(); + config.store_io_pool_size = 1; + if config.region_split_check_diff.is_none() { + config.region_split_check_diff = Some(ReadableSize::mb(96 / 16)); + } + config +} + +/// Disable all ticks, so test case can schedule manually. 
+pub fn disable_all_auto_ticks(cfg: &mut Config) { + cfg.raft_base_tick_interval = ReadableDuration::ZERO; + cfg.raft_log_gc_tick_interval = ReadableDuration::ZERO; + cfg.raft_log_compact_sync_interval = ReadableDuration::ZERO; + cfg.raft_engine_purge_interval = ReadableDuration::ZERO; + cfg.split_region_check_tick_interval = ReadableDuration::ZERO; + cfg.region_compact_check_interval = ReadableDuration::ZERO; + cfg.pd_heartbeat_tick_interval = ReadableDuration::ZERO; + cfg.pd_store_heartbeat_tick_interval = ReadableDuration::ZERO; + cfg.snap_mgr_gc_tick_interval = ReadableDuration::ZERO; + cfg.lock_cf_compact_interval = ReadableDuration::ZERO; + cfg.peer_stale_state_check_interval = ReadableDuration::ZERO; + cfg.consistency_check_interval = ReadableDuration::ZERO; + cfg.report_region_flow_interval = ReadableDuration::ZERO; + cfg.check_leader_lease_interval = ReadableDuration::ZERO; + cfg.merge_check_tick_interval = ReadableDuration::ZERO; + cfg.cleanup_import_sst_interval = ReadableDuration::ZERO; + cfg.inspect_interval = ReadableDuration::ZERO; + cfg.report_min_resolved_ts_interval = ReadableDuration::ZERO; + cfg.reactive_memory_lock_tick_interval = ReadableDuration::ZERO; + cfg.report_region_buckets_tick_interval = ReadableDuration::ZERO; + cfg.check_long_uncommitted_interval = ReadableDuration::ZERO; +} + +pub struct Cluster { + pd_server: test_pd::Server, + nodes: Vec, + receivers: Vec>, + pub routers: Vec, + logger: Logger, +} + +impl Default for Cluster { + fn default() -> Cluster { + Cluster::with_node_count(1, None) + } +} + +impl Cluster { + pub fn with_config(config: Config) -> Cluster { + Cluster::with_node_count(1, Some(config)) + } + + pub fn with_node_count(count: usize, config: Option) -> Self { + Cluster::with_configs(count, config, None) + } + + pub fn with_cop_cfg(config: Option, coprocessor_cfg: CopConfig) -> Cluster { + Cluster::with_configs(1, config, Some(coprocessor_cfg)) + } + + pub fn with_configs(count: usize, config: Option, cop_cfg: 
Option) -> Self { + let pd_server = test_pd::Server::new(1); + let logger = slog_global::borrow_global().new(o!()); + let mut cluster = Cluster { + pd_server, + nodes: vec![], + receivers: vec![], + routers: vec![], + logger, + }; + let mut cfg = if let Some(config) = config { + config + } else { + v2_default_config() + }; + disable_all_auto_ticks(&mut cfg); + let cop_cfg = cop_cfg.unwrap_or_default(); + for _ in 1..=count { + let mut node = TestNode::with_pd(&cluster.pd_server, cluster.logger.clone()); + let (tx, rx) = new_test_transport(); + let router = node.start( + Arc::new(VersionTrack::new(cfg.clone())), + Arc::new(VersionTrack::new(cop_cfg.clone())), + tx, + ); + cluster.nodes.push(node); + cluster.receivers.push(rx); + cluster.routers.push(router); + } + cluster + } + + pub fn restart(&mut self, offset: usize) { + self.routers.remove(offset); + let router = self.nodes[offset].restart(); + self.routers.insert(offset, router); + } + + pub fn node(&self, offset: usize) -> &TestNode { + &self.nodes[offset] + } + + pub fn receiver(&self, offset: usize) -> &Receiver { + &self.receivers[offset] + } + + /// Send messages and wait for side effects are all handled. + #[allow(clippy::vec_box)] + pub fn dispatch(&self, region_id: u64, mut msgs: Vec>) { + let mut regions = HashSet::default(); + regions.insert(region_id); + loop { + for msg in msgs.drain(..) { + let offset = match self + .nodes + .iter() + .position(|n| n.id() == msg.get_to_peer().get_store_id()) + { + Some(offset) => offset, + None => { + debug!(self.logger, "failed to find node"; "message" => ?msg); + continue; + } + }; + // Simulate already received the snapshot. 
+ if msg.get_message().get_msg_type() == MessageType::MsgSnapshot { + let from_offset = match self + .nodes + .iter() + .position(|n| n.id() == msg.get_from_peer().get_store_id()) + { + Some(offset) => offset, + None => { + debug!(self.logger, "failed to find snapshot source node"; "message" => ?msg); + continue; + } + }; + let key = TabletSnapKey::new( + region_id, + msg.get_to_peer().get_id(), + msg.get_message().get_snapshot().get_metadata().get_term(), + msg.get_message().get_snapshot().get_metadata().get_index(), + ); + let from_snap_mgr = &self.node(from_offset).running_state().unwrap().snap_mgr; + let to_snap_mgr = &self.node(offset).running_state().unwrap().snap_mgr; + let gen_path = from_snap_mgr.tablet_gen_path(&key); + let recv_path = to_snap_mgr.final_recv_path(&key); + assert!(gen_path.exists()); + std::fs::rename(gen_path, recv_path.clone()).unwrap(); + assert!(recv_path.exists()); + } + regions.insert(msg.get_region_id()); + if let Err(e) = self.routers[offset].send_raft_message(msg) { + debug!(self.logger, "failed to send raft message"; "err" => ?e); + } + } + // Wait for every touched region to flush on every router, then collect the + // messages the nodes produced for the next round. + for (router, rx) in self.routers.iter().zip(&self.receivers) { + for region_id in &regions { + router.wait_flush(*region_id, Duration::from_secs(3)); + } + while let Ok(msg) = rx.try_recv() { + msgs.push(Box::new(msg)); + } + } + regions.clear(); + if msgs.is_empty() { + return; + } + } + } +} + +impl Drop for Cluster { + fn drop(&mut self) { + // Clear the routers first, then stop every node. + self.routers.clear(); + for node in &mut self.nodes { + node.stop(); + } + } +} + +pub mod split_helper { + use std::{thread, time::Duration}; + + use engine_traits::CF_DEFAULT; + use futures::executor::block_on; + use kvproto::{ + metapb, pdpb, + raft_cmdpb::{AdminCmdType, AdminRequest, RaftCmdRequest, RaftCmdResponse, SplitRequest}, + }; + use raftstore::store::Bucket; + use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; + + use super::TestRouter; + + /// Builds a `BatchSplit` admin request with one `SplitRequest` per + /// (`ids[i]`, `split_keys[i]`) pair. + pub fn new_batch_split_region_request( + split_keys: Vec<Vec<u8>>, + ids: Vec<pdpb::SplitId>, + right_derive: bool, + )
-> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSplit); + req.mut_splits().set_right_derive(right_derive); + let mut requests = Vec::with_capacity(ids.len()); + for (mut id, key) in ids.into_iter().zip(split_keys) { + let mut split = SplitRequest::default(); + split.set_split_key(key); + split.set_new_region_id(id.get_new_region_id()); + split.set_new_peer_ids(id.take_new_peer_ids()); + requests.push(split); + } + req.mut_splits().set_requests(requests.into()); + req + } + + pub fn must_split(region_id: u64, req: RaftCmdRequest, router: &mut TestRouter) { + let (msg, sub) = PeerMsg::admin_command(req); + router.send(region_id, msg).unwrap(); + block_on(sub.result()).unwrap(); + + // TODO: when persistent implementation is ready, we can use tablet index of + // the parent to check whether the split is done. Now, just sleep a second. + thread::sleep(Duration::from_secs(1)); + } + + /// Writes `key` => `b"v1"` into `CF_DEFAULT` of `region_id` and returns the + /// response of the write. + pub fn put(router: &mut TestRouter, region_id: u64, key: &[u8]) -> RaftCmdResponse { + let header = Box::new(router.new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, b"v1"); + router.simple_write(region_id, header, put).unwrap() + } + + // Split `region` at `propose_key`, check that `left_key` / `right_key` (when + // given) are only writable on their own side and that both regions meet at + // `split_key`, then return the resulting `(left, right)` regions. + pub fn split_region<'a>( + router: &'a mut TestRouter, + region: metapb::Region, + peer: metapb::Peer, + split_region_id: u64, + split_peer: metapb::Peer, + left_key: Option<&'a [u8]>, + right_key: Option<&'a [u8]>, + propose_key: &[u8], + split_key: &[u8], + right_derive: bool, + ) -> (metapb::Region, metapb::Region) { + let region_id = region.id; + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header() + .set_region_epoch(region.get_region_epoch().clone()); + req.mut_header().set_peer(peer); + + let mut split_id = pdpb::SplitId::new(); + split_id.new_region_id = split_region_id; +
split_id.new_peer_ids = vec![split_peer.id]; + let admin_req = new_batch_split_region_request( + vec![propose_key.to_vec()], + vec![split_id], + right_derive, + ); + req.mut_requests().clear(); + req.set_admin_request(admin_req); + + must_split(region_id, req, router); + + let (left, right) = if !right_derive { + ( + router.region_detail(region_id), + router.region_detail(split_region_id), + ) + } else { + ( + router.region_detail(split_region_id), + router.region_detail(region_id), + ) + }; + + if let Some(right_key) = right_key { + let resp = put(router, left.id, right_key); + assert!(resp.get_header().has_error(), "{:?}", resp); + let resp = put(router, right.id, right_key); + assert!(!resp.get_header().has_error(), "{:?}", resp); + } + if let Some(left_key) = left_key { + let resp = put(router, left.id, left_key); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let resp = put(router, right.id, left_key); + assert!(resp.get_header().has_error(), "{:?}", resp); + } + + assert_eq!(left.get_end_key(), split_key); + assert_eq!(right.get_start_key(), split_key); + assert_eq!(region.get_start_key(), left.get_start_key()); + assert_eq!(region.get_end_key(), right.get_end_key()); + + (left, right) + } + + // Split the region and refresh its buckets immediately afterwards. + // This simulates the case where the storage of the newly split peer is not + // yet initialized when the bucket refresh happens. + pub fn split_region_and_refresh_bucket( + router: &mut TestRouter, + region: metapb::Region, + peer: metapb::Peer, + split_region_id: u64, + split_peer: metapb::Peer, + propose_key: &[u8], + right_derive: bool, + ) { + let region_id = region.id; + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header() + .set_region_epoch(region.get_region_epoch().clone()); + req.mut_header().set_peer(peer); + + let mut split_id = pdpb::SplitId::new(); + split_id.new_region_id = split_region_id; + split_id.new_peer_ids = vec![split_peer.id]; + let
admin_req = new_batch_split_region_request( + vec![propose_key.to_vec()], + vec![split_id], + right_derive, + ); + req.mut_requests().clear(); + req.set_admin_request(admin_req); + + let (msg, sub) = PeerMsg::admin_command(req); + router.send(region_id, msg).unwrap(); + block_on(sub.result()).unwrap(); + + let meta = router + .must_query_debug_info(split_region_id, Duration::from_secs(1)) + .unwrap(); + let epoch = &meta.region_state.epoch; + let buckets = vec![Bucket { + keys: vec![b"1".to_vec(), b"2".to_vec()], + size: 100, + }]; + let mut region_epoch = kvproto::metapb::RegionEpoch::default(); + region_epoch.set_conf_ver(epoch.conf_ver); + region_epoch.set_version(epoch.version); + router.refresh_bucket(split_region_id, region_epoch, buckets); + } +} + +pub mod merge_helper { + use std::{thread, time::Duration}; + + use futures::executor::block_on; + use kvproto::{ + metapb, + raft_cmdpb::{AdminCmdType, AdminRequest, RaftCmdRequest}, + }; + use raftstore_v2::router::PeerMsg; + + use super::Cluster; + + pub fn merge_region( + cluster: &Cluster, + store_offset: usize, + source: metapb::Region, + source_peer: metapb::Peer, + target: metapb::Region, + check: bool, + ) -> metapb::Region { + let region_id = source.id; + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header() + .set_region_epoch(source.get_region_epoch().clone()); + req.mut_header().set_peer(source_peer); + + let mut admin_req = AdminRequest::default(); + admin_req.set_cmd_type(AdminCmdType::PrepareMerge); + admin_req.mut_prepare_merge().set_target(target.clone()); + req.set_admin_request(admin_req); + + let (msg, sub) = PeerMsg::admin_command(req); + cluster.routers[store_offset].send(region_id, msg).unwrap(); + // They may communicate about trimmed status. 
+ cluster.dispatch(region_id, vec![]); + let _ = block_on(sub.result()).unwrap(); + // We don't check the response because it needs to do a lot of checks async + // before actually proposing the command. + + // TODO: when persistent implementation is ready, we can use persisted state + // to check whether the merge is done. Now, just sleep a second. + thread::sleep(Duration::from_secs(1)); + + let mut new_target = cluster.routers[store_offset].region_detail(target.id); + if check { + for i in 1..=100 { + let r1 = new_target.get_start_key() == source.get_start_key() + && new_target.get_end_key() == target.get_end_key(); + let r2 = new_target.get_start_key() == target.get_start_key() + && new_target.get_end_key() == source.get_end_key(); + if r1 || r2 { + break; + } else if i == 100 { + panic!( + "still not merged after 5s: {:?} + {:?} != {:?}", + source, target, new_target + ); + } else { + thread::sleep(Duration::from_millis(50)); + new_target = cluster.routers[store_offset].region_detail(target.id); + } + } + } + new_target + } +} + +pub mod life_helper { + use std::assert_matches::assert_matches; + + use engine_traits::RaftEngine; + use kvproto::raft_serverpb::{ExtraMessageType, PeerState}; + + use super::*; + + /// Polls the router for up to 3 seconds and returns as soon as the region's + /// mailbox is disconnected or the peer answering the debug query has an id + /// other than `peer_id`; panics if that never happens within the timeout. + pub fn assert_peer_not_exist(region_id: u64, peer_id: u64, router: &TestRouter) { + let timer = Instant::now(); + loop { + let (ch, sub) = DebugInfoChannel::pair(); + let msg = PeerMsg::QueryDebugInfo(ch); + match router.send(region_id, msg) { + Err(TrySendError::Disconnected(_)) => return, + Ok(()) => { + if let Some(m) = block_on(sub.result()) { + if m.raft_status.id != peer_id { + return; + } + } + } + Err(_) => (), + } + if timer.elapsed() < Duration::from_secs(3) { + thread::sleep(Duration::from_millis(10)); + } else { + panic!("peer of {} still exists", region_id); + } + } + } + + // TODO: make raft engine support more suitable way to verify range is empty. + /// Verify all states in raft engine are cleared.
+ pub fn assert_tombstone(raft_engine: &impl RaftEngine, region_id: u64, peer: &metapb::Peer) { + let mut buf = vec![]; + raft_engine.get_all_entries_to(region_id, &mut buf).unwrap(); + assert!(buf.is_empty(), "{:?}", buf); + assert_matches!(raft_engine.get_raft_state(region_id), Ok(None)); + assert_matches!(raft_engine.get_apply_state(region_id, u64::MAX), Ok(None)); + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + assert_matches!(region_state.get_state(), PeerState::Tombstone); + assert!( + region_state.get_region().get_peers().contains(peer), + "{:?}", + region_state + ); + } + + #[track_caller] + pub fn assert_valid_report(report: &RaftMessage, region_id: u64, peer_id: u64) { + assert_eq!( + report.get_extra_msg().get_type(), + ExtraMessageType::MsgGcPeerResponse + ); + assert_eq!(report.get_region_id(), region_id); + assert_eq!(report.get_from_peer().get_id(), peer_id); + } + + #[track_caller] + pub fn assert_tombstone_msg(msg: &RaftMessage, region_id: u64, peer_id: u64) { + assert_eq!(msg.get_region_id(), region_id); + assert_eq!(msg.get_to_peer().get_id(), peer_id); + assert!(msg.get_is_tombstone()); + } +} diff --git a/components/raftstore-v2/tests/integrations/mod.rs b/components/raftstore-v2/tests/integrations/mod.rs new file mode 100644 index 00000000000..a4cdfda9179 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/mod.rs @@ -0,0 +1,21 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +#![feature(test)] +#![feature(assert_matches)] +#![feature(custom_test_frameworks)] +#![test_runner(test_util::run_tests)] + +// TODO: test conflict control in integration tests after split is supported. 
+ +#[allow(dead_code)] +mod cluster; +mod test_basic_write; +mod test_conf_change; +mod test_life; +mod test_merge; +mod test_pd_heartbeat; +mod test_read; +mod test_split; +mod test_status; +mod test_trace_apply; +mod test_transfer_leader; diff --git a/components/raftstore-v2/tests/integrations/test_basic_write.rs b/components/raftstore-v2/tests/integrations/test_basic_write.rs new file mode 100644 index 00000000000..cb8d71840cf --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_basic_write.rs @@ -0,0 +1,131 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{assert_matches::assert_matches, time::Duration}; + +use engine_traits::{Peekable, CF_DEFAULT}; +use futures::executor::block_on; +use kvproto::raft_serverpb::RaftMessage; +use raftstore::store::{INIT_EPOCH_CONF_VER, INIT_EPOCH_VER}; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; +use tikv_util::store::new_peer; + +use crate::cluster::{check_skip_wal, Cluster}; + +/// Test basic write flow. +#[test] +fn test_basic_write() { + let cluster = Cluster::default(); + let router = &cluster.routers[0]; + let header = Box::new(router.new_request_for(2).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); + + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + + // Good proposal should be committed. + let (msg, mut sub) = PeerMsg::simple_write(header.clone(), put.clone().encode()); + router.send(2, msg).unwrap(); + assert!(block_on(sub.wait_proposed())); + assert!(block_on(sub.wait_committed())); + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + // Store id should be checked. 
+ let mut invalid_header = header.clone(); + invalid_header.set_peer(new_peer(3, 3)); + let resp = router.simple_write(2, invalid_header, put.clone()).unwrap(); + assert!( + resp.get_header().get_error().has_store_not_match(), + "{:?}", + resp + ); + + // Peer id should be checked. + invalid_header = header.clone(); + invalid_header.set_peer(new_peer(1, 1)); + let resp = router.simple_write(2, invalid_header, put.clone()).unwrap(); + assert!(resp.get_header().has_error(), "{:?}", resp); + + // Epoch should be checked. + invalid_header = header.clone(); + invalid_header + .mut_region_epoch() + .set_version(INIT_EPOCH_VER - 1); + let resp = router.simple_write(2, invalid_header, put.clone()).unwrap(); + assert!( + resp.get_header().get_error().has_epoch_not_match(), + "{:?}", + resp + ); + + // Term should be checked if set. + invalid_header = header.clone(); + invalid_header.set_term(1); + let resp = router.simple_write(2, invalid_header, put.clone()).unwrap(); + assert!( + resp.get_header().get_error().has_stale_command(), + "{:?}", + resp + ); + + // Too large message can cause regression and should be rejected. + let mut invalid_put = SimpleWriteEncoder::with_capacity(9 * 1024 * 1024); + invalid_put.put(CF_DEFAULT, b"key", &vec![0; 8 * 1024 * 1024]); + let resp = router.simple_write(2, header.clone(), invalid_put).unwrap(); + assert!( + resp.get_header().get_error().has_raft_entry_too_large(), + "{:?}", + resp + ); + + // Make it step down and follower should reject write. 
+ let mut msg = Box::<RaftMessage>::default(); + msg.set_region_id(2); + msg.set_to_peer(new_peer(1, 3)); + msg.mut_region_epoch().set_conf_ver(INIT_EPOCH_CONF_VER); + msg.set_from_peer(new_peer(2, 4)); + let raft_message = msg.mut_message(); + raft_message.set_msg_type(raft::prelude::MessageType::MsgHeartbeat); + raft_message.set_from(4); + raft_message.set_term(8); + router.send_raft_message(msg).unwrap(); + let resp = router.simple_write(2, header, put).unwrap(); + assert!(resp.get_header().get_error().has_not_leader(), "{:?}", resp); +} + +/// Test that a put becomes visible in a stale snapshot and that a following +/// delete removes the key again. +#[test] +fn test_put_delete() { + let mut cluster = Cluster::default(); + let router = &mut cluster.routers[0]; + let header = Box::new(router.new_request_for(2).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); + + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + + let snap = router.stale_snapshot(2); + assert!(snap.get_value(b"key").unwrap().is_none()); + let (msg, mut sub) = PeerMsg::simple_write(header.clone(), put.encode()); + router.send(2, msg).unwrap(); + assert!(block_on(sub.wait_proposed())); + assert!(block_on(sub.wait_committed())); + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let snap = router.stale_snapshot(2); + assert_eq!(snap.get_value(b"key").unwrap().unwrap(), b"value"); + + let mut delete = SimpleWriteEncoder::with_capacity(64); + delete.delete(CF_DEFAULT, b"key"); + let (msg, mut sub) = PeerMsg::simple_write(header, delete.encode()); + router.send(2, msg).unwrap(); + assert!(block_on(sub.wait_proposed())); + assert!(block_on(sub.wait_committed())); + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let snap = router.stale_snapshot(2); + assert_matches!(snap.get_value(b"key"), Ok(None)); + + // Check if WAL is skipped for basic writes.
+ let mut cached = cluster.node(0).tablet_registry().get(2).unwrap(); + check_skip_wal(cached.latest().unwrap().as_inner().path()); +} diff --git a/components/raftstore-v2/tests/integrations/test_conf_change.rs b/components/raftstore-v2/tests/integrations/test_conf_change.rs new file mode 100644 index 00000000000..c1c7861fd54 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_conf_change.rs @@ -0,0 +1,234 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{self, time::Duration}; + +use engine_traits::{Peekable, RaftEngineReadOnly, CF_DEFAULT}; +use futures::executor::block_on; +use kvproto::{ + raft_cmdpb::{AdminCmdType, RaftCmdRequest}, + raft_serverpb::{PeerState, RaftMessage}, +}; +use raft::prelude::{ConfChangeType, MessageType}; +use raftstore_v2::{ + router::{PeerMsg, PeerTick}, + SimpleWriteEncoder, +}; +use tikv_util::store::{new_learner_peer, new_peer}; + +use crate::cluster::{check_skip_wal, Cluster}; + +#[test] +fn test_simple_change() { + let mut cluster = Cluster::with_node_count(2, None); + let (region_id, peer_id, offset_id) = (2, 10, 1); + + // 1. add learner on store-2 + add_learner(&cluster, offset_id, region_id, peer_id); + let meta = cluster.routers[0] + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + let match_index = meta.raft_apply.applied_index; + + // 2. write one kv after snapshot + let (key, val) = (b"key", b"value"); + write_kv(&cluster, region_id, key, val); + let meta = cluster.routers[1] + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + // The learner's truncated index must be equal to the leader's applied index, + // and the learner must be able to read the newly written kv. + assert_eq!(match_index, meta.raft_apply.truncated_state.index); + assert!(meta.raft_apply.applied_index >= match_index); + let snap = cluster.routers[offset_id].stale_snapshot(region_id); + assert_eq!(snap.get_value(key).unwrap().unwrap(), val); + // 3.
remove peer from store-2 + remove_peer(&cluster, offset_id, region_id, peer_id); + + // To make sure no stale state is left behind after a peer is destroyed, + // create and remove a peer on the same store several times. + let repeat = 3; + for i in 1..repeat { + add_learner(&cluster, offset_id, region_id, peer_id + i); + write_kv(&cluster, region_id, key, val); + remove_peer(&cluster, offset_id, region_id, peer_id + i); + } + + add_learner(&cluster, offset_id, region_id, peer_id + repeat); + write_kv(&cluster, region_id, key, val); + let snap = cluster.routers[offset_id].stale_snapshot(region_id); + assert_eq!(snap.get_value(key).unwrap().unwrap(), val); + + // TODO: check if the peer is removed once life trace is implemented or + // snapshot is implemented. + // Check if WAL is skipped for admin command. + let mut cached = cluster.node(0).tablet_registry().get(region_id).unwrap(); + check_skip_wal(cached.latest().unwrap().as_inner().path()); +} + +/// Test if a peer can be destroyed by conf change if logs after conf change are +/// also replicated. +#[test] +fn test_remove_by_conf_change() { + let cluster = Cluster::with_node_count(2, None); + let (region_id, peer_id, offset_id) = (2, 10, 1); + let mut req = add_learner(&cluster, offset_id, region_id, peer_id); + + // write one kv to make flow control replicated.
+ let (key, val) = (b"key", b"value"); + write_kv(&cluster, region_id, key, val); + + let new_conf_ver = req.get_header().get_region_epoch().get_conf_ver() + 1; + req.mut_header() + .mut_region_epoch() + .set_conf_ver(new_conf_ver); + req.mut_admin_request() + .mut_change_peer() + .set_change_type(ConfChangeType::RemoveNode); + let (admin_msg, admin_sub) = PeerMsg::admin_command(req.clone()); + // write one kv after removal + let (key, val) = (b"key1", b"value"); + let header = Box::new(cluster.routers[0].new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, val); + let (msg, sub) = PeerMsg::simple_write(header, put.encode()); + // Send them at the same time so they will be all sent to learner. + cluster.routers[0].send(region_id, admin_msg).unwrap(); + cluster.routers[0].send(region_id, msg).unwrap(); + let resp = block_on(admin_sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + // Dispatch messages so the learner will receive conf remove and write at the + // same time. + cluster.dispatch(region_id, vec![]); + cluster.routers[1].wait_flush(region_id, Duration::from_millis(300)); + // Wait for apply. 
+ std::thread::sleep(Duration::from_millis(100)); + let raft_engine = &cluster.node(1).running_state().unwrap().raft_engine; + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_state(), PeerState::Tombstone); + assert_eq!(raft_engine.get_raft_state(region_id).unwrap(), None); +} + +fn add_learner( + cluster: &Cluster, + offset_id: usize, + region_id: u64, + peer_id: u64, +) -> RaftCmdRequest { + let store_id = cluster.node(offset_id).id(); + let mut req = cluster.routers[0].new_request_for(region_id); + let admin_req = req.mut_admin_request(); + admin_req.set_cmd_type(AdminCmdType::ChangePeer); + admin_req + .mut_change_peer() + .set_change_type(ConfChangeType::AddLearnerNode); + let new_peer = new_learner_peer(store_id, peer_id); + admin_req.mut_change_peer().set_peer(new_peer.clone()); + let resp = cluster.routers[0] + .admin_command(region_id, req.clone()) + .unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let epoch = req.get_header().get_region_epoch(); + let new_conf_ver = epoch.get_conf_ver() + 1; + let leader_peer = req.get_header().get_peer().clone(); + let meta = cluster.routers[0] + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.region_state.epoch.version, epoch.get_version()); + assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); + assert_eq!(meta.region_state.peers, vec![leader_peer, new_peer]); + + // heartbeat will create a learner. + cluster.dispatch(region_id, vec![]); + cluster.routers[0] + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + let meta = cluster.routers[offset_id] + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.raft_status.id, peer_id, "{:?}", meta); + + // Wait some time so snapshot can be generated. 
+ std::thread::sleep(Duration::from_millis(100)); + cluster.dispatch(region_id, vec![]); + req +} + +fn write_kv(cluster: &Cluster, region_id: u64, key: &[u8], val: &[u8]) { + let header = Box::new(cluster.routers[0].new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, val); + let (msg, _) = PeerMsg::simple_write(header, put.encode()); + cluster.routers[0].send(region_id, msg).unwrap(); + std::thread::sleep(Duration::from_millis(1000)); + cluster.dispatch(region_id, vec![]); +} + +fn remove_peer(cluster: &Cluster, offset_id: usize, region_id: u64, peer_id: u64) { + let store_id = cluster.node(offset_id).id(); + let mut req = cluster.routers[0].new_request_for(region_id); + let admin_req = req.mut_admin_request(); + admin_req.set_cmd_type(AdminCmdType::ChangePeer); + admin_req + .mut_change_peer() + .set_change_type(ConfChangeType::RemoveNode); + admin_req + .mut_change_peer() + .set_peer(new_learner_peer(store_id, peer_id)); + let resp = cluster.routers[0] + .admin_command(region_id, req.clone()) + .unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + cluster.routers[offset_id] + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + cluster.dispatch(region_id, vec![]); + std::thread::sleep(Duration::from_millis(100)); + + let raft_engine = &cluster.node(offset_id).running_state().unwrap().raft_engine; + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_state(), PeerState::Tombstone); + assert_eq!(raft_engine.get_raft_state(region_id).unwrap(), None); +} + +/// The peer should be able to respond an unknown sender, otherwise the +/// liveness of configuration change can't be guaranteed. 
+#[test] +fn test_unknown_peer() { + let cluster = Cluster::with_node_count(1, None); + + let router = &cluster.routers[0]; + let header = router.new_request_for(2).take_header(); + + // Create a fake message from a peer the region does not know about and see + // whether it gets a response. + let from_peer = new_peer(10, 10); + let mut msg = Box::<RaftMessage>::default(); + msg.set_region_id(2); + msg.set_to_peer(header.get_peer().clone()); + msg.set_region_epoch(header.get_region_epoch().clone()); + msg.set_from_peer(from_peer.clone()); + let raft_message = msg.mut_message(); + raft_message.set_msg_type(MessageType::MsgHeartbeat); + raft_message.set_from(10); + raft_message.set_term(10); + + router.send_raft_message(msg).unwrap(); + router.wait_flush(2, Duration::from_secs(3)); + // If peer cache is updated correctly, it should be able to respond. + let msg = cluster.receiver(0).try_recv().unwrap(); + assert_eq!(*msg.get_to_peer(), from_peer); + assert_eq!(msg.get_from_peer(), header.get_peer()); + assert_eq!( + msg.get_message().get_msg_type(), + MessageType::MsgHeartbeatResponse + ); +} diff --git a/components/raftstore-v2/tests/integrations/test_life.rs b/components/raftstore-v2/tests/integrations/test_life.rs new file mode 100644 index 00000000000..373763a53ef --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_life.rs @@ -0,0 +1,334 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0.
+ +use std::time::Duration; + +use engine_traits::{RaftEngineReadOnly, CF_DEFAULT}; +use futures::executor::block_on; +use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::RaftMessage}; +use raft::prelude::{ConfChangeType, MessageType}; +use raftstore_v2::{ + router::{PeerMsg, PeerTick}, + SimpleWriteEncoder, +}; +use tikv_util::store::{new_learner_peer, new_peer}; + +use crate::cluster::{ + life_helper::{ + assert_peer_not_exist, assert_tombstone, assert_tombstone_msg, assert_valid_report, + }, + Cluster, +}; + +/// Test that a peer can be created by a general raft message and destroyed by a +/// tombstone message. +#[test] +fn test_life_by_message() { + let mut cluster = Cluster::default(); + let router = &cluster.routers[0]; + let test_region_id = 4; + let test_peer_id = 5; + let test_leader_id = 6; + assert_peer_not_exist(test_region_id, test_peer_id, router); + + // Build a correct message. + let mut msg = Box::<RaftMessage>::default(); + msg.set_region_id(test_region_id); + msg.set_to_peer(new_peer(1, test_peer_id)); + msg.mut_region_epoch().set_conf_ver(1); + msg.set_from_peer(new_peer(2, test_leader_id)); + let raft_message = msg.mut_message(); + raft_message.set_msg_type(MessageType::MsgHeartbeat); + raft_message.set_from(test_leader_id); + raft_message.set_term(5); + + // Sends a copy of `msg` altered by `f` and checks that no peer gets created. + let assert_wrong = |f: &dyn Fn(&mut RaftMessage)| { + let mut wrong_msg = msg.clone(); + f(&mut wrong_msg); + router.send_raft_message(wrong_msg).unwrap(); + assert_peer_not_exist(test_region_id, test_peer_id, router); + }; + + // Check mismatch store id. + assert_wrong(&|msg| msg.mut_to_peer().set_store_id(4)); + + // Check missing region epoch. + assert_wrong(&|msg| { + msg.take_region_epoch(); + }); + + // Correct message will create a peer, but the peer will not be initialized.
+ router.send_raft_message(msg.clone()).unwrap(); + let timeout = Duration::from_secs(3); + let meta = router + .must_query_debug_info(test_region_id, timeout) + .unwrap(); + assert_eq!(meta.region_state.id, test_region_id); + assert_eq!(meta.raft_status.id, test_peer_id); + assert_eq!(meta.region_state.tablet_index, 0); + // But leader should be set. + assert_eq!(meta.raft_status.soft_state.leader_id, test_leader_id); + + // The peer should survive restart. + cluster.restart(0); + let router = &cluster.routers[0]; + let meta = router + .must_query_debug_info(test_region_id, timeout) + .unwrap(); + assert_eq!(meta.raft_status.id, test_peer_id); + let raft_engine = &cluster.node(0).running_state().unwrap().raft_engine; + raft_engine.get_raft_state(test_region_id).unwrap().unwrap(); + raft_engine + .get_apply_state(test_region_id, 0) + .unwrap() + .unwrap(); + + // The peer should be destroyed by tombstone message. + let mut tombstone_msg = msg.clone(); + tombstone_msg.set_is_tombstone(true); + router.send_raft_message(tombstone_msg).unwrap(); + assert_peer_not_exist(test_region_id, test_peer_id, router); + assert_tombstone(raft_engine, test_region_id, &new_peer(1, test_peer_id)); + + // Restart should not recreate tombstoned peer. 
+ cluster.restart(0); + let router = &cluster.routers[0]; + assert_peer_not_exist(test_region_id, test_peer_id, router); + let raft_engine = &cluster.node(0).running_state().unwrap().raft_engine; + assert_tombstone(raft_engine, test_region_id, &new_peer(1, test_peer_id)); +} + +/// Test that a message for a smaller peer id is ignored while a message for a +/// larger peer id destroys the existing peer and creates the new one. +#[test] +fn test_destroy_by_larger_id() { + let mut cluster = Cluster::default(); + let router = &cluster.routers[0]; + let test_region_id = 4; + let test_peer_id = 6; + let init_term = 5; + let mut msg = Box::<RaftMessage>::default(); + msg.set_region_id(test_region_id); + msg.set_to_peer(new_peer(1, test_peer_id)); + msg.mut_region_epoch().set_conf_ver(1); + msg.set_from_peer(new_peer(2, 8)); + let raft_message = msg.mut_message(); + raft_message.set_msg_type(MessageType::MsgHeartbeat); + // NOTE(review): `from` is 6 here, which equals `test_peer_id` rather than the + // `from_peer` id 8 set above; confirm this mismatch is intentional. + raft_message.set_from(6); + raft_message.set_term(init_term); + // Create the peer. + router.send_raft_message(msg.clone()).unwrap(); + // There must be heartbeat response. + let hb = cluster + .receiver(0) + .recv_timeout(Duration::from_millis(300)) + .unwrap(); + assert_eq!( + hb.get_message().get_msg_type(), + MessageType::MsgHeartbeatResponse + ); + + let timeout = Duration::from_secs(3); + let meta = router + .must_query_debug_info(test_region_id, timeout) + .unwrap(); + assert_eq!(meta.raft_status.id, test_peer_id); + + // Smaller ID should be ignored. + let mut smaller_id_msg = msg; + smaller_id_msg.set_to_peer(new_peer(1, test_peer_id - 1)); + smaller_id_msg.mut_message().set_term(init_term + 1); + router.send_raft_message(smaller_id_msg.clone()).unwrap(); + let meta = router + .must_query_debug_info(test_region_id, timeout) + .unwrap(); + assert_eq!(meta.raft_status.id, test_peer_id); + assert_eq!(meta.raft_status.hard_state.term, init_term); + cluster + .receiver(0) + .recv_timeout(Duration::from_millis(300)) + .unwrap_err(); + + // Smaller ID tombstone message should trigger report.
+ let mut smaller_id_tombstone_msg = smaller_id_msg.clone(); + smaller_id_tombstone_msg.set_is_tombstone(true); + router.send_raft_message(smaller_id_tombstone_msg).unwrap(); + let report = cluster + .receiver(0) + .recv_timeout(Duration::from_millis(300)) + .unwrap(); + assert_valid_report(&report, test_region_id, test_peer_id - 1); + + // Larger ID should trigger destroy. + let mut larger_id_msg = smaller_id_msg; + larger_id_msg.set_to_peer(new_peer(1, test_peer_id + 1)); + router.send_raft_message(larger_id_msg).unwrap(); + assert_peer_not_exist(test_region_id, test_peer_id, router); + let meta = router + .must_query_debug_info(test_region_id, timeout) + .unwrap(); + assert_eq!(meta.raft_status.id, test_peer_id + 1); + assert_eq!(meta.raft_status.hard_state.term, init_term + 1); + + // New peer should survive restart. + cluster.restart(0); + let router = &cluster.routers[0]; + let meta = router + .must_query_debug_info(test_region_id, timeout) + .unwrap(); + assert_eq!(meta.raft_status.id, test_peer_id + 1); + assert_eq!(meta.raft_status.hard_state.term, init_term + 1); +} + +/// Test that a tombstone message destroys the peer, that a later normal message +/// does not recreate it, and that a repeated tombstone message triggers a report. +#[test] +fn test_gc_peer_request() { + let cluster = Cluster::default(); + let router = &cluster.routers[0]; + let test_region_id = 4; + let test_peer_id = 5; + let test_leader_id = 6; + + let mut msg = Box::<RaftMessage>::default(); + msg.set_region_id(test_region_id); + msg.set_to_peer(new_peer(1, test_peer_id)); + msg.mut_region_epoch().set_conf_ver(1); + msg.set_from_peer(new_peer(2, test_leader_id)); + let raft_message = msg.mut_message(); + raft_message.set_msg_type(MessageType::MsgHeartbeat); + raft_message.set_from(test_leader_id); + raft_message.set_term(5); + + // Tombstone message should create the peer and then destroy it.
+ let mut tombstone_msg = msg.clone(); + tombstone_msg.set_is_tombstone(true); + router.send_raft_message(tombstone_msg.clone()).unwrap(); + cluster.routers[0].wait_flush(test_region_id, Duration::from_millis(300)); + assert_peer_not_exist(test_region_id, test_peer_id, router); + // Resend a normal message will not create the peer. + router.send_raft_message(msg).unwrap(); + assert_peer_not_exist(test_region_id, test_peer_id, router); + cluster + .receiver(0) + .recv_timeout(Duration::from_millis(300)) + .unwrap_err(); + // Resend tombstone message should trigger report. + router.send_raft_message(tombstone_msg).unwrap(); + assert_peer_not_exist(test_region_id, test_peer_id, router); + let report = cluster + .receiver(0) + .recv_timeout(Duration::from_millis(300)) + .unwrap(); + assert_valid_report(&report, test_region_id, test_peer_id); +} + +#[test] +fn test_gc_peer_response() { + let cluster = Cluster::with_node_count(2, None); + let region_id = 2; + let mut req = cluster.routers[0].new_request_for(region_id); + let admin_req = req.mut_admin_request(); + admin_req.set_cmd_type(AdminCmdType::ChangePeer); + admin_req + .mut_change_peer() + .set_change_type(ConfChangeType::AddLearnerNode); + let store_id = cluster.node(1).id(); + let new_peer = new_learner_peer(store_id, 10); + admin_req.mut_change_peer().set_peer(new_peer.clone()); + let resp = cluster.routers[0].admin_command(2, req.clone()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let raft_engine = &cluster.node(0).running_state().unwrap().raft_engine; + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + assert!(region_state.get_removed_records().is_empty()); + + let new_conf_ver = req.get_header().get_region_epoch().get_conf_ver() + 1; + req.mut_header() + .mut_region_epoch() + .set_conf_ver(new_conf_ver); + req.mut_admin_request() + .mut_change_peer() + .set_change_type(ConfChangeType::RemoveNode); + let resp = cluster.routers[0] + 
.admin_command(region_id, req.clone()) + .unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + cluster.routers[0].wait_flush(region_id, Duration::from_millis(300)); + // Drain all existing messages. + while cluster.receiver(0).try_recv().is_ok() {} + + let mut msg = Box::<RaftMessage>::default(); + msg.set_region_id(region_id); + msg.set_to_peer(req.get_header().get_peer().clone()); + msg.set_from_peer(new_peer); + let receiver = &cluster.receiver(0); + for ty in &[MessageType::MsgRequestVote, MessageType::MsgRequestPreVote] { + msg.mut_message().set_msg_type(*ty); + cluster.routers[0].send_raft_message(msg.clone()).unwrap(); + let tombstone_msg = match receiver.recv_timeout(Duration::from_millis(300)) { + Ok(msg) => msg, + Err(e) => panic!("failed to receive tombstone message {:?}: {:?}", ty, e), + }; + assert_tombstone_msg(&tombstone_msg, region_id, 10); + } + // Non-vote message should not trigger tombstone. + msg.mut_message().set_msg_type(MessageType::MsgHeartbeat); + cluster.routers[0].send_raft_message(msg).unwrap(); + cluster + .receiver(0) + .recv_timeout(Duration::from_millis(300)) + .unwrap_err(); + + // GcTick should also trigger tombstone. + cluster.routers[0] + .send(region_id, PeerMsg::Tick(PeerTick::GcPeer)) + .unwrap(); + let tombstone_msg = cluster + .receiver(0) + .recv_timeout(Duration::from_millis(300)) + .unwrap(); + assert_tombstone_msg(&tombstone_msg, region_id, 10); + + // First message to create the peer and destroy. + cluster.routers[1] + .send_raft_message(Box::new(tombstone_msg.clone())) + .unwrap(); + cluster.routers[1].wait_flush(region_id, Duration::from_millis(300)); + cluster + .receiver(1) + .recv_timeout(Duration::from_millis(300)) + .unwrap_err(); + // Send message should trigger tombstone report.
+ cluster.routers[1] + .send_raft_message(Box::new(tombstone_msg)) + .unwrap(); + let report = cluster + .receiver(1) + .recv_timeout(Duration::from_millis(300)) + .unwrap(); + assert_valid_report(&report, region_id, 10); + cluster.routers[0] + .send_raft_message(Box::new(report)) + .unwrap(); + let raft_engine = &cluster.node(0).running_state().unwrap().raft_engine; + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_removed_records().len(), 1); + // Tick should flush records gc. + cluster.routers[0] + .send(region_id, PeerMsg::Tick(PeerTick::GcPeer)) + .unwrap(); + // Trigger a write to make sure records gc is finished. + let header = Box::new(cluster.routers[0].new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); + let (msg, sub) = PeerMsg::simple_write(header, put.encode()); + cluster.routers[0].send(region_id, msg).unwrap(); + block_on(sub.result()).unwrap(); + cluster.routers[0].wait_flush(region_id, Duration::from_millis(300)); + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + assert!(region_state.get_removed_records().is_empty()); +} diff --git a/components/raftstore-v2/tests/integrations/test_merge.rs b/components/raftstore-v2/tests/integrations/test_merge.rs new file mode 100644 index 00000000000..7d9dbef720e --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_merge.rs @@ -0,0 +1,112 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::time::Duration; + +use engine_traits::{Peekable, RaftEngineReadOnly}; +use kvproto::metapb::{Peer, Region}; +use raftstore::store::RAFT_INIT_LOG_INDEX; +use tikv_util::store::new_peer; + +use crate::cluster::{merge_helper::merge_region, split_helper::split_region, Cluster, TestRouter}; + +#[test] +fn test_merge() { + let mut cluster = Cluster::default(); + let store_id = cluster.node(0).id(); + let raft_engine = cluster.node(0).running_state().unwrap().raft_engine.clone(); + let router = &mut cluster.routers[0]; + + let do_split = + |r: &mut TestRouter, region: Region, peer: &Peer, v: u64| -> (Region, Region, Peer) { + let rid = region.get_id(); + let old_region_state = raft_engine + .get_region_state(rid, u64::MAX) + .unwrap() + .unwrap(); + let new_peer = new_peer(store_id, peer.get_id() + 1); + let (lhs, rhs) = split_region( + r, + region, + peer.clone(), + rid + 1, + new_peer.clone(), + Some(format!("k{}{}", rid, v).as_bytes()), + Some(format!("k{}{}", rid + 1, v).as_bytes()), + format!("k{}", rid + 1).as_bytes(), + format!("k{}", rid + 1).as_bytes(), + false, + ); + let region_state = raft_engine + .get_region_state(rid, u64::MAX) + .unwrap() + .unwrap(); + assert!(region_state.get_tablet_index() > old_region_state.get_tablet_index()); + assert_eq!( + region_state.get_region().get_region_epoch().get_version(), + old_region_state + .get_region() + .get_region_epoch() + .get_version() + + 1, + ); + let region_state = raft_engine + .get_region_state(rid + 1, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + (lhs, rhs, new_peer) + }; + + let region_1 = router.region_detail(2); + let peer_1 = region_1.get_peers()[0].clone(); + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + + // Split into 6. 
+ let (region_1, region_2, peer_2) = do_split(router, region_1, &peer_1, 1); + let (region_2, region_3, peer_3) = do_split(router, region_2, &peer_2, 2); + let (region_3, region_4, peer_4) = do_split(router, region_3, &peer_3, 3); + let (region_4, region_5, peer_5) = do_split(router, region_4, &peer_4, 4); + let (region_5, region_6, peer_6) = do_split(router, region_5, &peer_5, 5); + drop(raft_engine); + // The last region version is smaller. + for (i, v) in [1, 2, 3, 4, 5, 5].iter().enumerate() { + let rid = region_1.get_id() + i as u64; + let snapshot = router.stale_snapshot(rid); + let key = format!("k{rid}{v}"); + assert!( + snapshot.get_value(key.as_bytes()).unwrap().is_some(), + "{} {:?}", + rid, + key + ); + } + + let region_2 = merge_region(&cluster, 0, region_1.clone(), peer_1, region_2, true); + { + let snapshot = cluster.routers[0].stale_snapshot(region_2.get_id()); + let key = format!("k{}1", region_1.get_id()); + assert!(snapshot.get_value(key.as_bytes()).unwrap().is_some()); + } + let region_5 = merge_region(&cluster, 0, region_6.clone(), peer_6, region_5, true); + { + let snapshot = cluster.routers[0].stale_snapshot(region_5.get_id()); + let key = format!("k{}5", region_6.get_id()); + assert!(snapshot.get_value(key.as_bytes()).unwrap().is_some()); + } + let region_3 = merge_region(&cluster, 0, region_2, peer_2, region_3, true); + let region_4 = merge_region(&cluster, 0, region_3, peer_3, region_4, true); + let region_5 = merge_region(&cluster, 0, region_4, peer_4, region_5, true); + + cluster.restart(0); + let snapshot = cluster.routers[0].stale_snapshot(region_5.get_id()); + for (i, v) in [1, 2, 3, 4, 5, 5].iter().enumerate() { + let rid = region_1.get_id() + i as u64; + let key = format!("k{rid}{v}"); + assert!( + snapshot.get_value(key.as_bytes()).unwrap().is_some(), + "{} {:?}", + rid, + key + ); + } +} diff --git a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs 
new file mode 100644 index 00000000000..679183735b6 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs @@ -0,0 +1,217 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::time::Duration; + +use engine_traits::{MiscExt, CF_DEFAULT}; +use futures::executor::block_on; +use kvproto::raft_cmdpb::{RaftCmdRequest, StatusCmdType}; +use pd_client::PdClient; +use raftstore::coprocessor::Config as CopConfig; +use raftstore_v2::{ + router::{PeerMsg, PeerTick, StoreMsg, StoreTick}, + SimpleWriteEncoder, +}; +use tikv_util::{config::ReadableSize, store::new_peer}; + +use crate::cluster::{v2_default_config, Cluster}; + +#[test] +fn test_region_heartbeat() { + let region_id = 2; + let cluster = Cluster::with_node_count(1, None); + let router = &cluster.routers[0]; + + // When there is only one peer, it should campaign immediately. + let mut req = RaftCmdRequest::default(); + req.mut_header().set_peer(new_peer(1, 3)); + req.mut_status_request() + .set_cmd_type(StatusCmdType::RegionLeader); + let res = router.query(region_id, req.clone()).unwrap(); + let status_resp = res.response().unwrap().get_status_response(); + assert_eq!( + *status_resp.get_region_leader().get_leader(), + new_peer(1, 3) + ); + + for _ in 0..5 { + let resp = block_on( + cluster + .node(0) + .pd_client() + .get_region_leader_by_id(region_id), + ) + .unwrap(); + if let Some((region, peer)) = resp { + assert_eq!(region.get_id(), region_id); + assert_eq!(peer.get_id(), 3); + assert_eq!(peer.get_store_id(), 1); + return; + } + std::thread::sleep(std::time::Duration::from_millis(50)); + } + panic!("failed to get region leader"); +} + +#[test] +fn test_store_heartbeat() { + let region_id = 2; + let cluster = Cluster::with_node_count(1, None); + let store_id = cluster.node(0).id(); + let router = &cluster.routers[0]; + // load data to split bucket. 
+ let header = Box::new(router.new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); + let data = put.encode(); + let write_bytes = data.data_size(); + let (msg, sub) = PeerMsg::simple_write(header, data); + router.send(region_id, msg).unwrap(); + let _resp = block_on(sub.result()).unwrap(); + + // report store heartbeat to pd. + std::thread::sleep(std::time::Duration::from_millis(50)); + router + .store_router() + .send_control(StoreMsg::Tick(StoreTick::PdStoreHeartbeat)) + .unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + + let stats = block_on(cluster.node(0).pd_client().get_store_stats_async(store_id)).unwrap(); + if stats.get_start_time() > 0 { + assert_ne!(stats.get_capacity(), 0); + assert_ne!(stats.get_used_size(), 0); + assert_eq!(stats.get_keys_written(), 1); + assert!(stats.get_bytes_written() > write_bytes.try_into().unwrap()); + } +} + +#[test] +fn test_report_buckets() { + let region_id = 2; + let mut cop_cfg = CopConfig::default(); + cop_cfg.enable_region_bucket = Some(true); + cop_cfg.region_bucket_size = ReadableSize::kb(1); + let mut config = v2_default_config(); + config.region_split_check_diff = Some(ReadableSize::kb(1)); + let cluster = Cluster::with_cop_cfg(Some(config), cop_cfg); + let store_id = cluster.node(0).id(); + let router = &cluster.routers[0]; + + // When there is only one peer, it should campaign immediately. + let mut req = RaftCmdRequest::default(); + req.mut_header().set_peer(new_peer(store_id, 3)); + req.mut_status_request() + .set_cmd_type(StatusCmdType::RegionLeader); + let res = router.query(region_id, req.clone()).unwrap(); + let status_resp = res.response().unwrap().get_status_response(); + assert_eq!( + *status_resp.get_region_leader().get_leader(), + new_peer(store_id, 3) + ); + router.wait_applied_to_current_term(region_id, Duration::from_secs(3)); + + // load data to split bucket. 
+ let mut suffix = String::from(""); + for _ in 0..200 { + suffix.push_str("fake "); + } + + let repeat: u64 = 10; + let bytes = write_keys(&cluster, region_id, &suffix, repeat.try_into().unwrap()); + // To find the split keys, it should flush memtable manually. + let mut cached = cluster.node(0).tablet_registry().get(region_id).unwrap(); + cached.latest().unwrap().flush_cf(CF_DEFAULT, true).unwrap(); + // send split region check to split bucket. + router + .send(region_id, PeerMsg::Tick(PeerTick::SplitRegionCheck)) + .unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + // report buckets to pd. + router + .send(region_id, PeerMsg::Tick(PeerTick::ReportBuckets)) + .unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + + let resp = block_on(cluster.node(0).pd_client().get_buckets_by_id(region_id)).unwrap(); + let mut buckets_tmp = vec![]; + let mut bucket_ranges = vec![]; + if let Some(buckets) = resp { + assert!(buckets.get_keys().len() > 2); + assert_eq!(buckets.get_region_id(), region_id); + let write_bytes = buckets.get_stats().get_write_bytes(); + let write_keys = buckets.get_stats().get_write_keys(); + for i in 0..buckets.keys.len() - 1 { + assert!(write_bytes[i] >= bytes); + assert!(write_keys[i] >= repeat); + } + for i in 0..buckets.keys.len() - 1 { + buckets_tmp.push(raftstore::store::Bucket::default()); + let bucket_range = + raftstore::store::BucketRange(buckets.keys[i].clone(), buckets.keys[i + 1].clone()); + bucket_ranges.push(bucket_range); + } + } + + // report buckets to pd again, the write bytes and keys should be zero. 
+ router + .send(region_id, PeerMsg::Tick(PeerTick::ReportBuckets)) + .unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + + let resp = block_on(cluster.node(0).pd_client().get_buckets_by_id(region_id)).unwrap(); + if let Some(buckets) = resp { + assert_eq!(buckets.get_region_id(), region_id); + let write_bytes = buckets.get_stats().get_write_bytes(); + let write_keys = buckets.get_stats().get_write_keys(); + for i in 0..buckets.keys.len() - 1 { + assert!(write_bytes[i] == 0); + assert!(write_keys[i] == 0); + } + } + + // send the same region buckets to refresh which needs to merge the last. + let resp = block_on(cluster.node(0).pd_client().get_region_by_id(region_id)).unwrap(); + if let Some(region) = resp { + let region_epoch = region.get_region_epoch().clone(); + for _ in 0..2 { + let msg = PeerMsg::RefreshRegionBuckets { + region_epoch: region_epoch.clone(), + buckets: buckets_tmp.clone(), + bucket_ranges: Some(bucket_ranges.clone()), + }; + router.send(region_id, msg).unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + } + } + // report buckets to pd again, the write bytes and keys should be zero. 
+ router + .send(region_id, PeerMsg::Tick(PeerTick::ReportBuckets)) + .unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + + let resp = block_on(cluster.node(0).pd_client().get_buckets_by_id(region_id)).unwrap(); + if let Some(buckets) = resp { + assert_eq!(buckets.get_region_id(), region_id); + let write_bytes = buckets.get_stats().get_write_bytes(); + let write_keys = buckets.get_stats().get_write_keys(); + assert_eq!(write_bytes.len(), 1); + assert_eq!(write_keys.len(), 1); + } + + fn write_keys(cluster: &Cluster, region_id: u64, suffix: &str, repeat: usize) -> u64 { + let router = &cluster.routers[0]; + let header = Box::new(router.new_request_for(region_id).take_header()); + for i in 0..repeat { + let mut put = SimpleWriteEncoder::with_capacity(64); + let mut key = format!("key-{}", i); + key.push_str(suffix); + put.put(CF_DEFAULT, key.as_bytes(), b"value"); + let (msg, sub) = PeerMsg::simple_write(header.clone(), put.clone().encode()); + router.send(region_id, msg).unwrap(); + let _resp = block_on(sub.result()).unwrap(); + } + ((suffix.as_bytes().len() + 10) * repeat) + .try_into() + .unwrap() + } +} diff --git a/components/raftstore-v2/tests/integrations/test_read.rs b/components/raftstore-v2/tests/integrations/test_read.rs new file mode 100644 index 00000000000..f9575ff8da1 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_read.rs @@ -0,0 +1,179 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use engine_traits::CF_DEFAULT; +use futures::executor::block_on; +use kvproto::raft_cmdpb::{CmdType, Request}; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; +use tikv_util::{config::ReadableDuration, store::new_peer}; +use txn_types::WriteBatchFlags; + +use crate::cluster::{v2_default_config, Cluster}; + +#[test] +fn test_read_index() { + let mut config = v2_default_config(); + config.raft_store_max_leader_lease = ReadableDuration::millis(150); + let cluster = Cluster::with_config(config); + let router = &cluster.routers[0]; + std::thread::sleep(std::time::Duration::from_millis(200)); + let region_id = 2; + let mut req = router.new_request_for(region_id); + let mut request_inner = Request::default(); + request_inner.set_cmd_type(CmdType::Snap); + request_inner.mut_read_index(); + req.mut_requests().push(request_inner); + let res = router.query(region_id, req.clone()).unwrap(); + let resp = res.read().unwrap(); + assert_eq!(resp.read_index, 6); // single node commited index should be 6. + + let res = router.query(region_id, req.clone()).unwrap(); + let resp = res.read().unwrap(); + // Since it's still with the lease, read index will be skipped. 
+ assert_eq!(resp.read_index, 0); + + std::thread::sleep(std::time::Duration::from_millis(200)); + // the read lease should be expired + let res = router.query(region_id, req.clone()).unwrap(); + let resp = res.read().unwrap(); + assert_eq!(resp.read_index, 6); + + std::thread::sleep(std::time::Duration::from_millis(200)); + let read_req = req.clone(); + // the read lease should be expired and renewed by write + let header = Box::new(router.new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); + + let (msg, sub) = PeerMsg::simple_write(header, put.encode()); + router.send(region_id, msg).unwrap(); + block_on(sub.result()).unwrap(); + + let res = router.query(region_id, read_req).unwrap(); + let resp = res.read().unwrap(); + assert_eq!(resp.read_index, 0); +} + +#[test] +fn test_snap_without_read_index() { + let cluster = Cluster::default(); + let router = &cluster.routers[0]; + std::thread::sleep(std::time::Duration::from_millis(200)); + let region_id = 2; + let mut req = router.new_request_for(region_id); + let mut request_inner = Request::default(); + request_inner.set_cmd_type(CmdType::Snap); + req.mut_requests().push(request_inner); + let res = router.query(region_id, req.clone()).unwrap(); + let resp = res.read().unwrap(); + // When it becomes leader, it will get a lease automatically because of empty + // entry. + assert_eq!(resp.read_index, 0); + + // run with header read_quorum + req.mut_header().set_read_quorum(true); + let res = router.query(region_id, req.clone()).unwrap(); + let resp = res.read().unwrap(); + // even the lease is valid, it should run read index + assert_eq!(resp.read_index, 6); + + // TODO: add more test when write is implemented. 
+} + +#[test] +fn test_query_with_write_cmd() { + let cluster = Cluster::default(); + let router = &cluster.routers[0]; + std::thread::sleep(std::time::Duration::from_millis(200)); + let region_id = 2; + let mut req = router.new_request_for(2); + + for write_cmd in [ + CmdType::Prewrite, + CmdType::Delete, + CmdType::DeleteRange, + CmdType::Put, + CmdType::IngestSst, + ] { + let mut request_inner = Request::default(); + request_inner.set_cmd_type(write_cmd); + req.mut_requests().push(request_inner); + let res = router.query(region_id, req.clone()).unwrap(); + let resp = res.read(); + assert!(resp.is_none()); + let error_resp = res.response().unwrap(); + assert!(error_resp.get_header().has_error()); + req.clear_requests(); + } +} + +#[test] +fn test_snap_with_invalid_parameter() { + let cluster = Cluster::default(); + let router = &cluster.routers[0]; + std::thread::sleep(std::time::Duration::from_millis(200)); + let region_id = 2; + let mut req = router.new_request_for(region_id); + let mut request_inner = Request::default(); + request_inner.set_cmd_type(CmdType::Snap); + req.mut_requests().push(request_inner); + + // store_id is incorrect; + let mut invalid_req = req.clone(); + invalid_req.mut_header().set_peer(new_peer(2, 3)); + let res = router.query(region_id, invalid_req).unwrap(); + let error_resp = res.response().unwrap(); + assert!(error_resp.get_header().has_error()); + + // run again, with incorrect peer_id + let mut invalid_req = req.clone(); + invalid_req.mut_header().set_peer(new_peer(1, 4)); + let res = router.query(region_id, invalid_req).unwrap(); + let error_resp = res.response().unwrap(); + assert!(error_resp.get_header().has_error()); + + // run with stale term + let mut invalid_req = req.clone(); + invalid_req.mut_header().set_term(1); + let res = router.query(region_id, invalid_req).unwrap(); + let error_resp = res.response().unwrap(); + assert!(error_resp.get_header().has_error()); + + // run with stale read + let mut invalid_req = 
req.clone(); + invalid_req + .mut_header() + .set_flags(WriteBatchFlags::STALE_READ.bits()); + let res = router.query(region_id, invalid_req).unwrap(); + let error_resp = res.response().unwrap(); + assert!(error_resp.get_header().has_error()); + + // run again with invalid region_epoch + let mut invalid_req = req.clone(); + let invalid_ver = req.get_header().get_region_epoch().get_version() + 1; + invalid_req + .mut_header() + .mut_region_epoch() + .set_version(invalid_ver); + let res = router.query(region_id, invalid_req).unwrap(); + let error_resp = res.response().unwrap(); + assert!(error_resp.get_header().has_error()); +} + +#[test] +fn test_local_read() { + let mut cluster = Cluster::default(); + let router = &mut cluster.routers[0]; + std::thread::sleep(std::time::Duration::from_millis(200)); + let region_id = 2; + let mut req = router.new_request_for(region_id); + let mut request_inner = Request::default(); + request_inner.set_cmd_type(CmdType::Snap); + req.mut_requests().push(request_inner); + + block_on(async { router.snapshot(req.clone()).await.unwrap() }); + let res = router.query(region_id, req.clone()).unwrap(); + let resp = res.read().unwrap(); + // The read index will be 0 as the retry process in the `get_snapshot` will + // renew the lease. + assert_eq!(resp.read_index, 0); +} diff --git a/components/raftstore-v2/tests/integrations/test_split.rs b/components/raftstore-v2/tests/integrations/test_split.rs new file mode 100644 index 00000000000..9dab98be598 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_split.rs @@ -0,0 +1,197 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::time::Duration; + +use engine_traits::{Peekable, RaftEngineReadOnly, CF_RAFT}; +use raftstore::store::{INIT_EPOCH_VER, RAFT_INIT_LOG_INDEX}; +use tikv_util::store::new_peer; +use txn_types::{Key, TimeStamp}; + +use crate::cluster::{split_helper::split_region, Cluster}; + +#[test] +fn test_split() { + let mut cluster = Cluster::default(); + let store_id = cluster.node(0).id(); + let raft_engine = cluster.node(0).running_state().unwrap().raft_engine.clone(); + let router = &mut cluster.routers[0]; + + let region_2 = 2; + let region = router.region_detail(region_2); + let peer = region.get_peers()[0].clone(); + router.wait_applied_to_current_term(region_2, Duration::from_secs(3)); + + // Region 2 ["", ""] + // -> Region 2 ["", "k22"] + // Region 1000 ["k22", ""] peer(1, 10) + let region_state = raft_engine + .get_region_state(region_2, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + let (left, mut right) = split_region( + router, + region, + peer.clone(), + 1000, + new_peer(store_id, 10), + Some(b"k11"), + Some(b"k33"), + b"k22", + b"k22", + false, + ); + let region_state = raft_engine + .get_region_state(region_2, u64::MAX) + .unwrap() + .unwrap(); + assert_ne!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + assert_eq!( + region_state.get_region().get_region_epoch().get_version(), + INIT_EPOCH_VER + 1 + ); + let region_state0 = raft_engine + .get_region_state(region_2, region_state.get_tablet_index()) + .unwrap() + .unwrap(); + assert_eq!(region_state, region_state0); + let flushed_index = raft_engine + .get_flushed_index(region_2, CF_RAFT) + .unwrap() + .unwrap(); + assert!( + flushed_index >= region_state.get_tablet_index(), + "{flushed_index} >= {}", + region_state.get_tablet_index() + ); + + // Region 2 ["", "k22"] + // -> Region 2 ["", "k11"] + // Region 1001 ["k11", "k22"] peer(1, 11) + let _ = split_region( + router, + left, + peer, + 1001, + new_peer(store_id, 11), + Some(b"k00"), + 
Some(b"k11"), + b"k11", + b"k11", + false, + ); + let region_state = raft_engine + .get_region_state(region_2, u64::MAX) + .unwrap() + .unwrap(); + assert_ne!( + region_state.get_tablet_index(), + region_state0.get_tablet_index() + ); + assert_eq!( + region_state.get_region().get_region_epoch().get_version(), + INIT_EPOCH_VER + 2 + ); + let region_state1 = raft_engine + .get_region_state(region_2, region_state.get_tablet_index()) + .unwrap() + .unwrap(); + assert_eq!(region_state, region_state1); + let flushed_index = raft_engine + .get_flushed_index(region_2, CF_RAFT) + .unwrap() + .unwrap(); + assert!( + flushed_index >= region_state.get_tablet_index(), + "{flushed_index} >= {}", + region_state.get_tablet_index() + ); + + // Region 1000 ["k22", ""] peer(1, 10) + // -> Region 1000 ["k22", "k33"] peer(1, 10) + // Region 1002 ["k33", ""] peer(1, 12) + let region_1000 = 1000; + let region_state = raft_engine + .get_region_state(region_1000, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + right = split_region( + router, + right, + new_peer(store_id, 10), + 1002, + new_peer(store_id, 12), + Some(b"k22"), + Some(b"k33"), + b"k33", + b"k33", + false, + ) + .1; + let region_state = raft_engine + .get_region_state(region_1000, u64::MAX) + .unwrap() + .unwrap(); + assert_ne!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + assert_eq!( + region_state.get_region().get_region_epoch().get_version(), + INIT_EPOCH_VER + 2 + ); + let region_state2 = raft_engine + .get_region_state(region_1000, region_state.get_tablet_index()) + .unwrap() + .unwrap(); + assert_eq!(region_state, region_state2); + let flushed_index = raft_engine + .get_flushed_index(region_1000, CF_RAFT) + .unwrap() + .unwrap(); + assert!( + flushed_index >= region_state.get_tablet_index(), + "{flushed_index} >= {}", + region_state.get_tablet_index() + ); + + // 1002 -> 1002, 1003 + let split_key = Key::from_raw(b"k44").append_ts(TimeStamp::zero()); + 
let actual_split_key = split_key.clone().truncate_ts().unwrap(); + split_region( + router, + right, + new_peer(store_id, 12), + 1003, + new_peer(store_id, 13), + Some(b"k33"), + Some(b"k55"), + split_key.as_encoded(), + actual_split_key.as_encoded(), + false, + ); + + // Split should survive restart. + drop(raft_engine); + cluster.restart(0); + let region_and_key = vec![ + (2, b"k00"), + (1000, b"k22"), + (1001, b"k11"), + (1002, b"k33"), + (1003, b"k55"), + ]; + for (region_id, key) in region_and_key { + let snapshot = cluster.routers[0].stale_snapshot(region_id); + assert!( + snapshot.get_value(key).unwrap().is_some(), + "{} {:?}", + region_id, + key + ); + } +} + +// TODO: test split race with +// - created peer +// - created peer with pending snapshot +// - created peer with persisting snapshot +// - created peer with persisted snapshot diff --git a/components/raftstore-v2/tests/integrations/test_status.rs b/components/raftstore-v2/tests/integrations/test_status.rs new file mode 100644 index 00000000000..59c23c4180f --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_status.rs @@ -0,0 +1,49 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use kvproto::raft_cmdpb::{RaftCmdRequest, StatusCmdType}; +use tikv_util::store::new_peer; + +use crate::cluster::Cluster; + +#[test] +fn test_status() { + let cluster = Cluster::default(); + let router = &cluster.routers[0]; + // When there is only one peer, it should campaign immediately. 
+ let mut req = RaftCmdRequest::default(); + req.mut_header().set_peer(new_peer(1, 3)); + req.mut_status_request() + .set_cmd_type(StatusCmdType::RegionLeader); + let res = router.query(2, req.clone()).unwrap(); + let status_resp = res.response().unwrap().get_status_response(); + assert_eq!( + *status_resp.get_region_leader().get_leader(), + new_peer(1, 3) + ); + + req.mut_status_request() + .set_cmd_type(StatusCmdType::RegionDetail); + let res = router.query(2, req.clone()).unwrap(); + let status_resp = res.response().unwrap().get_status_response(); + let detail = status_resp.get_region_detail(); + assert_eq!(*detail.get_leader(), new_peer(1, 3)); + let region = detail.get_region(); + assert_eq!(region.get_id(), 2); + assert!(region.get_start_key().is_empty()); + assert!(region.get_end_key().is_empty()); + assert_eq!(*region.get_peers(), vec![new_peer(1, 3)]); + assert_eq!(region.get_region_epoch().get_version(), 1); + assert_eq!(region.get_region_epoch().get_conf_ver(), 1); + + // Invalid store id should return error. + req.mut_header().mut_peer().set_store_id(4); + let res = router.query(2, req).unwrap(); + let resp = res.response().unwrap(); + assert!( + resp.get_header().get_error().has_store_not_match(), + "{:?}", + resp + ); + + // TODO: add a peer then check for region change and leadership change. +} diff --git a/components/raftstore-v2/tests/integrations/test_trace_apply.rs b/components/raftstore-v2/tests/integrations/test_trace_apply.rs new file mode 100644 index 00000000000..71682ff52a4 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_trace_apply.rs @@ -0,0 +1,217 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{path::Path, time::Duration}; + +use engine_traits::{DbOptionsExt, MiscExt, Peekable, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS}; +use futures::executor::block_on; +use raftstore::store::RAFT_INIT_LOG_INDEX; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; + +use crate::cluster::Cluster; + +fn count_file(path: &Path, pat: impl Fn(&Path) -> bool) -> usize { + let mut count = 0; + for path in std::fs::read_dir(path).unwrap() { + if pat(&path.unwrap().path()) { + count += 1; + } + } + count +} + +fn count_sst(path: &Path) -> usize { + count_file(path, |path| { + path.extension().map_or(false, |ext| ext == "sst") + }) +} + +fn count_info_log(path: &Path) -> usize { + count_file(path, |path| { + path.file_name() + .unwrap() + .to_string_lossy() + .starts_with("LOG") + }) +} + +/// Test if data will be recovered correctly after being restarted. +#[test] +fn test_data_recovery() { + let mut cluster = Cluster::default(); + let registry = cluster.node(0).tablet_registry(); + let tablet_2_path = registry.tablet_path(2, RAFT_INIT_LOG_INDEX); + // The rocksdb is a bootstrapped tablet, so it will be opened and closed in + // bootstrap, and then open again in fsm initialization. + assert_eq!(count_info_log(&tablet_2_path), 2); + let router = &mut cluster.routers[0]; + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + + // Write 100 keys to default CF and not flush. + let header = Box::new(router.new_request_for(2).take_header()); + for i in 0..100 { + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put( + CF_DEFAULT, + format!("key{}", i).as_bytes(), + format!("value{}", i).as_bytes(), + ); + router + .send(2, PeerMsg::simple_write(header.clone(), put.encode()).0) + .unwrap(); + } + + // Write 100 keys to write CF and flush half. 
+ let mut sub = None; + for i in 0..50 { + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put( + CF_WRITE, + format!("key{}", i).as_bytes(), + format!("value{}", i).as_bytes(), + ); + let (msg, s) = PeerMsg::simple_write(header.clone(), put.encode()); + router.send(2, msg).unwrap(); + sub = Some(s); + } + let resp = block_on(sub.take().unwrap().result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + let mut cached = cluster.node(0).tablet_registry().get(2).unwrap(); + cached.latest().unwrap().flush_cf(CF_WRITE, true).unwrap(); + let router = &mut cluster.routers[0]; + for i in 50..100 { + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put( + CF_WRITE, + format!("key{}", i).as_bytes(), + format!("value{}", i).as_bytes(), + ); + router + .send(2, PeerMsg::simple_write(header.clone(), put.encode()).0) + .unwrap(); + } + + // Write 100 keys to lock CF and flush all. + for i in 0..100 { + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put( + CF_LOCK, + format!("key{}", i).as_bytes(), + format!("value{}", i).as_bytes(), + ); + let (msg, s) = PeerMsg::simple_write(header.clone(), put.encode()); + router.send(2, msg).unwrap(); + sub = Some(s); + } + let resp = block_on(sub.take().unwrap().result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + cached = cluster.node(0).tablet_registry().get(2).unwrap(); + cached.latest().unwrap().flush_cf(CF_LOCK, true).unwrap(); + + // Make sure all keys must be written. 
+ let router = &mut cluster.routers[0]; + let snap = router.stale_snapshot(2); + for cf in DATA_CFS { + for i in 0..100 { + let key = format!("key{}", i); + let value = snap.get_value_cf(cf, key.as_bytes()).unwrap(); + assert_eq!( + value.as_deref(), + Some(format!("value{}", i).as_bytes()), + "{} {}", + cf, + key + ); + } + } + let registry = cluster.node(0).tablet_registry(); + cached = registry.get(2).unwrap(); + cached + .latest() + .unwrap() + .set_db_options(&[("avoid_flush_during_shutdown", "true")]) + .unwrap(); + drop((snap, cached)); + + cluster.restart(0); + + let registry = cluster.node(0).tablet_registry(); + cached = registry.get(2).unwrap(); + cached + .latest() + .unwrap() + .set_db_options(&[("avoid_flush_during_shutdown", "true")]) + .unwrap(); + let router = &mut cluster.routers[0]; + + // Write another key to ensure all data are recovered. + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key101", b"value101"); + let resp = router.simple_write(2, header, put).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + // After being restarted, all unflushed logs should be applied again. So there + // should be no missing data. + let snap = router.stale_snapshot(2); + for cf in DATA_CFS { + for i in 0..100 { + let key = format!("key{}", i); + let value = snap.get_value_cf(cf, key.as_bytes()).unwrap(); + assert_eq!( + value.as_deref(), + Some(format!("value{}", i).as_bytes()), + "{} {}", + cf, + key + ); + } + } + + // There is a restart, so LOG file should be rotate. + assert_eq!(count_info_log(&tablet_2_path), 3); + // We only trigger Flush twice, so there should be only 2 files. And because WAL + // is disabled, so when rocksdb is restarted, there should be no WAL to recover, + // so no additional flush will be triggered. 
+ assert_eq!(count_sst(&tablet_2_path), 2); + + cached = cluster.node(0).tablet_registry().get(2).unwrap(); + cached.latest().unwrap().flush_cfs(DATA_CFS, true).unwrap(); + + // Although all CFs are triggered again, but recovery should only write: + // 1. [0, 101) to CF_DEFAULT + // 2. [50, 100) to CF_WRITE + // + // So there will be only 2 memtables to be flushed. + assert_eq!(count_sst(&tablet_2_path), 4); + + drop((snap, cached)); + + cluster.restart(0); + + let router = &mut cluster.routers[0]; + + assert_eq!(count_info_log(&tablet_2_path), 4); + // Because data is flushed before restarted, so all data can be read + // immediately. + let snap = router.stale_snapshot(2); + for cf in DATA_CFS { + for i in 0..100 { + let key = format!("key{}", i); + let value = snap.get_value_cf(cf, key.as_bytes()).unwrap(); + assert_eq!( + value.as_deref(), + Some(format!("value{}", i).as_bytes()), + "{} {}", + cf, + key + ); + } + } + // Trigger flush again. + cached = cluster.node(0).tablet_registry().get(2).unwrap(); + cached.latest().unwrap().flush_cfs(DATA_CFS, true).unwrap(); + + // There is no recovery, so there should be nothing to flush. + assert_eq!(count_sst(&tablet_2_path), 4); +} diff --git a/components/raftstore-v2/tests/integrations/test_transfer_leader.rs b/components/raftstore-v2/tests/integrations/test_transfer_leader.rs new file mode 100644 index 00000000000..18d81ef16aa --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_transfer_leader.rs @@ -0,0 +1,160 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{assert_matches::assert_matches, time::Duration}; + +use engine_traits::{Peekable, CF_DEFAULT}; +use futures::executor::block_on; +use kvproto::{ + metapb, + raft_cmdpb::{AdminCmdType, TransferLeaderRequest}, +}; +use raft::prelude::ConfChangeType; +use raftstore_v2::{ + router::{PeerMsg, PeerTick}, + SimpleWriteEncoder, +}; +use tikv_util::store::new_peer; + +use crate::cluster::Cluster; + +fn put_data( + region_id: u64, + cluster: &mut Cluster, + node_off: usize, + node_off_for_verify: usize, + key: &[u8], +) { + let mut router = &mut cluster.routers[node_off]; + + router.wait_applied_to_current_term(region_id, Duration::from_secs(3)); + + // router.wait_applied_to_current_term(2, Duration::from_secs(3)); + let snap = router.stale_snapshot(region_id); + assert_matches!(snap.get_value(key), Ok(None)); + + let header = Box::new(router.new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, b"value"); + let (msg, mut sub) = PeerMsg::simple_write(header, put.encode()); + router.send(region_id, msg).unwrap(); + std::thread::sleep(std::time::Duration::from_millis(10)); + cluster.dispatch(region_id, vec![]); + assert!(block_on(sub.wait_proposed())); + + std::thread::sleep(std::time::Duration::from_millis(10)); + cluster.dispatch(region_id, vec![]); + // triage send snapshot + std::thread::sleep(std::time::Duration::from_millis(100)); + cluster.dispatch(region_id, vec![]); + assert!(block_on(sub.wait_committed())); + + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + router = &mut cluster.routers[node_off]; + let snap = router.stale_snapshot(region_id); + assert_eq!(snap.get_value(key).unwrap().unwrap(), b"value"); + + // Because of skip bcast commit, the data should not be applied yet. 
+ router = &mut cluster.routers[node_off_for_verify]; + let snap = router.stale_snapshot(region_id); + assert_matches!(snap.get_value(key), Ok(None)); + // Trigger heartbeat explicitly to commit on follower. + router = &mut cluster.routers[node_off]; + for _ in 0..2 { + router + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + router + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + } + cluster.dispatch(region_id, vec![]); + std::thread::sleep(std::time::Duration::from_millis(100)); + router = &mut cluster.routers[node_off_for_verify]; + let snap = router.stale_snapshot(region_id); + assert_eq!(snap.get_value(key).unwrap().unwrap(), b"value"); +} + +pub fn must_transfer_leader( + cluster: &Cluster, + region_id: u64, + from_off: usize, + to_off: usize, + to_peer: metapb::Peer, +) { + let router = &cluster.routers[from_off]; + let router2 = &cluster.routers[to_off]; + let mut req = router.new_request_for(region_id); + let mut transfer_req = TransferLeaderRequest::default(); + transfer_req.set_peer(to_peer.clone()); + let admin_req = req.mut_admin_request(); + admin_req.set_cmd_type(AdminCmdType::TransferLeader); + admin_req.set_transfer_leader(transfer_req); + let resp = router.admin_command(region_id, req).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + cluster.dispatch(region_id, vec![]); + + let meta = router + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.raft_status.soft_state.leader_id, to_peer.id); + let meta = router2 + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.raft_status.soft_state.leader_id, to_peer.id); +} + +#[test] +fn test_transfer_leader() { + let mut cluster = Cluster::with_node_count(3, None); + let region_id = 2; + let router0 = &cluster.routers[0]; + + let mut req = router0.new_request_for(region_id); + let admin_req = req.mut_admin_request(); + admin_req.set_cmd_type(AdminCmdType::ChangePeer); + admin_req + 
.mut_change_peer() + .set_change_type(ConfChangeType::AddNode); + let store_id = cluster.node(1).id(); + let peer1 = new_peer(store_id, 10); + admin_req.mut_change_peer().set_peer(peer1.clone()); + let req_clone = req.clone(); + let resp = router0.admin_command(region_id, req_clone).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let epoch = req.get_header().get_region_epoch(); + let new_conf_ver = epoch.get_conf_ver() + 1; + let leader_peer = req.get_header().get_peer().clone(); + let meta = router0 + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.region_state.epoch.version, epoch.get_version()); + assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); + assert_eq!(meta.region_state.peers, vec![leader_peer, peer1.clone()]); + let peer0_id = meta.raft_status.id; + + // So heartbeat will create a learner. + cluster.dispatch(region_id, vec![]); + let router1 = &cluster.routers[1]; + let meta = router1 + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + assert_eq!(peer0_id, meta.raft_status.soft_state.leader_id); + assert_eq!(meta.raft_status.id, peer1.id, "{:?}", meta); + assert_eq!(meta.region_state.epoch.version, epoch.get_version()); + assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); + cluster.dispatch(region_id, vec![]); + + // Ensure follower has latest entries before transfer leader. + put_data(region_id, &mut cluster, 0, 1, b"key1"); + + // Perform transfer leader + must_transfer_leader(&cluster, region_id, 0, 1, peer1); + + // Before transfer back to peer0, put some data again. 
+ put_data(region_id, &mut cluster, 1, 0, b"key2"); + + // Perform transfer leader + let store_id = cluster.node(0).id(); + must_transfer_leader(&cluster, region_id, 1, 0, new_peer(store_id, peer0_id)); +} diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 01519444b92..cbf943800ee 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" publish = false [features] -default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] +default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "engine_rocks"] failpoints = ["fail/failpoints"] testexport = [] test-engine-kv-rocksdb = [ @@ -28,70 +28,74 @@ cloud-gcp = ["sst_importer/cloud-gcp"] cloud-azure = ["sst_importer/cloud-azure"] [dependencies] -batch-system = { path = "../batch-system", default-features = false } +batch-system = { workspace = true } bitflags = "1.0.1" byteorder = "1.2" bytes = "1.0" -collections = { path = "../collections" } -concurrency_manager = { path = "../concurrency_manager", default-features = false } +causal_ts = { workspace = true } +collections = { workspace = true } +concurrency_manager = { workspace = true } crc32fast = "1.2" crossbeam = "0.8" derivative = "2" -encryption = { path = "../encryption", default-features = false } +encryption = { workspace = true } +engine_rocks = { workspace = true, optional = true } # Should be [dev-dependencies] but we need to control the features # https://github.com/rust-lang/cargo/issues/6915 -engine_test = { path = "../engine_test", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -error_code = { path = "../error_code", default-features = false } +engine_test = { workspace = true } +engine_traits = { workspace = true } +error_code = { workspace = true } fail = "0.5" -file_system = { path = "../file_system", default-features = false } +file_system = { workspace = true } fs2 = "0.4" futures = "0.3" 
futures-util = { version = "0.3.1", default-features = false, features = ["io"] } getset = "0.1" grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } -into_other = { path = "../into_other", default-features = false } +into_other = { workspace = true } itertools = "0.10" -keys = { path = "../keys", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +keys = { workspace = true } +kvproto = { workspace = true } lazy_static = "1.3" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } -log_wrappers = { path = "../log_wrappers" } -memory_trace_macros = { path = "../memory_trace_macros" } -online_config = { path = "../online_config" } +log_wrappers = { workspace = true } +memory_trace_macros = { workspace = true } +online_config = { workspace = true } openssl = "0.10" ordered-float = "2.6" parking_lot = "0.12" -pd_client = { path = "../pd_client", default-features = false } +pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft-proto = { version = "0.7.0", default-features = false } rand = "0.8.3" -resource_metering = { path = "../resource_metering" } +resource_control = { workspace = true } +resource_metering = { workspace = true } serde = "1.0" serde_derive = "1.0" serde_with = "1.4" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } smallvec = "1.4" -sst_importer = { path = "../sst_importer", default-features = false } +sst_importer = { workspace = true } tempfile = "3.0" thiserror = "1.0" -tidb_query_datatype = { 
path = "../tidb_query_datatype", default-features = false } -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tidb_query_datatype = { workspace = true } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } time = "0.1" tokio = { version = "1.5", features = ["sync", "rt-multi-thread"] } -txn_types = { path = "../txn_types", default-features = false } +tracker = { workspace = true } +txn_types = { workspace = true } uuid = { version = "0.8.1", features = ["serde", "v4"] } -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [dev-dependencies] -encryption_export = { path = "../encryption/export", default-features = false } -engine_panic = { path = "../engine_panic", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false } -panic_hook = { path = "../panic_hook" } -test_sst_importer = { path = "../test_sst_importer", default-features = false } +encryption_export = { workspace = true } +engine_panic = { workspace = true } +engine_rocks = { workspace = true } +panic_hook = { workspace = true } +test_sst_importer = { workspace = true } diff --git a/components/raftstore/src/compacted_event_sender.rs b/components/raftstore/src/compacted_event_sender.rs new file mode 100644 index 00000000000..99ba70a0512 --- /dev/null +++ b/components/raftstore/src/compacted_event_sender.rs @@ -0,0 +1,23 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+use std::sync::Mutex; + +use engine_rocks::{CompactedEventSender, RocksCompactedEvent, RocksEngine}; +use engine_traits::RaftEngine; +use tikv_util::error_unknown; + +use crate::store::{fsm::store::RaftRouter, StoreMsg}; + +// raftstore v1's implementation +pub struct RaftRouterCompactedEventSender { + pub router: Mutex>, +} + +impl CompactedEventSender for RaftRouterCompactedEventSender { + fn send(&self, event: RocksCompactedEvent) { + let router = self.router.lock().unwrap(); + let event = StoreMsg::CompactedEvent(event); + if let Err(e) = router.send_control(event) { + error_unknown!(?e; "send compaction finished event to raftstore failed"); + } + } +} diff --git a/components/raftstore/src/coprocessor/config.rs b/components/raftstore/src/coprocessor/config.rs index 0f553c879a2..c05a8e89a41 100644 --- a/components/raftstore/src/coprocessor/config.rs +++ b/components/raftstore/src/coprocessor/config.rs @@ -25,7 +25,7 @@ pub struct Config { /// [b,c), [c,d) will be region_split_size (maybe a little larger). /// by default, region_max_size = region_split_size * 2 / 3. pub region_max_size: Option, - pub region_split_size: ReadableSize, + pub region_split_size: Option, /// When the number of keys in region [a,e) meets the region_max_keys, /// it will be split into two several regions [a,b), [b,c), [c,d), [d,e). @@ -46,12 +46,15 @@ pub struct Config { pub perf_level: PerfLevel, // enable subsplit ranges (aka bucket) within the region - pub enable_region_bucket: bool, + pub enable_region_bucket: Option, pub region_bucket_size: ReadableSize, // region size threshold for using approximate size instead of scan pub region_size_threshold_for_approximate: ReadableSize, + #[online_config(skip)] + pub prefer_approximate_bucket: bool, // ratio of region_bucket_size. 
(0, 0.5) - // The region_bucket_merge_size_ratio * region_bucket_size is threshold to merge with its left neighbor bucket + // The region_bucket_merge_size_ratio * region_bucket_size is threshold to merge with its left + // neighbor bucket pub region_bucket_merge_size_ratio: f64, } @@ -67,7 +70,9 @@ pub enum ConsistencyCheckMethod { } /// Default region split size. -pub const SPLIT_SIZE_MB: u64 = 96; +pub const SPLIT_SIZE: ReadableSize = ReadableSize::mb(96); +pub const RAFTSTORE_V2_SPLIT_SIZE: ReadableSize = ReadableSize::gb(10); + /// Default batch split limit. pub const BATCH_SPLIT_LIMIT: u64 = 10; @@ -77,58 +82,104 @@ pub const DEFAULT_REGION_BUCKET_MERGE_SIZE_RATIO: f64 = 0.33; impl Default for Config { fn default() -> Config { - let split_size = ReadableSize::mb(SPLIT_SIZE_MB); Config { split_region_on_table: false, batch_split_limit: BATCH_SPLIT_LIMIT, - region_split_size: split_size, + region_split_size: None, region_max_size: None, region_split_keys: None, region_max_keys: None, consistency_check_method: ConsistencyCheckMethod::Mvcc, perf_level: PerfLevel::Uninitialized, - enable_region_bucket: false, + enable_region_bucket: None, region_bucket_size: DEFAULT_BUCKET_SIZE, region_size_threshold_for_approximate: DEFAULT_BUCKET_SIZE * BATCH_SPLIT_LIMIT / 2 * 3, region_bucket_merge_size_ratio: DEFAULT_REGION_BUCKET_MERGE_SIZE_RATIO, + prefer_approximate_bucket: true, } } } impl Config { + pub fn region_split_size(&self) -> ReadableSize { + self.region_split_size.unwrap_or(SPLIT_SIZE) + } + pub fn region_max_keys(&self) -> u64 { - let default_split_keys = self.region_split_size.as_mb_f64() * 10000.0; + let default_split_keys = self.region_split_size().as_mb_f64() * 10000.0; self.region_max_keys .unwrap_or(default_split_keys as u64 / 2 * 3) } pub fn region_max_size(&self) -> ReadableSize { self.region_max_size - .unwrap_or(self.region_split_size / 2 * 3) + .unwrap_or(self.region_split_size() / 2 * 3) } pub fn region_split_keys(&self) -> u64 { // Assume the 
average size of KVs is 100B. self.region_split_keys - .unwrap_or((self.region_split_size.as_mb_f64() * 10000.0) as u64) + .unwrap_or((self.region_split_size().as_mb_f64() * 10000.0) as u64) + } + + pub fn enable_region_bucket(&self) -> bool { + self.enable_region_bucket.unwrap_or(false) + } + + pub fn optimize_for(&mut self, raftstore_v2: bool) { + // overwrite the default region_split_size when it's multi-rocksdb + if self.region_split_size.is_none() { + if raftstore_v2 { + self.region_split_size = Some(RAFTSTORE_V2_SPLIT_SIZE); + } else { + self.region_split_size = Some(self.region_split_size()); + } + } + } + + fn validate_bucket_size(&self) -> Result<()> { + if self.region_split_size().0 < self.region_bucket_size.0 { + return Err(box_err!( + "region split size {} must >= region bucket size {}", + self.region_split_size().0, + self.region_bucket_size.0 + )); + } + if self.region_size_threshold_for_approximate.0 < self.region_bucket_size.0 { + return Err(box_err!( + "large region threshold size {} must >= region bucket size {}", + self.region_size_threshold_for_approximate.0, + self.region_bucket_size.0 + )); + } + if self.region_bucket_size.0 == 0 { + return Err(box_err!("region_bucket size cannot be 0.")); + } + if self.region_bucket_merge_size_ratio <= 0.0 || self.region_bucket_merge_size_ratio >= 0.5 + { + return Err(box_err!( + "region-bucket-merge-size-ratio should be 0 to 0.5 (not include both ends)." 
+ )); + } + Ok(()) } pub fn validate(&mut self) -> Result<()> { if self.region_split_keys.is_none() { - self.region_split_keys = Some((self.region_split_size.as_mb_f64() * 10000.0) as u64); + self.region_split_keys = Some((self.region_split_size().as_mb_f64() * 10000.0) as u64); } match self.region_max_size { Some(region_max_size) => { - if region_max_size.0 < self.region_split_size.0 { + if region_max_size.0 < self.region_split_size().0 { return Err(box_err!( "region max size {} must >= split size {}", region_max_size.0, - self.region_split_size.0 + self.region_split_size().0 )); } } - None => self.region_max_size = Some(self.region_split_size / 2 * 3), + None => self.region_max_size = Some(self.region_split_size() / 2 * 3), } match self.region_max_keys { @@ -143,31 +194,13 @@ impl Config { } None => self.region_max_keys = Some(self.region_split_keys() / 2 * 3), } - if self.enable_region_bucket { - if self.region_split_size.0 < self.region_bucket_size.0 { - return Err(box_err!( - "region split size {} must >= region bucket size {}", - self.region_split_size.0, - self.region_bucket_size.0 - )); - } - if self.region_size_threshold_for_approximate.0 < self.region_bucket_size.0 { - return Err(box_err!( - "large region threshold size {} must >= region bucket size {}", - self.region_size_threshold_for_approximate.0, - self.region_bucket_size.0 - )); - } - if self.region_bucket_size.0 == 0 { - return Err(box_err!("region_bucket size cannot be 0.")); - } - if self.region_bucket_merge_size_ratio <= 0.0 - || self.region_bucket_merge_size_ratio >= 0.5 - { - return Err(box_err!( - "region-bucket-merge-size-ratio should be 0 to 0.5 (not include both ends)." - )); - } + let res = self.validate_bucket_size(); + // If it's OK to enable bucket, we will prefer to enable it if useful. 
+ if let Ok(()) = res && self.enable_region_bucket.is_none() { + let useful = self.region_split_size() >= self.region_bucket_size * 2; + self.enable_region_bucket = Some(useful); + } else if let Err(e) = res && self.enable_region_bucket() { + return Err(e); } Ok(()) } @@ -204,35 +237,35 @@ mod tests { cfg = Config::default(); cfg.region_max_size = Some(ReadableSize(10)); - cfg.region_split_size = ReadableSize(20); - assert!(cfg.validate().is_err()); + cfg.region_split_size = Some(ReadableSize(20)); + cfg.validate().unwrap_err(); cfg = Config::default(); cfg.region_max_size = None; - cfg.region_split_size = ReadableSize(20); - assert!(cfg.validate().is_ok()); + cfg.region_split_size = Some(ReadableSize(20)); + cfg.validate().unwrap(); assert_eq!(cfg.region_max_size, Some(ReadableSize(30))); cfg = Config::default(); cfg.region_max_keys = Some(10); cfg.region_split_keys = Some(20); - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); cfg = Config::default(); cfg.region_max_keys = None; cfg.region_split_keys = Some(20); - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); assert_eq!(cfg.region_max_keys, Some(30)); cfg = Config::default(); - cfg.enable_region_bucket = false; - cfg.region_split_size = ReadableSize(20); + cfg.enable_region_bucket = Some(false); + cfg.region_split_size = Some(ReadableSize(20)); cfg.region_bucket_size = ReadableSize(30); - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); cfg = Config::default(); - cfg.region_split_size = ReadableSize::mb(20); - assert!(cfg.validate().is_ok()); + cfg.region_split_size = Some(ReadableSize::mb(20)); + cfg.validate().unwrap(); assert_eq!(cfg.region_split_keys, Some(200000)); } } diff --git a/components/raftstore/src/coprocessor/consistency_check.rs b/components/raftstore/src/coprocessor/consistency_check.rs index 16770595405..2ebf27c963f 100644 --- a/components/raftstore/src/coprocessor/consistency_check.rs +++ b/components/raftstore/src/coprocessor/consistency_check.rs @@ 
-60,13 +60,11 @@ impl ConsistencyCheckObserver for Raw { fn compute_hash_on_raw(region: &Region, snap: &S) -> Result { let region_id = region.get_id(); let mut digest = crc32fast::Hasher::new(); - let mut cf_names = snap.cf_names(); - cf_names.sort_unstable(); let start_key = keys::enc_start_key(region); let end_key = keys::enc_end_key(region); - for cf in cf_names { - snap.scan_cf(cf, &start_key, &end_key, false, |k, v| { + for cf in snap.cf_names() { + snap.scan(cf, &start_key, &end_key, false, |k, v| { digest.update(k); digest.update(v); Ok(true) diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 8c8b857a47b..0e45ef1d09d 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -1,20 +1,133 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath] called by Fsm on_ready_compute_hash -use std::{marker::PhantomData, mem, ops::Deref}; +use std::{borrow::Cow, marker::PhantomData, mem, ops::Deref}; use engine_traits::{CfName, KvEngine}; use kvproto::{ - metapb::Region, + metapb::{Region, RegionEpoch}, pdpb::CheckPolicy, raft_cmdpb::{ComputeHashRequest, RaftCmdRequest}, + raft_serverpb::RaftMessage, }; use protobuf::Message; use raft::eraftpb; use tikv_util::box_try; -use super::*; -use crate::store::CasualRouter; +use super::{split_observer::SplitObserver, *}; +use crate::store::BucketRange; + +/// A handle for coprocessor to schedule some command back to raftstore. 
+pub trait StoreHandle: Clone + Send { + fn update_approximate_size(&self, region_id: u64, size: u64); + fn update_approximate_keys(&self, region_id: u64, keys: u64); + fn ask_split( + &self, + region_id: u64, + region_epoch: RegionEpoch, + split_keys: Vec>, + source: Cow<'static, str>, + ); + fn refresh_region_buckets( + &self, + region_id: u64, + region_epoch: RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + ); + fn update_compute_hash_result( + &self, + region_id: u64, + index: u64, + context: Vec, + hash: Vec, + ); +} + +#[derive(Clone, Debug, PartialEq)] +pub enum SchedTask { + UpdateApproximateSize { + region_id: u64, + size: u64, + }, + UpdateApproximateKeys { + region_id: u64, + keys: u64, + }, + AskSplit { + region_id: u64, + region_epoch: RegionEpoch, + split_keys: Vec>, + source: Cow<'static, str>, + }, + RefreshRegionBuckets { + region_id: u64, + region_epoch: RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + }, + UpdateComputeHashResult { + region_id: u64, + index: u64, + hash: Vec, + context: Vec, + }, +} + +impl StoreHandle for std::sync::mpsc::SyncSender { + fn update_approximate_size(&self, region_id: u64, size: u64) { + let _ = self.try_send(SchedTask::UpdateApproximateSize { region_id, size }); + } + + fn update_approximate_keys(&self, region_id: u64, keys: u64) { + let _ = self.try_send(SchedTask::UpdateApproximateKeys { region_id, keys }); + } + + fn ask_split( + &self, + region_id: u64, + region_epoch: RegionEpoch, + split_keys: Vec>, + source: Cow<'static, str>, + ) { + let _ = self.try_send(SchedTask::AskSplit { + region_id, + region_epoch, + split_keys, + source, + }); + } + + fn refresh_region_buckets( + &self, + region_id: u64, + region_epoch: RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + ) { + let _ = self.try_send(SchedTask::RefreshRegionBuckets { + region_id, + region_epoch, + buckets, + bucket_ranges, + }); + } + + fn update_compute_hash_result( + &self, + region_id: u64, + index: u64, + context: Vec, + hash: 
Vec, + ) { + let _ = self.try_send(SchedTask::UpdateComputeHashResult { + region_id, + index, + context, + hash, + }); + } +} struct Entry { priority: u32, @@ -38,7 +151,7 @@ pub trait ClonableObserver: 'static + Send { } macro_rules! impl_box_observer { - ($name:ident, $ob: ident, $wrapper: ident) => { + ($name:ident, $ob:ident, $wrapper:ident) => { pub struct $name(Box + Send>); impl $name { pub fn new(observer: T) -> $name { @@ -82,7 +195,7 @@ macro_rules! impl_box_observer { // This is the same as impl_box_observer_g except $ob has a typaram macro_rules! impl_box_observer_g { - ($name:ident, $ob: ident, $wrapper: ident) => { + ($name:ident, $ob:ident, $wrapper:ident) => { pub struct $name(Box> + Send>); impl $name { pub fn new + Clone>(observer: T) -> $name { @@ -133,6 +246,11 @@ macro_rules! impl_box_observer_g { impl_box_observer!(BoxAdminObserver, AdminObserver, WrappedAdminObserver); impl_box_observer!(BoxQueryObserver, QueryObserver, WrappedQueryObserver); +impl_box_observer!( + BoxUpdateSafeTsObserver, + UpdateSafeTsObserver, + WrappedUpdateSafeTsObserver +); impl_box_observer!( BoxApplySnapshotObserver, ApplySnapshotObserver, @@ -143,6 +261,7 @@ impl_box_observer_g!( SplitCheckObserver, WrappedSplitCheckObserver ); +impl_box_observer!(BoxPdTaskObserver, PdTaskObserver, WrappedPdTaskObserver); impl_box_observer!(BoxRoleObserver, RoleObserver, WrappedRoleObserver); impl_box_observer!( BoxRegionChangeObserver, @@ -160,6 +279,7 @@ impl_box_observer_g!( ConsistencyCheckObserver, WrappedConsistencyCheckObserver ); +impl_box_observer!(BoxMessageObserver, MessageObserver, WrappedMessageObserver); /// Registry contains all registered coprocessors. 
#[derive(Clone)] @@ -176,6 +296,9 @@ where region_change_observers: Vec>, cmd_observers: Vec>>, read_index_observers: Vec>, + pd_task_observers: Vec>, + update_safe_ts_observers: Vec>, + message_observers: Vec>, // TODO: add endpoint } @@ -191,6 +314,9 @@ impl Default for Registry { region_change_observers: Default::default(), cmd_observers: Default::default(), read_index_observers: Default::default(), + pd_task_observers: Default::default(), + update_safe_ts_observers: Default::default(), + message_observers: Default::default(), } } } @@ -237,6 +363,10 @@ impl Registry { push!(priority, cco, self.consistency_check_observers); } + pub fn register_pd_task_observer(&mut self, priority: u32, ro: BoxPdTaskObserver) { + push!(priority, ro, self.pd_task_observers); + } + pub fn register_role_observer(&mut self, priority: u32, ro: BoxRoleObserver) { push!(priority, ro, self.role_observers); } @@ -252,10 +382,18 @@ impl Registry { pub fn register_read_index_observer(&mut self, priority: u32, rio: BoxReadIndexObserver) { push!(priority, rio, self.read_index_observers); } + pub fn register_update_safe_ts_observer(&mut self, priority: u32, qo: BoxUpdateSafeTsObserver) { + push!(priority, qo, self.update_safe_ts_observers); + } + + pub fn register_message_observer(&mut self, priority: u32, qo: BoxMessageObserver) { + push!(priority, qo, self.message_observers); + } } -/// A macro that loops over all observers and returns early when error is found or -/// bypass is set. `try_loop_ob` is expected to be used for hook that returns a `Result`. +/// A macro that loops over all observers and returns early when error is found +/// or bypass is set. `try_loop_ob` is expected to be used for hook that returns +/// a `Result`. macro_rules! 
try_loop_ob { ($r:expr, $obs:expr, $hook:ident, $($args:tt)*) => { loop_ob!(_imp _res, $r, $obs, $hook, $($args)*) @@ -321,10 +459,8 @@ where } impl CoprocessorHost { - pub fn new + Clone + Send + 'static>( - ch: C, - cfg: Config, - ) -> CoprocessorHost { + pub fn new(ch: C, cfg: Config) -> CoprocessorHost { + // TODO load coprocessors from configuration let mut registry = Registry::default(); registry.register_split_check_observer( 200, @@ -339,9 +475,20 @@ impl CoprocessorHost { 400, BoxSplitCheckObserver::new(TableCheckObserver::default()), ); + registry.register_admin_observer(100, BoxAdminObserver::new(SplitObserver)); CoprocessorHost { registry, cfg } } + pub fn on_empty_cmd(&self, region: &Region, index: u64, term: u64) { + loop_ob!( + region, + &self.registry.query_observers, + on_empty_cmd, + index, + term, + ); + } + /// Call all propose hooks until bypass is set to true. pub fn pre_propose(&self, region: &Region, req: &mut RaftCmdRequest) -> Result<()> { if !req.has_admin_request() { @@ -406,6 +553,63 @@ impl CoprocessorHost { } } + // (index, term) is for the applying entry. + pub fn pre_exec(&self, region: &Region, cmd: &RaftCmdRequest, index: u64, term: u64) -> bool { + let mut ctx = ObserverContext::new(region); + if !cmd.has_admin_request() { + let query = cmd.get_requests(); + for observer in &self.registry.query_observers { + let observer = observer.observer.inner(); + if observer.pre_exec_query(&mut ctx, query, index, term) { + return true; + } + } + false + } else { + let admin = cmd.get_admin_request(); + for observer in &self.registry.admin_observers { + let observer = observer.observer.inner(); + if observer.pre_exec_admin(&mut ctx, admin, index, term) { + return true; + } + } + false + } + } + + /// `post_exec` should be called immediately after we executed one raft + /// command. It notifies observers side effects of this command before + /// execution of the next command, including req/resp, apply state, + /// modified region state, etc. 
Return true if observers think a + /// persistence is necessary. + pub fn post_exec( + &self, + region: &Region, + cmd: &Cmd, + apply_state: &RaftApplyState, + region_state: &RegionState, + apply_ctx: &mut ApplyCtxInfo<'_>, + ) -> bool { + let mut ctx = ObserverContext::new(region); + if !cmd.response.has_admin_response() { + for observer in &self.registry.query_observers { + let observer = observer.observer.inner(); + if observer.post_exec_query(&mut ctx, cmd, apply_state, region_state, apply_ctx) { + return true; + } + } + false + } else { + for observer in &self.registry.admin_observers { + let observer = observer.observer.inner(); + if observer.post_exec_admin(&mut ctx, cmd, apply_state, region_state, apply_ctx) { + return true; + } + } + false + } + } + pub fn post_apply_plain_kvs_from_snapshot( &self, region: &Region, @@ -431,6 +635,47 @@ impl CoprocessorHost { ); } + pub fn should_pre_apply_snapshot(&self) -> bool { + for observer in &self.registry.apply_snapshot_observers { + let observer = observer.observer.inner(); + if observer.should_pre_apply_snapshot() { + return true; + } + } + false + } + + pub fn pre_apply_snapshot( + &self, + region: &Region, + peer_id: u64, + snap_key: &crate::store::SnapKey, + snap: Option<&crate::store::Snapshot>, + ) { + loop_ob!( + region, + &self.registry.apply_snapshot_observers, + pre_apply_snapshot, + peer_id, + snap_key, + snap, + ); + } + + pub fn post_apply_snapshot( + &self, + region: &Region, + peer_id: u64, + snap_key: &crate::store::SnapKey, + snap: Option<&crate::store::Snapshot>, + ) { + let mut ctx = ObserverContext::new(region); + for observer in &self.registry.apply_snapshot_observers { + let observer = observer.observer.inner(); + observer.post_apply_snapshot(&mut ctx, peer_id, snap_key, snap); + } + } + pub fn new_split_checker_host<'a>( &'a self, region: &Region, @@ -481,6 +726,15 @@ impl CoprocessorHost { Ok(hashes) } + pub fn on_compute_engine_size(&self) -> Option { + let mut store_size = None; + for 
observer in &self.registry.pd_task_observers { + let observer = observer.observer.inner(); + observer.on_compute_engine_size(&mut store_size); + } + store_size + } + pub fn on_role_change(&self, region: &Region, role_change: RoleChange) { loop_ob!( region, @@ -500,6 +754,51 @@ impl CoprocessorHost { ); } + /// `pre_persist` is called when we want to persist data or meta for a region. + /// For example, in `finish_for` and `commit`, + /// we will separately call `pre_persist` with is_finished = true/false. + /// By returning false, we reject this persistence. + pub fn pre_persist( + &self, + region: &Region, + is_finished: bool, + cmd: Option<&RaftCmdRequest>, + ) -> bool { + let mut ctx = ObserverContext::new(region); + for observer in &self.registry.region_change_observers { + let observer = observer.observer.inner(); + if !observer.pre_persist(&mut ctx, is_finished, cmd) { + return false; + } + } + true + } + + /// Should be called every time before we want to write apply state when + /// applying. Return a bool which indicates whether we can actually do + /// this write. + pub fn pre_write_apply_state(&self, region: &Region) -> bool { + let mut ctx = ObserverContext::new(region); + for observer in &self.registry.region_change_observers { + let observer = observer.observer.inner(); + if !observer.pre_write_apply_state(&mut ctx) { + return false; + } + } + true + } + + /// Returns false if the message should not be stepped later. 
+ pub fn on_raft_message(&self, msg: &RaftMessage) -> bool { + for observer in &self.registry.message_observers { + let observer = observer.observer.inner(); + if !observer.on_raft_message(msg) { + return false; + } + } + true + } + pub fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, @@ -537,6 +836,16 @@ impl CoprocessorHost { } } + pub fn on_update_safe_ts(&self, region_id: u64, self_safe_ts: u64, leader_safe_ts: u64) { + if self.registry.query_observers.is_empty() { + return; + } + for observer in &self.registry.update_safe_ts_observers { + let observer = observer.observer.inner(); + observer.on_update_safe_ts(region_id, self_safe_ts, leader_safe_ts) + } + } + pub fn shutdown(&self) { for entry in &self.registry.admin_observers { entry.observer.inner().stop(); @@ -564,7 +873,10 @@ mod tests { }; use tikv_util::box_err; - use crate::coprocessor::*; + use crate::{ + coprocessor::{dispatcher::BoxUpdateSafeTsObserver, *}, + store::{SnapKey, Snapshot}, + }; #[derive(Clone, Default)] struct TestCoprocessor { @@ -573,6 +885,33 @@ mod tests { return_err: Arc, } + enum ObserverIndex { + PreProposeAdmin = 1, + PreApplyAdmin = 2, + PostApplyAdmin = 3, + PreProposeQuery = 4, + PreApplyQuery = 5, + PostApplyQuery = 6, + OnRoleChange = 7, + OnRegionChanged = 8, + ApplyPlainKvs = 9, + ApplySst = 10, + OnFlushAppliedCmdBatch = 13, + OnEmptyCmd = 14, + PreExecQuery = 15, + PreExecAdmin = 16, + PostExecQuery = 17, + PostExecAdmin = 18, + OnComputeEngineSize = 19, + PreApplySnapshot = 20, + PostApplySnapshot = 21, + ShouldPreApplySnapshot = 22, + OnUpdateSafeTs = 23, + PrePersist = 24, + PreWriteApplyState = 25, + OnRaftMessage = 26, + } + impl Coprocessor for TestCoprocessor {} impl AdminObserver for TestCoprocessor { @@ -581,7 +920,8 @@ mod tests { ctx: &mut ObserverContext<'_>, _: &mut AdminRequest, ) -> Result<()> { - self.called.fetch_add(1, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::PreProposeAdmin as usize, Ordering::SeqCst); ctx.bypass = 
self.bypass.load(Ordering::SeqCst); if self.return_err.load(Ordering::SeqCst) { return Err(box_err!("error")); @@ -590,14 +930,43 @@ mod tests { } fn pre_apply_admin(&self, ctx: &mut ObserverContext<'_>, _: &AdminRequest) { - self.called.fetch_add(2, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::PreApplyAdmin as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } fn post_apply_admin(&self, ctx: &mut ObserverContext<'_>, _: &AdminResponse) { - self.called.fetch_add(3, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::PostApplyAdmin as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } + + fn pre_exec_admin( + &self, + ctx: &mut ObserverContext<'_>, + _: &AdminRequest, + _: u64, + _: u64, + ) -> bool { + self.called + .fetch_add(ObserverIndex::PreExecAdmin as usize, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + false + } + + fn post_exec_admin( + &self, + ctx: &mut ObserverContext<'_>, + _: &Cmd, + _: &RaftApplyState, + _: &RegionState, + _: &mut ApplyCtxInfo<'_>, + ) -> bool { + self.called + .fetch_add(ObserverIndex::PostExecAdmin as usize, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + false + } } impl QueryObserver for TestCoprocessor { @@ -606,7 +975,8 @@ mod tests { ctx: &mut ObserverContext<'_>, _: &mut Vec, ) -> Result<()> { - self.called.fetch_add(4, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::PreProposeQuery as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); if self.return_err.load(Ordering::SeqCst) { return Err(box_err!("error")); @@ -615,19 +985,64 @@ mod tests { } fn pre_apply_query(&self, ctx: &mut ObserverContext<'_>, _: &[Request]) { - self.called.fetch_add(5, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::PreApplyQuery as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } fn post_apply_query(&self, ctx: &mut ObserverContext<'_>, _: &Cmd) { - 
self.called.fetch_add(6, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::PostApplyQuery as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } + + fn pre_exec_query( + &self, + ctx: &mut ObserverContext<'_>, + _: &[Request], + _: u64, + _: u64, + ) -> bool { + self.called + .fetch_add(ObserverIndex::PreExecQuery as usize, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + false + } + + fn on_empty_cmd(&self, ctx: &mut ObserverContext<'_>, _index: u64, _term: u64) { + self.called + .fetch_add(ObserverIndex::OnEmptyCmd as usize, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + } + + fn post_exec_query( + &self, + ctx: &mut ObserverContext<'_>, + _: &Cmd, + _: &RaftApplyState, + _: &RegionState, + _: &mut ApplyCtxInfo<'_>, + ) -> bool { + self.called + .fetch_add(ObserverIndex::PostExecQuery as usize, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + false + } + } + + impl PdTaskObserver for TestCoprocessor { + fn on_compute_engine_size(&self, _: &mut Option) { + self.called.fetch_add( + ObserverIndex::OnComputeEngineSize as usize, + Ordering::SeqCst, + ); + } } impl RoleObserver for TestCoprocessor { fn on_role_change(&self, ctx: &mut ObserverContext<'_>, _: &RoleChange) { - self.called.fetch_add(7, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::OnRoleChange as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } } @@ -639,9 +1054,29 @@ mod tests { _: RegionChangeEvent, _: StateRole, ) { - self.called.fetch_add(8, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::OnRegionChanged as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } + + fn pre_persist( + &self, + ctx: &mut ObserverContext<'_>, + _: bool, + _: Option<&RaftCmdRequest>, + ) -> bool { + self.called + .fetch_add(ObserverIndex::PrePersist as usize, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + true + } + + 
fn pre_write_apply_state(&self, ctx: &mut ObserverContext<'_>) -> bool { + self.called + .fetch_add(ObserverIndex::PreWriteApplyState as usize, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + true + } } impl ApplySnapshotObserver for TestCoprocessor { @@ -651,14 +1086,48 @@ mod tests { _: CfName, _: &[(Vec, Vec)], ) { - self.called.fetch_add(9, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::ApplyPlainKvs as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } fn apply_sst(&self, ctx: &mut ObserverContext<'_>, _: CfName, _: &str) { - self.called.fetch_add(10, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::ApplySst as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } + + fn pre_apply_snapshot( + &self, + ctx: &mut ObserverContext<'_>, + _: u64, + _: &SnapKey, + _: Option<&Snapshot>, + ) { + self.called + .fetch_add(ObserverIndex::PreApplySnapshot as usize, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + } + + fn post_apply_snapshot( + &self, + ctx: &mut ObserverContext<'_>, + _: u64, + _: &crate::store::SnapKey, + _: Option<&Snapshot>, + ) { + self.called + .fetch_add(ObserverIndex::PostApplySnapshot as usize, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + } + + fn should_pre_apply_snapshot(&self) -> bool { + self.called.fetch_add( + ObserverIndex::ShouldPreApplySnapshot as usize, + Ordering::SeqCst, + ); + false + } } impl CmdObserver for TestCoprocessor { @@ -668,11 +1137,29 @@ mod tests { _: &mut Vec, _: &PanicEngine, ) { - self.called.fetch_add(13, Ordering::SeqCst); + self.called.fetch_add( + ObserverIndex::OnFlushAppliedCmdBatch as usize, + Ordering::SeqCst, + ); } fn on_applied_current_term(&self, _: StateRole, _: &Region) {} } + impl UpdateSafeTsObserver for TestCoprocessor { + fn on_update_safe_ts(&self, _: u64, _: u64, _: u64) { + self.called + .fetch_add(ObserverIndex::OnUpdateSafeTs as usize, 
Ordering::SeqCst); + } + } + + impl MessageObserver for TestCoprocessor { + fn on_raft_message(&self, _: &RaftMessage) -> bool { + self.called + .fetch_add(ObserverIndex::OnRaftMessage as usize, Ordering::SeqCst); + true + } + } + macro_rules! assert_all { ($target:expr, $expect:expr) => {{ for (c, e) in ($target).iter().zip($expect) { @@ -699,44 +1186,62 @@ mod tests { .register_query_observer(1, BoxQueryObserver::new(ob.clone())); host.registry .register_apply_snapshot_observer(1, BoxApplySnapshotObserver::new(ob.clone())); + host.registry + .register_pd_task_observer(1, BoxPdTaskObserver::new(ob.clone())); host.registry .register_role_observer(1, BoxRoleObserver::new(ob.clone())); host.registry .register_region_change_observer(1, BoxRegionChangeObserver::new(ob.clone())); host.registry .register_cmd_observer(1, BoxCmdObserver::new(ob.clone())); + host.registry + .register_update_safe_ts_observer(1, BoxUpdateSafeTsObserver::new(ob.clone())); + host.registry + .register_message_observer(1, BoxMessageObserver::new(ob.clone())); + + let mut index: usize = 0; let region = Region::default(); let mut admin_req = RaftCmdRequest::default(); admin_req.set_admin_request(AdminRequest::default()); host.pre_propose(®ion, &mut admin_req).unwrap(); - assert_all!([&ob.called], &[1]); + index += ObserverIndex::PreProposeAdmin as usize; + assert_all!([&ob.called], &[index]); host.pre_apply(®ion, &admin_req); - assert_all!([&ob.called], &[3]); + index += ObserverIndex::PreApplyAdmin as usize; + assert_all!([&ob.called], &[index]); let mut admin_resp = RaftCmdResponse::default(); admin_resp.set_admin_response(AdminResponse::default()); - host.post_apply(®ion, &Cmd::new(0, admin_req, admin_resp)); - assert_all!([&ob.called], &[6]); + host.post_apply(®ion, &Cmd::new(0, 0, admin_req, admin_resp)); + index += ObserverIndex::PostApplyAdmin as usize; + assert_all!([&ob.called], &[index]); let mut query_req = RaftCmdRequest::default(); 
query_req.set_requests(vec![Request::default()].into()); host.pre_propose(®ion, &mut query_req).unwrap(); - assert_all!([&ob.called], &[10]); + index += ObserverIndex::PreProposeQuery as usize; + assert_all!([&ob.called], &[index]); + index += ObserverIndex::PreApplyQuery as usize; host.pre_apply(®ion, &query_req); - assert_all!([&ob.called], &[15]); + assert_all!([&ob.called], &[index]); let query_resp = RaftCmdResponse::default(); - host.post_apply(®ion, &Cmd::new(0, query_req, query_resp)); - assert_all!([&ob.called], &[21]); + host.post_apply(®ion, &Cmd::new(0, 0, query_req, query_resp)); + index += ObserverIndex::PostApplyQuery as usize; + assert_all!([&ob.called], &[index]); host.on_role_change(®ion, RoleChange::new(StateRole::Leader)); - assert_all!([&ob.called], &[28]); + index += ObserverIndex::OnRoleChange as usize; + assert_all!([&ob.called], &[index]); host.on_region_changed(®ion, RegionChangeEvent::Create, StateRole::Follower); - assert_all!([&ob.called], &[36]); + index += ObserverIndex::OnRegionChanged as usize; + assert_all!([&ob.called], &[index]); host.post_apply_plain_kvs_from_snapshot(®ion, "default", &[]); - assert_all!([&ob.called], &[45]); + index += ObserverIndex::ApplyPlainKvs as usize; + assert_all!([&ob.called], &[index]); host.post_apply_sst_from_snapshot(®ion, "default", ""); - assert_all!([&ob.called], &[55]); + index += ObserverIndex::ApplySst as usize; + assert_all!([&ob.called], &[index]); let observe_info = CmdObserveInfo::from_handle( ObserveHandle::new(), @@ -746,8 +1251,72 @@ mod tests { let mut cb = CmdBatch::new(&observe_info, 0); cb.push(&observe_info, 0, Cmd::default()); host.on_flush_applied_cmd_batch(cb.level, vec![cb], &PanicEngine); - // `post_apply` + `on_flush_applied_cmd_batch` => 13 + 6 = 19 - assert_all!([&ob.called], &[74]); + index += ObserverIndex::PostApplyQuery as usize; + index += ObserverIndex::OnFlushAppliedCmdBatch as usize; + assert_all!([&ob.called], &[index]); + + let mut empty_req = 
RaftCmdRequest::default(); + empty_req.set_requests(vec![Request::default()].into()); + host.on_empty_cmd(®ion, 0, 0); + index += ObserverIndex::OnEmptyCmd as usize; + assert_all!([&ob.called], &[index]); + + let mut query_req = RaftCmdRequest::default(); + query_req.set_requests(vec![Request::default()].into()); + host.pre_exec(®ion, &query_req, 0, 0); + index += ObserverIndex::PreExecQuery as usize; + assert_all!([&ob.called], &[index]); + + let mut admin_req = RaftCmdRequest::default(); + admin_req.set_admin_request(AdminRequest::default()); + host.pre_exec(®ion, &admin_req, 0, 0); + index += ObserverIndex::PreExecAdmin as usize; + assert_all!([&ob.called], &[index]); + + host.on_compute_engine_size(); + index += ObserverIndex::OnComputeEngineSize as usize; + assert_all!([&ob.called], &[index]); + + let mut pending_handle_ssts = None; + let mut delete_ssts = vec![]; + let mut pending_delete_ssts = vec![]; + let mut info = ApplyCtxInfo { + pending_handle_ssts: &mut pending_handle_ssts, + pending_delete_ssts: &mut pending_delete_ssts, + delete_ssts: &mut delete_ssts, + }; + let apply_state = RaftApplyState::default(); + let region_state = RegionState::default(); + let cmd = Cmd::default(); + host.post_exec(®ion, &cmd, &apply_state, ®ion_state, &mut info); + index += ObserverIndex::PostExecQuery as usize; + assert_all!([&ob.called], &[index]); + + let key = SnapKey::new(region.get_id(), 1, 1); + host.pre_apply_snapshot(®ion, 0, &key, None); + index += ObserverIndex::PreApplySnapshot as usize; + assert_all!([&ob.called], &[index]); + + host.post_apply_snapshot(®ion, 0, &key, None); + index += ObserverIndex::PostApplySnapshot as usize; + assert_all!([&ob.called], &[index]); + + host.should_pre_apply_snapshot(); + index += ObserverIndex::ShouldPreApplySnapshot as usize; + assert_all!([&ob.called], &[index]); + + host.on_update_safe_ts(1, 1, 1); + index += ObserverIndex::OnUpdateSafeTs as usize; + assert_all!([&ob.called], &[index]); + + 
host.pre_write_apply_state(®ion); + index += ObserverIndex::PreWriteApplyState as usize; + assert_all!([&ob.called], &[index]); + + let msg = RaftMessage::default(); + host.on_raft_message(&msg); + index += ObserverIndex::OnRaftMessage as usize; + assert_all!([&ob.called], &[index]); } #[test] @@ -788,7 +1357,7 @@ mod tests { host.pre_apply(®ion, &req); assert_all!([&ob1.called, &ob2.called], &[0, base_score * 2 + 3]); - host.post_apply(®ion, &Cmd::new(0, req.clone(), resp.clone())); + host.post_apply(®ion, &Cmd::new(0, 0, req.clone(), resp.clone())); assert_all!([&ob1.called, &ob2.called], &[0, base_score * 3 + 6]); set_all!(&[&ob2.bypass], false); diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index a9772d948ed..f5bdd8664e6 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -9,11 +9,12 @@ use std::{ vec::IntoIter, }; -use engine_traits::CfName; +use engine_traits::{CfName, SstMetaInfo}; use kvproto::{ metapb::Region, pdpb::CheckPolicy, raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, RaftCmdResponse, Request}, + raft_serverpb::RaftApplyState, }; use raft::{eraftpb, StateRole}; @@ -25,14 +26,16 @@ mod metrics; pub mod region_info_accessor; mod split_check; pub mod split_observer; +use kvproto::raft_serverpb::RaftMessage; pub use self::{ config::{Config, ConsistencyCheckMethod}, consistency_check::{ConsistencyCheckObserver, Raw as RawConsistencyCheckObserver}, dispatcher::{ BoxAdminObserver, BoxApplySnapshotObserver, BoxCmdObserver, BoxConsistencyCheckObserver, - BoxQueryObserver, BoxRegionChangeObserver, BoxRoleObserver, BoxSplitCheckObserver, - CoprocessorHost, Registry, + BoxMessageObserver, BoxPdTaskObserver, BoxQueryObserver, BoxRegionChangeObserver, + BoxRoleObserver, BoxSplitCheckObserver, BoxUpdateSafeTsObserver, CoprocessorHost, Registry, + StoreHandle, }, error::{Error, Result}, region_info_accessor::{ @@ -74,6 +77,21 @@ impl<'a> 
ObserverContext<'a> { } } +/// Context of a region provided for observers. +#[derive(Default, Clone)] +pub struct RegionState { + pub peer_id: u64, + pub pending_remove: bool, + pub modified_region: Option, +} + +/// Context for exec observers of mutation to be applied to ApplyContext. +pub struct ApplyCtxInfo<'a> { + pub pending_handle_ssts: &'a mut Option>, + pub delete_ssts: &'a mut Vec, + pub pending_delete_ssts: &'a mut Vec, +} + pub trait AdminObserver: Coprocessor { /// Hook to call before proposing admin request. fn pre_propose_admin(&self, _: &mut ObserverContext<'_>, _: &mut AdminRequest) -> Result<()> { @@ -86,9 +104,39 @@ pub trait AdminObserver: Coprocessor { /// Hook to call after applying admin request. /// For now, the `region` in `ObserverContext` is an empty region. fn post_apply_admin(&self, _: &mut ObserverContext<'_>, _: &AdminResponse) {} + + /// Hook before exec admin request, returns whether we should skip this + /// admin. + fn pre_exec_admin( + &self, + _: &mut ObserverContext<'_>, + _: &AdminRequest, + _: u64, + _: u64, + ) -> bool { + false + } + + /// Hook to call immediately after exec command + /// Will be a special persistence after this exec if a observer returns + /// true. + fn post_exec_admin( + &self, + _: &mut ObserverContext<'_>, + _: &Cmd, + _: &RaftApplyState, + _: &RegionState, + _: &mut ApplyCtxInfo<'_>, + ) -> bool { + false + } } pub trait QueryObserver: Coprocessor { + /// Hook when observe applying empty cmd, probably caused by leadership + /// change. + fn on_empty_cmd(&self, _: &mut ObserverContext<'_>, _index: u64, _term: u64) {} + /// Hook to call before proposing write request. /// /// We don't propose read request, hence there is no hook for it yet. @@ -102,17 +150,66 @@ pub trait QueryObserver: Coprocessor { /// Hook to call after applying write request. /// For now, the `region` in `ObserverContext` is an empty region. 
fn post_apply_query(&self, _: &mut ObserverContext<'_>, _: &Cmd) {} + + /// Hook before exec write request, returns whether we should skip this + /// write. + fn pre_exec_query(&self, _: &mut ObserverContext<'_>, _: &[Request], _: u64, _: u64) -> bool { + false + } + + /// Hook to call immediately after exec command. + /// Will be a special persistence after this exec if a observer returns + /// true. + fn post_exec_query( + &self, + _: &mut ObserverContext<'_>, + _: &Cmd, + _: &RaftApplyState, + _: &RegionState, + _: &mut ApplyCtxInfo<'_>, + ) -> bool { + false + } } pub trait ApplySnapshotObserver: Coprocessor { /// Hook to call after applying key from plain file. - /// This may be invoked multiple times for each plain file, and each time a batch of key-value - /// pairs will be passed to the function. + /// This may be invoked multiple times for each plain file, and each time a + /// batch of key-value pairs will be passed to the function. fn apply_plain_kvs(&self, _: &mut ObserverContext<'_>, _: CfName, _: &[(Vec, Vec)]) {} - /// Hook to call after applying sst file. Currently the content of the snapshot can't be - /// passed to the observer. + /// Hook to call after applying sst file. Currently the content of the + /// snapshot can't be passed to the observer. fn apply_sst(&self, _: &mut ObserverContext<'_>, _: CfName, _path: &str) {} + + /// Hook when receiving Task::Apply. + /// Should pass valid snapshot, the option is only for testing. + /// Notice that we can call `pre_apply_snapshot` to multiple snapshots at + /// the same time. + fn pre_apply_snapshot( + &self, + _: &mut ObserverContext<'_>, + _peer_id: u64, + _: &crate::store::SnapKey, + _: Option<&crate::store::Snapshot>, + ) { + } + + /// Hook when the whole snapshot is applied. + /// Should pass valid snapshot, the option is only for testing. 
+ fn post_apply_snapshot( + &self, + _: &mut ObserverContext<'_>, + _: u64, + _: &crate::store::SnapKey, + _snapshot: Option<&crate::store::Snapshot>, + ) { + } + + /// We call pre_apply_snapshot only when one of the observer returns true. + fn should_pre_apply_snapshot(&self) -> bool { + false + } } /// SplitChecker is invoked during a split check scan, and decides to use @@ -148,6 +245,24 @@ pub trait SplitCheckObserver: Coprocessor { ); } +/// Describes size information about all stores. +/// There is guarantee that capacity >= used + avail. +/// since some space can be reserved. +#[derive(Debug, Default)] +pub struct StoreSizeInfo { + /// The capacity of the store. + pub capacity: u64, + /// Size of actual data. + pub used: u64, + /// Available space that can be written with actual data. + pub avail: u64, +} + +pub trait PdTaskObserver: Coprocessor { + /// Compute capacity/used/available size of this store. + fn on_compute_engine_size(&self, _: &mut Option) {} +} + pub struct RoleChange { pub state: StateRole, pub leader_id: u64, @@ -155,15 +270,20 @@ pub struct RoleChange { pub prev_lead_transferee: u64, /// Which peer is voted by itself. pub vote: u64, + pub initialized: bool, + pub peer_id: u64, } impl RoleChange { + #[cfg(any(test, feature = "testexport"))] pub fn new(state: StateRole) -> Self { RoleChange { state, leader_id: raft::INVALID_ID, prev_lead_transferee: raft::INVALID_ID, vote: raft::INVALID_ID, + initialized: true, + peer_id: raft::INVALID_ID, } } } @@ -172,8 +292,8 @@ pub trait RoleObserver: Coprocessor { /// Hook to call when role of a peer changes. /// /// Please note that, this hook is not called at realtime. There maybe a - /// situation that the hook is not called yet, however the role of some peers - /// have changed. + /// situation that the hook is not called yet, however the role of some + /// peers have changed. 
fn on_role_change(&self, _: &mut ObserverContext<'_>, _: &RoleChange) {} } @@ -184,6 +304,7 @@ pub enum RegionChangeReason { PrepareMerge, CommitMerge, RollbackMerge, + SwitchWitness, } #[derive(Clone, Copy, Debug, PartialEq)] @@ -197,19 +318,46 @@ pub enum RegionChangeEvent { pub trait RegionChangeObserver: Coprocessor { /// Hook to call when a region changed on this TiKV fn on_region_changed(&self, _: &mut ObserverContext<'_>, _: RegionChangeEvent, _: StateRole) {} + + /// Should be called everytime before we write a WriteBatch into + /// KvEngine. Returns false if we can't commit at this time. + fn pre_persist( + &self, + _: &mut ObserverContext<'_>, + _is_finished: bool, + _cmd: Option<&RaftCmdRequest>, + ) -> bool { + true + } + + /// Should be called everytime before we want to write apply state when + /// applying. Return a bool which indicates whether we can actually do + /// this write. + fn pre_write_apply_state(&self, _: &mut ObserverContext<'_>) -> bool { + true + } +} + +pub trait MessageObserver: Coprocessor { + /// Returns false if the message should not be stepped later. + fn on_raft_message(&self, _: &RaftMessage) -> bool { + true + } } #[derive(Clone, Debug, Default)] pub struct Cmd { pub index: u64, + pub term: u64, pub request: RaftCmdRequest, pub response: RaftCmdResponse, } impl Cmd { - pub fn new(index: u64, request: RaftCmdRequest, response: RaftCmdResponse) -> Cmd { + pub fn new(index: u64, term: u64, request: RaftCmdRequest, response: RaftCmdResponse) -> Cmd { Cmd { index, + term, request, response, } @@ -220,33 +368,34 @@ static OBSERVE_ID_ALLOC: AtomicUsize = AtomicUsize::new(0); /// A unique identifier for checking stale observed commands. 
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Ord, PartialOrd, Hash)] -pub struct ObserveID(usize); +pub struct ObserveId(usize); -impl ObserveID { - pub fn new() -> ObserveID { - ObserveID(OBSERVE_ID_ALLOC.fetch_add(1, Ordering::SeqCst)) +impl ObserveId { + pub fn new() -> ObserveId { + ObserveId(OBSERVE_ID_ALLOC.fetch_add(1, Ordering::SeqCst)) } } -/// ObserveHandle is the status of a term of observing, it contains the `ObserveID` -/// and the `observing` flag indicate whether the observing is ongoing +/// ObserveHandle is the status of a term of observing, it contains the +/// `ObserveId` and the `observing` flag indicate whether the observing is +/// ongoing #[derive(Clone, Default, Debug)] pub struct ObserveHandle { - pub id: ObserveID, + pub id: ObserveId, observing: Arc, } impl ObserveHandle { pub fn new() -> ObserveHandle { ObserveHandle { - id: ObserveID::new(), + id: ObserveId::new(), observing: Arc::new(AtomicBool::new(true)), } } pub fn with_id(id: usize) -> ObserveHandle { ObserveHandle { - id: ObserveID(id), + id: ObserveId(id), observing: Arc::new(AtomicBool::new(true)), } } @@ -280,15 +429,16 @@ impl CmdObserveInfo { } } - /// Get the max observe level of the observer info by the observers currently registered. - /// Currently, TiKV uses a static strategy for managing observers. - /// There are a fixed number type of observer being registered in each TiKV node, - /// and normally, observers are singleton. + /// Get the max observe level of the observer info by the observers + /// currently registered. Currently, TiKV uses a static strategy for + /// managing observers. There are a fixed number type of observer being + /// registered in each TiKV node, and normally, observers are singleton. /// The types are: /// CDC: Observer supports the `ChangeData` service. /// PiTR: Observer supports the `backup-log` function. - /// RTS: Observer supports the `resolved-ts` advancing (and follower read, etc.). 
- fn observe_level(&self) -> ObserveLevel { + /// RTS: Observer supports the `resolved-ts` advancing (and follower read, + /// etc.). + pub fn observe_level(&self) -> ObserveLevel { let cdc = if self.cdc_id.is_observing() { // `cdc` observe all data ObserveLevel::All @@ -335,9 +485,9 @@ pub enum ObserveLevel { #[derive(Clone, Debug)] pub struct CmdBatch { pub level: ObserveLevel, - pub cdc_id: ObserveID, - pub rts_id: ObserveID, - pub pitr_id: ObserveID, + pub cdc_id: ObserveId, + pub rts_id: ObserveId, + pub pitr_id: ObserveId, pub region_id: u64, pub cmds: Vec, } @@ -362,6 +512,19 @@ impl CmdBatch { self.cmds.push(cmd) } + pub fn extend>( + &mut self, + observe_info: &CmdObserveInfo, + region_id: u64, + cmds: I, + ) { + assert_eq!(region_id, self.region_id); + assert_eq!(observe_info.cdc_id.id, self.cdc_id); + assert_eq!(observe_info.rts_id.id, self.rts_id); + assert_eq!(observe_info.pitr_id.id, self.pitr_id); + self.cmds.extend(cmds) + } + pub fn into_iter(self, region_id: u64) -> IntoIter { assert_eq!(region_id, self.region_id); self.cmds.into_iter() @@ -403,7 +566,8 @@ pub trait CmdObserver: Coprocessor { cmd_batches: &mut Vec, engine: &E, ); - // TODO: maybe shoulde move `on_applied_current_term` to a separated `Coprocessor` + // TODO: maybe should move `on_applied_current_term` to a separated + // `Coprocessor` /// Hook to call at the first time the leader applied on its term fn on_applied_current_term(&self, role: StateRole, region: &Region); } @@ -413,6 +577,11 @@ pub trait ReadIndexObserver: Coprocessor { fn on_step(&self, _msg: &mut eraftpb::Message, _role: StateRole) {} } +pub trait UpdateSafeTsObserver: Coprocessor { + /// Hook after update self safe_ts and received leader safe_ts. 
+ fn on_update_safe_ts(&self, _: u64, _: u64, _: u64) {} +} + #[cfg(test)] mod tests { use super::*; diff --git a/components/raftstore/src/coprocessor/region_info_accessor.rs b/components/raftstore/src/coprocessor/region_info_accessor.rs index c38f1161a1f..37403310baf 100644 --- a/components/raftstore/src/coprocessor/region_info_accessor.rs +++ b/components/raftstore/src/coprocessor/region_info_accessor.rs @@ -6,12 +6,13 @@ use std::{ Bound::{Excluded, Unbounded}, }, fmt::{Display, Formatter, Result as FmtResult}, - sync::{mpsc, Mutex}, + sync::{mpsc, Arc, Mutex, RwLock}, time::Duration, }; -use collections::HashMap; +use collections::{HashMap, HashSet}; use engine_traits::KvEngine; +use itertools::Itertools; use kvproto::metapb::Region; use raft::StateRole; use tikv_util::{ @@ -24,29 +25,47 @@ use super::{ ObserverContext, RegionChangeEvent, RegionChangeObserver, Result, RoleChange, RoleObserver, }; -/// `RegionInfoAccessor` is used to collect all regions' information on this TiKV into a collection -/// so that other parts of TiKV can get region information from it. It registers a observer to -/// raftstore, which is named `RegionEventListener`. When the events that we are interested in -/// happen (such as creating and deleting regions), `RegionEventListener` simply sends the events -/// through a channel. -/// In the mean time, `RegionCollector` keeps fetching messages from the channel, and mutates -/// the collection according to the messages. When an accessor method of `RegionInfoAccessor` is -/// called, it also simply sends a message to `RegionCollector`, and the result will be sent -/// back through as soon as it's finished. -/// In fact, the channel mentioned above is actually a `util::worker::Worker`. +/// `RegionInfoAccessor` is used to collect all regions' information on this +/// TiKV into a collection so that other parts of TiKV can get region +/// information from it. It registers a observer to raftstore, which is named +/// `RegionEventListener`. 
When the events that we are interested in happen +/// (such as creating and deleting regions), `RegionEventListener` simply +/// sends the events through a channel. +/// In the mean time, `RegionCollector` keeps fetching messages from the +/// channel, and mutates the collection according to the messages. When an +/// accessor method of `RegionInfoAccessor` is called, it also simply sends a +/// message to `RegionCollector`, and the result will be sent back through as +/// soon as it's finished. In fact, the channel mentioned above is actually a +/// `util::worker::Worker`. /// -/// **Caution**: Note that the information in `RegionInfoAccessor` is not perfectly precise. Some -/// regions may be temporarily absent while merging or splitting is in progress. Also, -/// `RegionInfoAccessor`'s information may slightly lag the actual regions on the TiKV. +/// **Caution**: Note that the information in `RegionInfoAccessor` is not +/// perfectly precise. Some regions may be temporarily absent while merging or +/// splitting is in progress. Also, `RegionInfoAccessor`'s information may +/// slightly lag the actual regions on the TiKV. /// `RaftStoreEvent` Represents events dispatched from raftstore coprocessor. 
#[derive(Debug)] pub enum RaftStoreEvent { - CreateRegion { region: Region, role: StateRole }, - UpdateRegion { region: Region, role: StateRole }, - DestroyRegion { region: Region }, - RoleChange { region: Region, role: StateRole }, - UpdateRegionBuckets { region: Region, buckets: usize }, + CreateRegion { + region: Region, + role: StateRole, + }, + UpdateRegion { + region: Region, + role: StateRole, + }, + DestroyRegion { + region: Region, + }, + RoleChange { + region: Region, + role: StateRole, + initialized: bool, + }, + UpdateRegionBuckets { + region: Region, + buckets: usize, + }, } impl RaftStoreEvent { @@ -81,9 +100,10 @@ impl RegionInfo { type RegionsMap = HashMap; type RegionRangesMap = BTreeMap; -// RangeKey is a wrapper used to unify the comparsion between region start key -// and region end key. Region end key is special as empty stands for the infinite, -// so we need to take special care for cases where the end key is empty. +// RangeKey is a wrapper used to unify the comparison between region start key +// and region end key. Region end key is special as empty stands for the +// infinite, so we need to take special care for cases where the end key is +// empty. #[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] pub enum RangeKey { Finite(Vec), @@ -107,8 +127,8 @@ impl RangeKey { pub type Callback = Box; pub type SeekRegionCallback = Box) + Send>; -/// `RegionInfoAccessor` has its own thread. Queries and updates are done by sending commands to the -/// thread. +/// `RegionInfoAccessor` has its own thread. Queries and updates are done by +/// sending commands to the thread. pub enum RegionInfoQuery { RaftStoreEvent(RaftStoreEvent), SeekRegion { @@ -151,8 +171,8 @@ impl Display for RegionInfoQuery { } } -/// `RegionEventListener` implements observer traits. It simply send the events that we are interested in -/// through the `scheduler`. +/// `RegionEventListener` implements observer traits. 
It simply send the events +/// that we are interested in through the `scheduler`. #[derive(Clone)] struct RegionEventListener { scheduler: Scheduler, @@ -186,7 +206,11 @@ impl RoleObserver for RegionEventListener { fn on_role_change(&self, context: &mut ObserverContext<'_>, role_change: &RoleChange) { let region = context.region().clone(); let role = role_change.state; - let event = RaftStoreEvent::RoleChange { region, role }; + let event = RaftStoreEvent::RoleChange { + region, + role, + initialized: role_change.initialized, + }; self.scheduler .schedule(RegionInfoQuery::RaftStoreEvent(event)) .unwrap(); @@ -206,19 +230,23 @@ fn register_region_event_listener( .register_region_change_observer(1, BoxRegionChangeObserver::new(listener)); } -/// `RegionCollector` is the place where we hold all region information we collected, and the -/// underlying runner of `RegionInfoAccessor`. It listens on events sent by the `RegionEventListener` and -/// keeps information of all regions. Role of each region are also tracked. +/// `RegionCollector` is the place where we hold all region information we +/// collected, and the underlying runner of `RegionInfoAccessor`. It listens on +/// events sent by the `RegionEventListener` and keeps information of all +/// regions. Role of each region are also tracked. pub struct RegionCollector { // HashMap: region_id -> (Region, State) regions: RegionsMap, // BTreeMap: data_end_key -> region_id region_ranges: RegionRangesMap, + + region_leaders: Arc>>, } impl RegionCollector { - pub fn new() -> Self { + pub fn new(region_leaders: Arc>>) -> Self { Self { + region_leaders, regions: HashMap::default(), region_ranges: BTreeMap::default(), } @@ -277,9 +305,10 @@ impl RegionCollector { } fn handle_create_region(&mut self, region: Region, role: StateRole) { - // During tests, we found that the `Create` event may arrive multiple times. And when we - // receive an `Update` message, the region may have been deleted for some reason. 
So we - // handle it according to whether the region exists in the collection. + // During tests, we found that the `Create` event may arrive multiple times. And + // when we receive an `Update` message, the region may have been deleted for + // some reason. So we handle it according to whether the region exists in the + // collection. if self.regions.contains_key(®ion.get_id()) { info!( "trying to create region but it already exists, try to update it"; @@ -324,18 +353,28 @@ impl RegionCollector { let removed_id = self.region_ranges.remove(&end_key).unwrap(); assert_eq!(removed_id, region.get_id()); } else { - // It's possible that the region is already removed because it's end_key is used by - // another newer region. + // It's possible that the region is already removed because it's end_key is used + // by another newer region. debug!( "destroying region but it doesn't exist"; "region_id" => region.get_id(), ) } + self.region_leaders + .write() + .unwrap() + .remove(®ion.get_id()); } fn handle_role_change(&mut self, region: Region, new_role: StateRole) { let region_id = region.get_id(); + if new_role == StateRole::Leader { + self.region_leaders.write().unwrap().insert(region_id); + } else { + self.region_leaders.write().unwrap().remove(®ion_id); + } + if let Some(r) = self.regions.get_mut(®ion_id) { r.role = new_role; return; @@ -348,29 +387,33 @@ impl RegionCollector { self.create_region(region, new_role); } - /// Determines whether `region_to_check`'s epoch is stale compared to `current`'s epoch + /// Determines whether `region_to_check`'s epoch is stale compared to + /// `current`'s epoch #[inline] fn is_region_epoch_stale(&self, region_to_check: &Region, current: &Region) -> bool { let epoch = region_to_check.get_region_epoch(); let current_epoch = current.get_region_epoch(); // Only compare conf_ver when they have the same version. - // When a region A merges region B, region B may have a greater conf_ver. 
Then, the new - // merged region meta has larger version but smaller conf_ver than the original B's. In this - // case, the incoming region meta has a smaller conf_ver but is not stale. + // When a region A merges region B, region B may have a greater conf_ver. Then, + // the new merged region meta has larger version but smaller conf_ver than the + // original B's. In this case, the incoming region meta has a smaller conf_ver + // but is not stale. epoch.get_version() < current_epoch.get_version() || (epoch.get_version() == current_epoch.get_version() && epoch.get_conf_ver() < current_epoch.get_conf_ver()) } - /// For all regions whose range overlaps with the given `region` or region_id is the same as - /// `region`'s, checks whether the given `region`'s epoch is not older than theirs. + /// For all regions whose range overlaps with the given `region` or + /// region_id is the same as `region`'s, checks whether the given + /// `region`'s epoch is not older than theirs. /// - /// Returns false if the given `region` is stale, which means, at least one region above has - /// newer epoch. - /// If the given `region` is not stale, all other regions in the collection that overlaps with - /// the given `region` must be stale. Returns true in this case, and if `clear_regions_in_range` - /// is true, those out-of-date regions will be removed from the collection. + /// Returns false if the given `region` is stale, which means, at least one + /// region above has newer epoch. + /// If the given `region` is not stale, all other regions in the collection + /// that overlaps with the given `region` must be stale. Returns true in + /// this case, and if `clear_regions_in_range` is true, those out-of-date + /// regions will be removed from the collection. 
fn check_region_range(&mut self, region: &Region, clear_regions_in_range: bool) -> bool { if let Some(region_with_same_id) = self.regions.get(®ion.get_id()) { if self.is_region_epoch_stale(region, ®ion_with_same_id.region) { @@ -402,7 +445,10 @@ impl RegionCollector { // They are impossible to equal, or they cannot overlap. assert_ne!( region.get_region_epoch().get_version(), - current_region.get_region_epoch().get_version() + current_region.get_region_epoch().get_version(), + "{:?} vs {:?}", + region, + current_region, ); // Remove it since it's a out-of-date region info. if clear_regions_in_range { @@ -458,14 +504,18 @@ impl RegionCollector { let region = event.get_region(); if region.get_region_epoch().get_version() == 0 { // Ignore messages with version 0. - // In raftstore `Peer::replicate`, the region meta's fields are all initialized with - // default value except region_id. So if there is more than one region replicating - // when the TiKV just starts, the assertion "Any two region with different ids and - // overlapping ranges must have different version" fails. + // In raftstore `Peer::replicate`, the region meta's fields are all initialized + // with default value except region_id. So if there is more than one region + // replicating when the TiKV just starts, the assertion "Any two region with + // different ids and overlapping ranges must have different version" fails. // // Since 0 is actually an invalid value of version, we can simply ignore the - // messages with version 0. The region will be created later when the region's epoch - // is properly set and an Update message was sent. + // messages with version 0. The region will be created later when the region's + // epoch is properly set and an Update message was sent. + return; + } + if let RaftStoreEvent::RoleChange { initialized, .. } = &event && !initialized { + // Ignore uninitialized peers. 
return; } if !self.check_region_range(region, true) { @@ -487,7 +537,7 @@ impl RegionCollector { RaftStoreEvent::DestroyRegion { region } => { self.handle_destroy_region(region); } - RaftStoreEvent::RoleChange { region, role } => { + RaftStoreEvent::RoleChange { region, role, .. } => { self.handle_role_change(region, role); } RaftStoreEvent::UpdateRegionBuckets { region, buckets } => { @@ -497,12 +547,6 @@ impl RegionCollector { } } -impl Default for RegionCollector { - fn default() -> Self { - Self::new() - } -} - impl Runnable for RegionCollector { type Task = RegionInfoQuery; @@ -564,7 +608,8 @@ impl RunnableWithTimer for RegionCollector { } } -/// `RegionInfoAccessor` keeps all region information separately from raftstore itself. +/// `RegionInfoAccessor` keeps all region information separately from raftstore +/// itself. #[derive(Clone)] pub struct RegionInfoAccessor { // We use a dedicated worker for region info accessor. If we later want to share a worker with @@ -574,18 +619,37 @@ pub struct RegionInfoAccessor { // https://github.com/tikv/tikv/issues/9044 worker: Worker, scheduler: Scheduler, + + /// Region leader ids set on the store. + /// + /// Others can access this info directly, such as RaftKV. + region_leaders: Arc>>, } impl RegionInfoAccessor { /// Creates a new `RegionInfoAccessor` and register to `host`. - /// `RegionInfoAccessor` doesn't need, and should not be created more than once. If it's needed - /// in different places, just clone it, and their contents are shared. + /// `RegionInfoAccessor` doesn't need, and should not be created more than + /// once. If it's needed in different places, just clone it, and their + /// contents are shared. 
pub fn new(host: &mut CoprocessorHost) -> Self { + let region_leaders = Arc::new(RwLock::new(HashSet::default())); let worker = WorkerBuilder::new("region-collector-worker").create(); - let scheduler = worker.start_with_timer("region-collector-worker", RegionCollector::new()); + let scheduler = worker.start_with_timer( + "region-collector-worker", + RegionCollector::new(region_leaders.clone()), + ); register_region_event_listener(host, scheduler.clone()); - Self { worker, scheduler } + Self { + worker, + scheduler, + region_leaders, + } + } + + /// Get a set of region leader ids. + pub fn region_leaders(&self) -> Arc>> { + self.region_leaders.clone() } /// Stops the `RegionInfoAccessor`. It should be stopped after raftstore. @@ -605,8 +669,8 @@ impl RegionInfoAccessor { } pub trait RegionInfoProvider: Send + Sync { - /// Get a iterator of regions that contains `from` or have keys larger than `from`, and invoke - /// the callback to process the result. + /// Get a iterator of regions that contains `from` or have keys larger than + /// `from`, and invoke the callback to process the result. 
fn seek_region(&self, _from: &[u8], _callback: SeekRegionCallback) -> Result<()> { unimplemented!() } @@ -619,6 +683,10 @@ pub trait RegionInfoProvider: Send + Sync { unimplemented!() } + fn find_region_by_key(&self, _key: &[u8]) -> Result { + unimplemented!() + } + fn get_regions_in_range(&self, _start_key: &[u8], _end_key: &[u8]) -> Result> { unimplemented!() } @@ -649,6 +717,27 @@ impl RegionInfoProvider for RegionInfoAccessor { .map_err(|e| box_err!("failed to send request to region collector: {:?}", e)) } + fn find_region_by_key(&self, key: &[u8]) -> Result { + let key_in_vec = key.to_vec(); + let (tx, rx) = mpsc::channel(); + self.seek_region( + key, + Box::new(move |iter| { + if let Some(info) = iter.next() && info.region.get_start_key() <= key_in_vec.as_slice() { + if let Err(e) = tx.send(info.region.clone()) { + warn!("failed to send find_region_by_key result: {:?}", e); + } + } + }), + )?; + rx.recv().map_err(|e| { + box_err!( + "failed to receive find_region_by_key result from region collector: {:?}", + e + ) + }) + } + fn get_regions_in_range(&self, start_key: &[u8], end_key: &[u8]) -> Result> { let (tx, rx) = mpsc::channel(); let msg = RegionInfoQuery::GetRegionsInRange { @@ -675,30 +764,93 @@ impl RegionInfoProvider for RegionInfoAccessor { } // Use in tests only. 
-pub struct MockRegionInfoProvider(Mutex>); +// Note: The `StateRole` in RegionInfo here should not be used +pub struct MockRegionInfoProvider(Mutex>); impl MockRegionInfoProvider { pub fn new(regions: Vec) -> Self { - MockRegionInfoProvider(Mutex::new(regions)) + MockRegionInfoProvider(Mutex::new( + regions + .into_iter() + .map(|region| RegionInfo::new(region, StateRole::Leader)) + .collect_vec(), + )) } } impl Clone for MockRegionInfoProvider { fn clone(&self) -> Self { - MockRegionInfoProvider::new(self.0.lock().unwrap().clone()) + MockRegionInfoProvider::new( + self.0 + .lock() + .unwrap() + .iter() + .map(|region_info| region_info.region.clone()) + .collect_vec(), + ) } } impl RegionInfoProvider for MockRegionInfoProvider { - fn get_regions_in_range(&self, _start_key: &[u8], _end_key: &[u8]) -> Result> { - Ok(self.0.lock().unwrap().clone()) + fn get_regions_in_range(&self, start_key: &[u8], end_key: &[u8]) -> Result> { + let mut regions = Vec::new(); + let (tx, rx) = mpsc::channel(); + let end_key = RangeKey::from_end_key(end_key.to_vec()); + + self.seek_region( + start_key, + Box::new(move |iter| { + for region_info in iter { + if RangeKey::from_start_key(region_info.region.get_start_key().to_vec()) + > end_key + { + continue; + } + tx.send(region_info.region.clone()).unwrap(); + } + }), + )?; + + for region in rx { + regions.push(region); + } + Ok(regions) + } + + fn seek_region(&self, from: &[u8], callback: SeekRegionCallback) -> Result<()> { + let region_infos = self.0.lock().unwrap(); + let mut iter = region_infos.iter().filter(|®ion_info| { + RangeKey::from_end_key(region_info.region.get_end_key().to_vec()) + > RangeKey::from_start_key(from.to_vec()) + }); + callback(&mut iter); + Ok(()) + } + + fn find_region_by_key(&self, key: &[u8]) -> Result { + let region_infos = self.0.lock().unwrap(); + let key = RangeKey::from_start_key(key.to_vec()); + region_infos + .iter() + .find(|region_info| { + 
RangeKey::from_start_key(region_info.region.get_start_key().to_vec()) <= key + && key < RangeKey::from_end_key(region_info.region.get_end_key().to_vec()) + }) + .map(|region_info| region_info.region.clone()) + .ok_or(box_err!("Not found region containing {:?}", key)) } } #[cfg(test)] mod tests { + use txn_types::Key; + use super::*; + fn new_region_collector() -> RegionCollector { + RegionCollector::new(Arc::new(RwLock::new(HashSet::default()))) + } + fn new_region(id: u64, start_key: &[u8], end_key: &[u8], version: u64) -> Region { let mut region = Region::default(); region.set_id(id); @@ -762,7 +914,8 @@ mod tests { } } - /// Adds a set of regions to an empty collection and check if it's successfully loaded. + /// Adds a set of regions to an empty collection and check if it's + /// successfully loaded. fn must_load_regions(c: &mut RegionCollector, regions: &[Region]) { assert!(c.regions.is_empty()); assert!(c.region_ranges.is_empty()); @@ -819,8 +972,9 @@ mod tests { .get_version(); assert!(region.get_region_epoch().get_version() < version); } - // If end_key is updated and the region_id corresponding to the `old_end_key` doesn't equals - // to `region_id`, it shouldn't be removed since it was used by another region. + // If end_key is updated and the region_id corresponding to the `old_end_key` + // doesn't equals to `region_id`, it shouldn't be removed since it was + // used by another region. if let Some(old_end_key) = old_end_key { if old_end_key.as_slice() != region.get_end_key() { assert!( @@ -849,8 +1003,8 @@ mod tests { c.handle_raftstore_event(RaftStoreEvent::DestroyRegion { region }); assert!(c.regions.get(&id).is_none()); - // If the region_id corresponding to the end_key doesn't equals to `id`, it shouldn't be - // removed since it was used by another region. + // If the region_id corresponding to the end_key doesn't equals to `id`, it + // shouldn't be removed since it was used by another region. 
if let Some(end_key) = end_key { assert!( c.region_ranges @@ -860,10 +1014,16 @@ mod tests { } } - fn must_change_role(c: &mut RegionCollector, region: &Region, role: StateRole) { + fn must_change_role( + c: &mut RegionCollector, + region: &Region, + role: StateRole, + initialized: bool, + ) { c.handle_raftstore_event(RaftStoreEvent::RoleChange { region: region.clone(), role, + initialized, }); if let Some(r) = c.regions.get(®ion.get_id()) { @@ -896,7 +1056,7 @@ mod tests { #[test] fn test_ignore_invalid_version() { - let mut c = RegionCollector::new(); + let mut c = new_region_collector(); c.handle_raftstore_event(RaftStoreEvent::CreateRegion { region: new_region(1, b"k1", b"k3", 0), @@ -909,6 +1069,12 @@ mod tests { c.handle_raftstore_event(RaftStoreEvent::RoleChange { region: new_region(1, b"k1", b"k2", 0), role: StateRole::Leader, + initialized: true, + }); + c.handle_raftstore_event(RaftStoreEvent::RoleChange { + region: new_region(1, b"", b"", 3), + role: StateRole::Leader, + initialized: false, }); check_collection(&c, &[]); @@ -925,7 +1091,7 @@ mod tests { region_with_conf(6, b"k7", b"", 20, 10), ]; - let mut c = RegionCollector::new(); + let mut c = new_region_collector(); must_load_regions(&mut c, regions); assert!(c.check_region_range(®ion_with_conf(1, b"", b"k1", 10, 10), false)); @@ -988,7 +1154,7 @@ mod tests { new_region(6, b"k7", b"", 1), ]; - let mut c = RegionCollector::new(); + let mut c = new_region_collector(); must_load_regions(&mut c, &init_regions); let mut regions: Vec<_> = init_regions .iter() @@ -1019,7 +1185,7 @@ mod tests { check_collection(&c, &[]); // Test that the region with the same id will be kept in the collection - c = RegionCollector::new(); + c = new_region_collector(); must_load_regions(&mut c, &init_regions); c.check_region_range(&new_region(3, b"k1", b"k7", 2), true); @@ -1038,7 +1204,7 @@ mod tests { #[test] fn test_basic_updating() { - let mut c = RegionCollector::new(); + let mut c = new_region_collector(); let 
init_regions = &[ new_region(1, b"", b"k1", 1), new_region(2, b"k1", b"k9", 1), @@ -1070,9 +1236,15 @@ mod tests { &mut c, &new_region(1, b"k0", b"k1", 2), StateRole::Candidate, + true, ); must_create_region(&mut c, &new_region(5, b"k99", b"", 2), StateRole::Follower); - must_change_role(&mut c, &new_region(2, b"k2", b"k8", 2), StateRole::Leader); + must_change_role( + &mut c, + &new_region(2, b"k2", b"k8", 2), + StateRole::Leader, + true, + ); must_update_region(&mut c, &new_region(2, b"k3", b"k7", 3), StateRole::Leader); // test region buckets update must_update_region_buckets(&mut c, &new_region(2, b"k3", b"k7", 3), 4); @@ -1100,12 +1272,13 @@ mod tests { ); } - /// Simulates splitting a region into 3 regions, and the region with old id will be the - /// `derive_index`-th region of them. The events are triggered in order indicated by `seq`. - /// This is to ensure the collection is correct, no matter what the events' order to happen is. + /// Simulates splitting a region into 3 regions, and the region with old id + /// will be the `derive_index`-th region of them. The events are triggered + /// in order indicated by `seq`. This is to ensure the collection is + /// correct, no matter what the events' order to happen is. /// Values in `seq` and of `derive_index` start from 1. 
fn test_split_impl(derive_index: usize, seq: &[usize]) { - let mut c = RegionCollector::new(); + let mut c = new_region_collector(); let init_regions = &[ new_region(1, b"", b"k1", 1), new_region(2, b"k1", b"k9", 1), @@ -1152,13 +1325,13 @@ mod tests { for index in indices { for order in orders { - test_split_impl(*index, *order); + test_split_impl(*index, order.as_slice()); } } } fn test_merge_impl(to_left: bool, update_first: bool) { - let mut c = RegionCollector::new(); + let mut c = new_region_collector(); let init_regions = &[ region_with_conf(1, b"", b"k1", 1, 1), region_with_conf(2, b"k1", b"k2", 1, 100), @@ -1202,7 +1375,7 @@ mod tests { #[test] fn test_extreme_cases() { - let mut c = RegionCollector::new(); + let mut c = new_region_collector(); let init_regions = &[ new_region(1, b"", b"k1", 1), new_region(2, b"k1", b"k9", 1), @@ -1210,15 +1383,21 @@ mod tests { ]; must_load_regions(&mut c, init_regions); - // While splitting, region 4 created but region 2 still has an `update` event which haven't - // been handled. + // While splitting, region 4 created but region 2 still has an `update` event + // which haven't been handled. must_create_region(&mut c, &new_region(4, b"k5", b"k9", 2), StateRole::Follower); must_update_region(&mut c, &new_region(2, b"k1", b"k9", 1), StateRole::Follower); - must_change_role(&mut c, &new_region(2, b"k1", b"k9", 1), StateRole::Leader); + must_change_role( + &mut c, + &new_region(2, b"k1", b"k9", 1), + StateRole::Leader, + true, + ); must_update_region(&mut c, &new_region(2, b"k1", b"k5", 2), StateRole::Leader); - // TODO: In fact, region 2's role should be follower. However because it's previous state was - // removed while creating updating region 4, it can't be successfully updated. Fortunately - // this case may hardly happen so it can be fixed later. + // TODO: In fact, region 2's role should be follower. 
However because it's + // previous state was removed while creating updating region 4, it can't be + // successfully updated. Fortunately this case may hardly happen so it can be + // fixed later. check_collection( &c, &[ @@ -1229,11 +1408,17 @@ mod tests { ], ); - // While merging, region 2 expanded and covered region 4 (and their end key become the same) - // but region 4 still has an `update` event which haven't been handled. + // While merging, region 2 expanded and covered region 4 (and their end key + // become the same) but region 4 still has an `update` event which haven't been + // handled. must_update_region(&mut c, &new_region(2, b"k1", b"k9", 3), StateRole::Leader); must_update_region(&mut c, &new_region(4, b"k5", b"k9", 2), StateRole::Follower); - must_change_role(&mut c, &new_region(4, b"k5", b"k9", 2), StateRole::Leader); + must_change_role( + &mut c, + &new_region(4, b"k5", b"k9", 2), + StateRole::Leader, + true, + ); must_destroy_region(&mut c, new_region(4, b"k5", b"k9", 2)); check_collection( &c, @@ -1244,4 +1429,63 @@ mod tests { ], ); } + + #[test] + fn test_mock_region_info_provider() { + fn init_region(start_key: &[u8], end_key: &[u8], region_id: u64) -> Region { + let start_key = Key::from_encoded(start_key.to_vec()); + let end_key = Key::from_encoded(end_key.to_vec()); + let mut region = Region::default(); + region.set_start_key(start_key.as_encoded().clone()); + region.set_end_key(end_key.as_encoded().clone()); + region.id = region_id; + region + } + + let regions = vec![ + init_region(b"k01", b"k03", 1), + init_region(b"k05", b"k10", 2), + init_region(b"k10", b"k15", 3), + ]; + + let provider = MockRegionInfoProvider::new(regions); + + // Test ranges covering all regions + let regions = provider.get_regions_in_range(b"k01", b"k15").unwrap(); + assert!(regions.len() == 3); + assert!(regions[0].id == 1); + assert!(regions[1].id == 2); + assert!(regions[2].id == 3); + + // Test ranges covering partial regions + let regions = 
provider.get_regions_in_range(b"k04", b"k10").unwrap(); + assert!(regions.len() == 2); + assert!(regions[0].id == 2); + assert!(regions[1].id == 3); + + // Test seek for all regions + provider + .seek_region( + b"k02", + Box::new(|iter| { + assert!(iter.next().unwrap().region.id == 1); + assert!(iter.next().unwrap().region.id == 2); + assert!(iter.next().unwrap().region.id == 3); + assert!(iter.next().is_none()); + }), + ) + .unwrap(); + + // Test seek for partial regions + provider + .seek_region( + b"k04", + Box::new(|iter| { + assert!(iter.next().unwrap().region.id == 2); + assert!(iter.next().unwrap().region.id == 3); + assert!(iter.next().is_none()); + }), + ) + .unwrap(); + } } diff --git a/components/raftstore/src/coprocessor/split_check/half.rs b/components/raftstore/src/coprocessor/split_check/half.rs index 87ee861c95c..1f4527128d8 100644 --- a/components/raftstore/src/coprocessor/split_check/half.rs +++ b/components/raftstore/src/coprocessor/split_check/half.rs @@ -125,7 +125,7 @@ pub fn get_region_approximate_middle( mod tests { use std::{iter, sync::mpsc}; - use engine_test::ctor::{CFOptions, ColumnFamilyOptions, DBOptions}; + use engine_test::ctor::{CfOptions, DbOptions}; use engine_traits::{MiscExt, SyncMutable, ALL_CFS, CF_DEFAULT, LARGE_CFS}; use kvproto::{ metapb::{Peer, Region}, @@ -140,23 +140,15 @@ mod tests { *, }; use crate::{ - coprocessor::{Config, CoprocessorHost}, - store::{BucketRange, CasualMessage, SplitCheckRunner, SplitCheckTask}, + coprocessor::{dispatcher::SchedTask, Config, CoprocessorHost}, + store::{BucketRange, SplitCheckRunner, SplitCheckTask}, }; #[test] fn test_split_check() { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let cfs_opts = ALL_CFS - .iter() - .map(|cf| { - let cf_opts = ColumnFamilyOptions::new(); - CFOptions::new(cf, cf_opts) - }) - .collect(); - let engine = engine_test::kv::new_engine_opt(path_str, 
db_opts, cfs_opts).unwrap(); + let engine = engine_test::kv::new_engine(path_str, ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); @@ -197,18 +189,11 @@ mod tests { must_split_at(&rx, &region, vec![split_key.into_encoded()]); } - fn test_generate_region_bucket_impl(mvcc: bool) { + #[test] + fn test_split_check_with_key_range() { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let cfs_opts = ALL_CFS - .iter() - .map(|cf| { - let cf_opts = ColumnFamilyOptions::new(); - CFOptions::new(cf, cf_opts) - }) - .collect(); - let engine = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); + let engine = engine_test::kv::new_engine(path_str, ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); @@ -219,13 +204,77 @@ mod tests { let (tx, rx) = mpsc::sync_channel(100); let cfg = Config { region_max_size: Some(ReadableSize(BUCKET_NUMBER_LIMIT as u64)), - enable_region_bucket: true, - region_bucket_size: ReadableSize(20_u64), // so that each key below will form a bucket ..Default::default() }; let mut runnable = SplitCheckRunner::new(engine.clone(), tx.clone(), CoprocessorHost::new(tx, cfg)); + for i in 0..11 { + let k = format!("{:04}", i).into_bytes(); + let k = keys::data_key(Key::from_raw(&k).as_encoded()); + engine.put_cf(CF_DEFAULT, &k, &k).unwrap(); + // Flush for every key so that we can know the exact middle key. 
+ engine.flush_cf(CF_DEFAULT, true).unwrap(); + } + let start_key = Key::from_raw(b"0000").into_encoded(); + let end_key = Key::from_raw(b"0005").into_encoded(); + runnable.run(SplitCheckTask::split_check_key_range( + region.clone(), + Some(start_key), + Some(end_key), + false, + CheckPolicy::Scan, + None, + )); + let split_key = Key::from_raw(b"0003"); + must_split_at(&rx, &region, vec![split_key.into_encoded()]); + let start_key = Key::from_raw(b"0005").into_encoded(); + let end_key = Key::from_raw(b"0010").into_encoded(); + runnable.run(SplitCheckTask::split_check_key_range( + region.clone(), + Some(start_key), + Some(end_key), + false, + CheckPolicy::Scan, + None, + )); + let split_key = Key::from_raw(b"0008"); + must_split_at(&rx, &region, vec![split_key.into_encoded()]); + let start_key = Key::from_raw(b"0003").into_encoded(); + let end_key = Key::from_raw(b"0008").into_encoded(); + runnable.run(SplitCheckTask::split_check_key_range( + region.clone(), + Some(start_key), + Some(end_key), + false, + CheckPolicy::Scan, + None, + )); + let split_key = Key::from_raw(b"0006"); + must_split_at(&rx, &region, vec![split_key.into_encoded()]); + } + + fn test_generate_region_bucket_impl(mvcc: bool) { + let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); + let path_str = path.path().to_str().unwrap(); + let engine = engine_test::kv::new_engine(path_str, ALL_CFS).unwrap(); + + let mut region = Region::default(); + region.set_id(1); + region.mut_peers().push(Peer::default()); + region.mut_region_epoch().set_version(2); + region.mut_region_epoch().set_conf_ver(5); + + let (tx, rx) = mpsc::sync_channel(100); + let cfg = Config { + region_split_size: Some(ReadableSize(130_u64)), + enable_region_bucket: Some(true), + region_bucket_size: ReadableSize(20_u64), // so that each key below will form a bucket + ..Default::default() + }; + let cop_host = CoprocessorHost::new(tx.clone(), cfg); + let mut runnable = SplitCheckRunner::new(engine.clone(), tx, cop_host.clone()); + 
let key_gen = |k: &[u8], i: u64, mvcc: bool| { if !mvcc { keys::data_key(Key::from_raw(k).as_encoded()) @@ -276,6 +325,9 @@ mod tests { Some(vec![bucket_range]), )); + let host = cop_host.new_split_checker_host(&region, &engine, true, CheckPolicy::Scan); + assert_eq!(host.policy(), CheckPolicy::Scan); + must_generate_buckets(&rx, &exp_bucket_keys); // testing split bucket with end key "" @@ -299,6 +351,8 @@ mod tests { CheckPolicy::Scan, Some(vec![bucket_range]), )); + let host = cop_host.new_split_checker_host(&region, &engine, true, CheckPolicy::Scan); + assert_eq!(host.policy(), CheckPolicy::Scan); must_generate_buckets(&rx, &exp_bucket_keys); @@ -327,15 +381,7 @@ mod tests { fn test_generate_region_bucket_with_deleting_data() { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let cfs_opts = ALL_CFS - .iter() - .map(|cf| { - let cf_opts = ColumnFamilyOptions::new(); - CFOptions::new(cf, cf_opts) - }) - .collect(); - let engine = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); + let engine = engine_test::kv::new_engine(path_str, ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); @@ -345,8 +391,8 @@ mod tests { let (tx, rx) = mpsc::sync_channel(100); let cfg = Config { - region_max_size: Some(ReadableSize(BUCKET_NUMBER_LIMIT as u64)), - enable_region_bucket: true, + region_split_size: Some(ReadableSize(130_u64)), + enable_region_bucket: Some(true), region_bucket_size: ReadableSize(20_u64), // so that each key below will form a bucket ..Default::default() }; @@ -405,15 +451,11 @@ mod tests { )); loop { - if let Ok(( - _, - CasualMessage::RefreshRegionBuckets { - region_epoch: _, - buckets, - bucket_ranges, - .. - }, - )) = rx.try_recv() + if let Ok(SchedTask::RefreshRegionBuckets { + buckets, + bucket_ranges, + .. 
+ }) = rx.try_recv() { assert_eq!(buckets.len(), bucket_ranges.unwrap().len()); assert_eq!(buckets.len(), 5); @@ -439,13 +481,10 @@ mod tests { .unwrap(); let path = tmp.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(10); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let engine = engine_test::kv::new_engine_opt(path, db_opts, cfs_opts).unwrap(); let mut big_value = Vec::with_capacity(256); diff --git a/components/raftstore/src/coprocessor/split_check/keys.rs b/components/raftstore/src/coprocessor/split_check/keys.rs index bc9c847225a..2c0e71dd8cb 100644 --- a/components/raftstore/src/coprocessor/split_check/keys.rs +++ b/components/raftstore/src/coprocessor/split_check/keys.rs @@ -1,10 +1,5 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. -use std::{ - marker::PhantomData, - sync::{Arc, Mutex}, -}; - use engine_traits::{KvEngine, Range}; use error_code::ErrorCodeExt; use kvproto::{metapb::Region, pdpb::CheckPolicy}; @@ -19,7 +14,7 @@ use super::{ size::get_approximate_split_keys, Host, }; -use crate::store::{CasualMessage, CasualRouter}; +use crate::coprocessor::dispatcher::StoreHandle; pub struct Checker { max_keys_count: u64, @@ -62,7 +57,8 @@ where if self.current_count > self.split_threshold && !over_limit { self.split_keys.push(keys::origin_key(key.key()).to_vec()); // if for previous on_kv() self.current_count == self.split_threshold, - // the split key would be pushed this time, but the entry for this time should not be ignored. + // the split key would be pushed this time, but the entry for this time should + // not be ignored. 
self.current_count = 1; over_limit = self.split_keys.len() as u64 >= self.batch_split_limit; } @@ -115,29 +111,19 @@ where } #[derive(Clone)] -pub struct KeysCheckObserver { - router: Arc>, - _phantom: PhantomData, +pub struct KeysCheckObserver { + router: C, } -impl, E> KeysCheckObserver -where - E: KvEngine, -{ - pub fn new(router: C) -> KeysCheckObserver { - KeysCheckObserver { - router: Arc::new(Mutex::new(router)), - _phantom: PhantomData, - } +impl KeysCheckObserver { + pub fn new(router: C) -> KeysCheckObserver { + KeysCheckObserver { router } } } -impl Coprocessor for KeysCheckObserver {} +impl Coprocessor for KeysCheckObserver {} -impl + Send, E> SplitCheckObserver for KeysCheckObserver -where - E: KvEngine, -{ +impl SplitCheckObserver for KeysCheckObserver { fn add_checker( &self, ctx: &mut ObserverContext<'_>, @@ -171,23 +157,20 @@ where } }; - let res = CasualMessage::RegionApproximateKeys { keys: region_keys }; - if let Err(e) = self.router.lock().unwrap().send(region_id, res) { - warn!( - "failed to send approximate region keys"; - "region_id" => region_id, - "err" => %e, - "error_code" => %e.error_code(), - ); - } + self.router.update_approximate_keys(region_id, region_keys); REGION_KEYS_HISTOGRAM.observe(region_keys as f64); - if region_keys >= host.cfg.region_max_keys() { + // if bucket checker using scan is added, to utilize the scan, + // add keys checker as well for free + // It has the assumption that the size's checker is before the keys's check in + // the host + let need_split_region = region_keys >= host.cfg.region_max_keys(); + if need_split_region { info!( "approximate keys over threshold, need to do split check"; "region_id" => region.get_id(), "keys" => region_keys, - "threshold" => host.cfg.region_max_keys, + "threshold" => host.cfg.region_max_keys(), ); // Need to check keys. 
host.add_checker(Box::new(Checker::new( @@ -226,7 +209,7 @@ pub fn get_region_approximate_keys( mod tests { use std::{cmp, sync::mpsc, u64}; - use engine_test::ctor::{CFOptions, ColumnFamilyOptions, DBOptions}; + use engine_test::ctor::{CfOptions, DbOptions}; use engine_traits::{KvEngine, MiscExt, SyncMutable, ALL_CFS, CF_DEFAULT, CF_WRITE, LARGE_CFS}; use kvproto::{ metapb::{Peer, Region}, @@ -247,8 +230,8 @@ mod tests { *, }; use crate::{ - coprocessor::{Config, CoprocessorHost}, - store::{CasualMessage, SplitCheckRunner, SplitCheckTask}, + coprocessor::{dispatcher::SchedTask, Config, CoprocessorHost}, + store::{SplitCheckRunner, SplitCheckTask}, }; fn put_data(engine: &impl KvEngine, mut start_idx: u64, end_idx: u64, fill_short_value: bool) { @@ -286,13 +269,7 @@ mod tests { fn test_split_check() { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let cf_opts = ColumnFamilyOptions::new(); - let cfs_opts = ALL_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); - let engine = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); + let engine = engine_test::kv::new_engine(path_str, ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); @@ -323,8 +300,8 @@ mod tests { )); // keys has not reached the max_keys 100 yet. match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) - | Ok((region_id, CasualMessage::RegionApproximateKeys { .. })) => { + Ok(SchedTask::UpdateApproximateSize { region_id, .. }) + | Ok(SchedTask::UpdateApproximateKeys { region_id, .. 
}) => { assert_eq!(region_id, region.get_id()); } others => panic!("expect recv empty, but got {:?}", others), @@ -396,13 +373,7 @@ mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let cf_opts = ColumnFamilyOptions::new(); - let cfs_opts = ALL_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); - let engine = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); + let engine = engine_test::kv::new_engine(path_str, ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); @@ -433,8 +404,8 @@ mod tests { )); // keys has not reached the max_keys 100 yet. match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) - | Ok((region_id, CasualMessage::RegionApproximateKeys { .. })) => { + Ok(SchedTask::UpdateApproximateSize { region_id, .. }) + | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) => { assert_eq!(region_id, region.get_id()); } others => panic!("expect recv empty, but got {:?}", others), @@ -459,13 +430,10 @@ mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(10); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let db = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); let cases = [("a", 1024), ("b", 2048), ("c", 4096)]; @@ -571,13 +539,7 @@ mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let cf_opts = ColumnFamilyOptions::new(); - let cfs_opts = ALL_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); - let engine = 
engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); + let engine = engine_test::kv::new_engine(path_str, ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); @@ -593,7 +555,7 @@ mod tests { region_max_keys: Some(159), region_split_keys: Some(80), batch_split_limit: 5, - enable_region_bucket: true, + enable_region_bucket: Some(true), // need check split region buckets, but region size does not exceed the split threshold region_bucket_size: ReadableSize(100), ..Default::default() @@ -614,8 +576,8 @@ mod tests { )); // keys has not reached the max_keys 100 yet. match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) - | Ok((region_id, CasualMessage::RegionApproximateKeys { .. })) => { + Ok(SchedTask::UpdateApproximateSize { region_id, .. }) + | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) => { assert_eq!(region_id, region.get_id()); } others => panic!("expect recv empty, but got {:?}", others), @@ -625,10 +587,10 @@ mod tests { let region_size = get_region_approximate_size(&engine, &region, ReadableSize::mb(1000).0).unwrap(); // to make the region_max_size < region_split_size + region_size - // The split by keys should still work. But if the bug in on_kv() in size.rs exists, - // it will result in split by keys failed. + // The split by keys should still work. But if the bug in on_kv() in size.rs + // exists, it will result in split by keys failed. 
cfg.region_max_size = Some(ReadableSize(region_size * 6 / 5)); - cfg.region_split_size = ReadableSize(region_size * 4 / 5); + cfg.region_split_size = Some(ReadableSize(region_size * 4 / 5)); runnable = SplitCheckRunner::new(engine, tx.clone(), CoprocessorHost::new(tx, cfg)); runnable.run(SplitCheckTask::split_check( region.clone(), @@ -648,13 +610,10 @@ mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(10); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let db = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); // size >= 4194304 will insert a new point in range properties diff --git a/components/raftstore/src/coprocessor/split_check/mod.rs b/components/raftstore/src/coprocessor/split_check/mod.rs index 9f1cbf17eb1..e92000f2c95 100644 --- a/components/raftstore/src/coprocessor/split_check/mod.rs +++ b/components/raftstore/src/coprocessor/split_check/mod.rs @@ -92,8 +92,8 @@ impl<'a, E> Host<'a, E> { const MIN_BUCKET_COUNT_PER_REGION: u64 = 2; if region_size >= self.cfg.region_bucket_size.0 * MIN_BUCKET_COUNT_PER_REGION { let mut bucket_checker = size::Checker::new( - self.cfg.region_bucket_size.0, /* not used */ - self.cfg.region_bucket_size.0, /* not used */ + self.cfg.region_bucket_size.0, // not used + self.cfg.region_bucket_size.0, // not used region_size / self.cfg.region_bucket_size.0, CheckPolicy::Approximate, ); @@ -120,7 +120,7 @@ impl<'a, E> Host<'a, E> { #[inline] pub fn enable_region_bucket(&self) -> bool { - self.cfg.enable_region_bucket + self.cfg.enable_region_bucket() } #[inline] diff --git a/components/raftstore/src/coprocessor/split_check/size.rs 
b/components/raftstore/src/coprocessor/split_check/size.rs index 59603782f5c..4b320bef1b6 100644 --- a/components/raftstore/src/coprocessor/split_check/size.rs +++ b/components/raftstore/src/coprocessor/split_check/size.rs @@ -1,10 +1,5 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. -use std::{ - marker::PhantomData, - sync::{Arc, Mutex}, -}; - use engine_traits::{KvEngine, Range}; use error_code::ErrorCodeExt; use kvproto::{metapb::Region, pdpb::CheckPolicy}; @@ -17,7 +12,7 @@ use super::{ }, calc_split_keys_count, Host, }; -use crate::store::{CasualMessage, CasualRouter}; +use crate::coprocessor::dispatcher::StoreHandle; pub struct Checker { max_size: u64, @@ -51,13 +46,6 @@ where E: KvEngine, { fn on_kv(&mut self, _: &mut ObserverContext<'_>, entry: &KeyEntry) -> bool { - // If there's no need to check region split, skip it. - // Otherwise, the region whose keys > max region keys will not be splitted when batch_split_limit is zero, - // because eventually "over_limit && self.current_size + self.split_size >= self.max_size" - // will return true. - if self.batch_split_limit == 0 { - return false; - } let size = entry.entry_size() as u64; self.current_size += size; @@ -65,7 +53,8 @@ where if self.current_size > self.split_size && !over_limit { self.split_keys.push(keys::origin_key(entry.key()).to_vec()); // if for previous on_kv() self.current_size == self.split_size, - // the split key would be pushed this time, but the entry size for this time should not be ignored. + // the split key would be pushed this time, but the entry size for this time + // should not be ignored. 
self.current_size = if self.current_size - size == self.split_size { size } else { @@ -122,29 +111,19 @@ where } #[derive(Clone)] -pub struct SizeCheckObserver { - router: Arc>, - _phantom: PhantomData, +pub struct SizeCheckObserver { + router: C, } -impl, E> SizeCheckObserver -where - E: KvEngine, -{ - pub fn new(router: C) -> SizeCheckObserver { - SizeCheckObserver { - router: Arc::new(Mutex::new(router)), - _phantom: PhantomData, - } +impl SizeCheckObserver { + pub fn new(router: C) -> SizeCheckObserver { + SizeCheckObserver { router } } } -impl Coprocessor for SizeCheckObserver {} +impl Coprocessor for SizeCheckObserver {} -impl + Send, E> SplitCheckObserver for SizeCheckObserver -where - E: KvEngine, -{ +impl SplitCheckObserver for SizeCheckObserver { fn add_checker( &self, ctx: &mut ObserverContext<'_>, @@ -170,7 +149,7 @@ where // Need to check size. host.add_checker(Box::new(Checker::new( host.cfg.region_max_size().0, - host.cfg.region_split_size.0, + host.cfg.region_split_size().0, host.cfg.batch_split_limit, policy, ))); @@ -179,28 +158,21 @@ where }; // send it to raftstore to update region approximate size - let res = CasualMessage::RegionApproximateSize { size: region_size }; - if let Err(e) = self.router.lock().unwrap().send(region_id, res) { - warn!( - "failed to send approximate region size"; - "region_id" => region_id, - "err" => %e, - "error_code" => %e.error_code(), - ); - } + self.router.update_approximate_size(region_id, region_size); + let need_bucket_checker = + host.cfg.enable_region_bucket() && region_size >= 2 * host.cfg.region_bucket_size.0; REGION_SIZE_HISTOGRAM.observe(region_size as f64); - if region_size >= host.cfg.region_max_size().0 - || host.cfg.enable_region_bucket && region_size >= 2 * host.cfg.region_bucket_size.0 - { - let batch_split_limit = if region_size >= host.cfg.region_max_size().0 { - host.cfg.batch_split_limit - } else { - // no region split check needed - 0 - }; + + let need_split_region = region_size >= 
host.cfg.region_max_size().0; + if need_split_region || need_bucket_checker { // when it's a large region use approximate way to produce split keys - if region_size >= host.cfg.region_size_threshold_for_approximate.0 { + if need_split_region { + if region_size >= host.cfg.region_size_threshold_for_approximate.0 { + policy = CheckPolicy::Approximate; + } + } else if host.cfg.prefer_approximate_bucket { + // when the check is only for bucket, use approximate anyway policy = CheckPolicy::Approximate; } @@ -210,13 +182,12 @@ where "size" => region_size, "threshold" => host.cfg.region_max_size().0, "policy" => ?policy, - "split_check" => batch_split_limit > 0, ); // Need to check size. host.add_checker(Box::new(Checker::new( host.cfg.region_max_size().0, - host.cfg.region_split_size.0, - batch_split_limit, + host.cfg.region_split_size().0, + host.cfg.batch_split_limit, policy, ))); } else { @@ -262,11 +233,11 @@ pub fn get_approximate_split_keys( #[cfg(test)] pub mod tests { - use std::{iter, sync::mpsc, u64}; + use std::{assert_matches::assert_matches, iter, sync::mpsc, u64}; use collections::HashSet; use engine_test::{ - ctor::{CFOptions, ColumnFamilyOptions, DBOptions}, + ctor::{CfOptions, DbOptions}, kv::KvTestEngine, }; use engine_traits::{ @@ -282,30 +253,31 @@ pub mod tests { use super::{Checker, *}; use crate::{ - coprocessor::{Config, CoprocessorHost, ObserverContext, SplitChecker}, - store::{BucketRange, CasualMessage, KeyEntry, SplitCheckRunner, SplitCheckTask}, + coprocessor::{ + dispatcher::SchedTask, Config, CoprocessorHost, ObserverContext, SplitChecker, + }, + store::{BucketRange, KeyEntry, SplitCheckRunner, SplitCheckTask}, }; fn must_split_at_impl( - rx: &mpsc::Receiver<(u64, CasualMessage)>, + rx: &mpsc::Receiver, exp_region: &Region, exp_split_keys: Vec>, ignore_split_keys: bool, ) { loop { match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) - | Ok((region_id, CasualMessage::RegionApproximateKeys { .. 
})) => { + Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) + | Ok(SchedTask::UpdateApproximateSize { region_id, .. }) + | Ok(SchedTask::RefreshRegionBuckets { region_id, .. }) => { assert_eq!(region_id, exp_region.get_id()); } - Ok(( + Ok(SchedTask::AskSplit { region_id, - CasualMessage::SplitRegion { - region_epoch, - split_keys, - .. - }, - )) => { + region_epoch, + split_keys, + .. + }) => { assert_eq!(region_id, exp_region.get_id()); assert_eq!(&region_epoch, exp_region.get_region_epoch()); if !ignore_split_keys { @@ -313,14 +285,13 @@ pub mod tests { } break; } - Ok((_region_id, CasualMessage::RefreshRegionBuckets { .. })) => {} others => panic!("expect split check result, but got {:?}", others), } } } pub fn must_split_at( - rx: &mpsc::Receiver<(u64, CasualMessage)>, + rx: &mpsc::Receiver, exp_region: &Region, exp_split_keys: Vec>, ) { @@ -328,50 +299,36 @@ pub mod tests { } pub fn must_split_with( - rx: &mpsc::Receiver<(u64, CasualMessage)>, + rx: &mpsc::Receiver, exp_region: &Region, exp_split_keys_count: usize, ) { loop { match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) - | Ok((region_id, CasualMessage::RegionApproximateKeys { .. })) => { + Ok(SchedTask::UpdateApproximateSize { region_id, .. }) + | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) + | Ok(SchedTask::RefreshRegionBuckets { region_id, .. }) => { assert_eq!(region_id, exp_region.get_id()); } - Ok(( + Ok(SchedTask::AskSplit { region_id, - CasualMessage::SplitRegion { - region_epoch, - split_keys, - .. - }, - )) => { + region_epoch, + split_keys, + .. + }) => { assert_eq!(region_id, exp_region.get_id()); assert_eq!(&region_epoch, exp_region.get_region_epoch()); assert_eq!(split_keys.len(), exp_split_keys_count); break; } - Ok((_region_id, CasualMessage::RefreshRegionBuckets { .. 
})) => {} others => panic!("expect split check result, but got {:?}", others), } } } - pub fn must_generate_buckets( - rx: &mpsc::Receiver<(u64, CasualMessage)>, - exp_buckets_keys: &[Vec], - ) { + pub fn must_generate_buckets(rx: &mpsc::Receiver, exp_buckets_keys: &[Vec]) { loop { - if let Ok(( - _, - CasualMessage::RefreshRegionBuckets { - region_epoch: _, - mut buckets, - bucket_ranges: _, - .. - }, - )) = rx.try_recv() - { + if let Ok(SchedTask::RefreshRegionBuckets { mut buckets, .. }) = rx.try_recv() { let mut i = 0; if !exp_buckets_keys.is_empty() { let bucket = buckets.pop().unwrap(); @@ -389,23 +346,14 @@ pub mod tests { } pub fn must_generate_buckets_approximate( - rx: &mpsc::Receiver<(u64, CasualMessage)>, + rx: &mpsc::Receiver, bucket_range: Option, min_leap: i32, max_leap: i32, mvcc: bool, ) { loop { - if let Ok(( - _, - CasualMessage::RefreshRegionBuckets { - region_epoch: _, - mut buckets, - bucket_ranges: _, - .. - }, - )) = rx.try_recv() - { + if let Ok(SchedTask::RefreshRegionBuckets { mut buckets, .. 
}) = rx.try_recv() { let bucket_keys = buckets.pop().unwrap().keys; if let Some(bucket_range) = bucket_range { assert!(!bucket_keys.is_empty()); @@ -444,18 +392,18 @@ pub mod tests { fn test_split_check_impl(cfs_with_range_prop: &[CfName], data_cf: CfName) { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); + let db_opts = DbOptions::default(); let cfs_with_range_prop: HashSet<_> = cfs_with_range_prop.iter().cloned().collect(); - let mut cf_opt = ColumnFamilyOptions::new(); + let mut cf_opt = CfOptions::new(); cf_opt.set_no_range_properties(true); let cfs_opts = ALL_CFS .iter() .map(|cf| { if cfs_with_range_prop.contains(cf) { - CFOptions::new(cf, ColumnFamilyOptions::new()) + (*cf, CfOptions::new()) } else { - CFOptions::new(cf, cf_opt.clone()) + (*cf, cf_opt.clone()) } }) .collect(); @@ -472,7 +420,7 @@ pub mod tests { let (tx, rx) = mpsc::sync_channel(100); let cfg = Config { region_max_size: Some(ReadableSize(100)), - region_split_size: ReadableSize(60), + region_split_size: Some(ReadableSize(60)), region_max_keys: Some(1000000), region_split_keys: Some(1000000), batch_split_limit: 5, @@ -495,12 +443,7 @@ pub mod tests { None, )); // size has not reached the max_size 100 yet. - match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) => { - assert_eq!(region_id, region.get_id()); - } - others => panic!("expect recv empty, but got {:?}", others), - } + assert_matches!(rx.try_recv(), Ok(SchedTask::UpdateApproximateSize { region_id, .. 
}) if region_id == region.get_id()); for i in 7..11 { let s = keys::data_key(format!("{:04}", i).as_bytes()); @@ -571,9 +514,9 @@ pub mod tests { fn test_generate_bucket_impl(cfs_with_range_prop: &[CfName], data_cf: CfName, mvcc: bool) { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); + let db_opts = DbOptions::default(); let cfs_with_range_prop: HashSet<_> = cfs_with_range_prop.iter().cloned().collect(); - let mut cf_opt = ColumnFamilyOptions::new(); + let mut cf_opt = CfOptions::new(); cf_opt.set_no_range_properties(true); cf_opt.set_disable_auto_compactions(true); @@ -581,11 +524,11 @@ pub mod tests { .iter() .map(|cf| { if cfs_with_range_prop.contains(cf) { - let mut opt = ColumnFamilyOptions::new(); + let mut opt = CfOptions::new(); opt.set_disable_auto_compactions(true); - CFOptions::new(cf, opt) + (*cf, opt) } else { - CFOptions::new(cf, cf_opt.clone()) + (*cf, cf_opt.clone()) } }) .collect(); @@ -602,11 +545,11 @@ pub mod tests { let (tx, rx) = mpsc::sync_channel(100); let cfg = Config { region_max_size: Some(ReadableSize(50000)), - region_split_size: ReadableSize(50000), + region_split_size: Some(ReadableSize(50000)), region_max_keys: Some(1000000), region_split_keys: Some(1000000), batch_split_limit: 5, - enable_region_bucket: true, + enable_region_bucket: Some(true), region_bucket_size: ReadableSize(3000), region_size_threshold_for_approximate: ReadableSize(50000), ..Default::default() @@ -619,11 +562,12 @@ pub mod tests { keys::data_key(Key::from_raw(bytes).append_ts(ts).as_encoded()) } }; - let mut runnable = - SplitCheckRunner::new(engine.clone(), tx.clone(), CoprocessorHost::new(tx, cfg)); - for i in 0..2000 { - // if not mvcc, kv size is (6+1)*2 = 14, given bucket size is 3000, expect each bucket has about 210 keys - // if mvcc, kv size is about 18*2 = 36, expect each bucket has about 80 keys + let cop_host = CoprocessorHost::new(tx.clone(), 
cfg); + let mut runnable = SplitCheckRunner::new(engine.clone(), tx, cop_host.clone()); + for i in 0..1000 { + // if not mvcc, kv size is (6+1)*2 = 14, given bucket size is 3000, expect each + // bucket has about 210 keys if mvcc, kv size is about 18*2 = 36, expect each + // bucket has about 80 keys let s = key_gen(format!("{:04}00", i).as_bytes(), mvcc, i.into()); engine.put_cf(data_cf, &s, &s).unwrap(); if i % 10 == 0 && i > 0 { @@ -638,6 +582,9 @@ pub mod tests { None, )); + let host = cop_host.new_split_checker_host(®ion, &engine, true, CheckPolicy::Scan); + assert_eq!(host.policy(), CheckPolicy::Approximate); + if !mvcc { must_generate_buckets_approximate(&rx, None, 15000, 45000, mvcc); } else { @@ -648,9 +595,10 @@ pub mod tests { let end = format!("{:04}", 20).into_bytes(); // insert keys into 0000 ~ 0020 with 000000 ~ 002000 - for i in 0..2000 { - // kv size is (6+1)*2 = 14, given bucket size is 3000, expect each bucket has about 210 keys - // if mvcc, kv size is about 18*2 = 36, expect each bucket has about 80 keys + for i in 0..1000 { + // kv size is (6+1)*2 = 14, given bucket size is 3000, expect each bucket has + // about 210 keys if mvcc, kv size is about 18*2 = 36, expect each bucket has + // about 80 keys let s = key_gen(format!("{:06}", i).as_bytes(), mvcc, i.into()); engine.put_cf(data_cf, &s, &s).unwrap(); if i % 10 == 0 { @@ -664,11 +612,13 @@ pub mod tests { CheckPolicy::Approximate, Some(vec![BucketRange(start.clone(), end.clone())]), )); + let host = cop_host.new_split_checker_host(®ion, &engine, true, CheckPolicy::Scan); + assert_eq!(host.policy(), CheckPolicy::Approximate); if !mvcc { - must_generate_buckets_approximate(&rx, Some(BucketRange(start, end)), 150, 450, mvcc); + must_generate_buckets_approximate(&rx, Some(BucketRange(start, end)), 75, 225, mvcc); } else { - must_generate_buckets_approximate(&rx, Some(BucketRange(start, end)), 70, 150, mvcc); + must_generate_buckets_approximate(&rx, Some(BucketRange(start, end)), 35, 85, mvcc); } 
drop(rx); } @@ -684,33 +634,91 @@ pub mod tests { #[test] fn test_generate_bucket_by_approximate() { - for cf in LARGE_CFS { - test_generate_bucket_impl(LARGE_CFS, cf, false); - } + test_generate_bucket_impl(LARGE_CFS, CF_WRITE, false); } #[test] fn test_generate_bucket_mvcc_by_approximate() { - for cf in LARGE_CFS { - test_generate_bucket_impl(LARGE_CFS, cf, true); + test_generate_bucket_impl(LARGE_CFS, CF_DEFAULT, true); + } + + #[test] + fn test_check_policy_for_bucket_generation() { + let path = Builder::new() + .prefix("test_check_policy_for_bucket_generation") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + let db_opts = DbOptions::default(); + let cfs_with_range_prop: HashSet<_> = LARGE_CFS.iter().cloned().collect(); + let mut cf_opt = CfOptions::new(); + cf_opt.set_no_range_properties(true); + cf_opt.set_disable_auto_compactions(true); + + let cfs_opts = ALL_CFS + .iter() + .map(|cf| { + if cfs_with_range_prop.contains(cf) { + let mut opt = CfOptions::new(); + opt.set_disable_auto_compactions(true); + (*cf, opt) + } else { + (*cf, cf_opt.clone()) + } + }) + .collect(); + let engine = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); + let (tx, _rx) = mpsc::sync_channel(100); + let mut cfg = Config { + region_max_size: Some(ReadableSize(50000)), + region_split_size: Some(ReadableSize(50000)), + region_max_keys: Some(1000000), + region_split_keys: Some(1000000), + batch_split_limit: 5, + enable_region_bucket: Some(true), + region_bucket_size: ReadableSize(1), // minimal bucket size + region_size_threshold_for_approximate: ReadableSize(500000000), + // follow split region's check policy, not force to use approximate + prefer_approximate_bucket: false, + ..Default::default() + }; + let mut region = Region::default(); + region.set_id(1); + region.set_start_key(vec![]); + region.set_end_key(vec![]); + region.mut_peers().push(Peer::default()); + region.mut_region_epoch().set_version(2); + 
region.mut_region_epoch().set_conf_ver(5); + for i in 0..20 { + let s = keys::data_key(format!("{:04}00", i).as_bytes()); + engine.put_cf(CF_WRITE, &s, &s).unwrap(); } + + let cop_host = CoprocessorHost::new(tx.clone(), cfg.clone()); + let host = cop_host.new_split_checker_host(®ion, &engine, true, CheckPolicy::Scan); + assert_eq!(host.policy(), CheckPolicy::Scan); + + cfg.prefer_approximate_bucket = true; + let cop_host = CoprocessorHost::new(tx, cfg); + let host = cop_host.new_split_checker_host(®ion, &engine, true, CheckPolicy::Scan); + assert_eq!(host.policy(), CheckPolicy::Approximate); } #[test] fn test_cf_lock_without_range_prop() { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opt = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opt = CfOptions::new(); cf_opt.set_no_range_properties(true); let cfs_opts = ALL_CFS .iter() .map(|cf| { if cf != &CF_LOCK { - CFOptions::new(cf, ColumnFamilyOptions::new()) + (*cf, CfOptions::new()) } else { - CFOptions::new(cf, cf_opt.clone()) + (*cf, cf_opt.clone()) } }) .collect(); @@ -728,7 +736,7 @@ pub mod tests { let (tx, rx) = mpsc::sync_channel(100); let cfg = Config { region_max_size: Some(ReadableSize(100)), - region_split_size: ReadableSize(60), + region_split_size: Some(ReadableSize(60)), region_max_keys: Some(1000000), region_split_keys: Some(1000000), batch_split_limit: 5, @@ -767,13 +775,13 @@ pub mod tests { let cfs_opts = ALL_CFS .iter() .map(|cf| { - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = CfOptions::new(); cf_opts.set_no_range_properties(true); - CFOptions::new(cf, cf_opts) + (*cf, cf_opts) }) .collect(); let engine = - engine_test::kv::new_engine_opt(path_str, DBOptions::default(), cfs_opts).unwrap(); + engine_test::kv::new_engine_opt(path_str, DbOptions::default(), cfs_opts).unwrap(); let mut runnable = 
SplitCheckRunner::new(engine.clone(), tx.clone(), CoprocessorHost::new(tx, cfg)); @@ -846,15 +854,12 @@ pub mod tests { .unwrap(); let path = tmp.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(10); cf_opts.set_no_range_properties(true); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let engine = engine_test::kv::new_engine_opt(path, db_opts, cfs_opts).unwrap(); let region = make_region(1, vec![], vec![]); @@ -884,13 +889,10 @@ pub mod tests { .unwrap(); let path = tmp.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(10); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let engine = engine_test::kv::new_engine_opt(path, db_opts, cfs_opts).unwrap(); let mut big_value = Vec::with_capacity(256); @@ -988,7 +990,7 @@ pub mod tests { #[test] fn test_get_approximate_split_keys() { for cf in LARGE_CFS { - test_get_approximate_split_keys_impl(*cf); + test_get_approximate_split_keys_impl(cf); } } @@ -999,13 +1001,10 @@ pub mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(10); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); + let cfs_opts = 
LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let db = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); let cases = [("a", 1024), ("b", 2048), ("c", 4096)]; @@ -1032,13 +1031,10 @@ pub mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_disable_auto_compactions(true); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let db = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); let mut cf_size = 0; @@ -1070,13 +1066,10 @@ pub mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_disable_auto_compactions(true); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let db = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); let mut cf_size = 0; diff --git a/components/raftstore/src/coprocessor/split_check/table.rs b/components/raftstore/src/coprocessor/split_check/table.rs index a8a1ded4144..eec7b15b9b3 100644 --- a/components/raftstore/src/coprocessor/split_check/table.rs +++ b/components/raftstore/src/coprocessor/split_check/table.rs @@ -2,7 +2,7 @@ use std::cmp::Ordering; -use engine_traits::{IterOptions, Iterator, KvEngine, SeekKey, CF_WRITE}; +use engine_traits::{IterOptions, Iterator, KvEngine, CF_WRITE}; use error_code::ErrorCodeExt; use kvproto::{metapb::Region, pdpb::CheckPolicy}; use tidb_query_datatype::codec::table as table_codec; @@ -26,8 
+26,9 @@ where E: KvEngine, { /// Feed keys in order to find the split key. - /// If `current_data_key` does not belong to `status.first_encoded_table_prefix`. - /// it returns the encoded table prefix of `current_data_key`. + /// If `current_data_key` does not belong to + /// `status.first_encoded_table_prefix`. it returns the encoded table + /// prefix of `current_data_key`. fn on_kv(&mut self, _: &mut ObserverContext<'_>, entry: &KeyEntry) -> bool { if self.split_key.is_some() { return true; @@ -183,10 +184,10 @@ fn last_key_of_region(db: &impl KvEngine, region: &Region) -> Result = iter.seek(SeekKey::End).map_err(|e| box_err!(e)); + let found: Result = iter.seek_to_last().map_err(|e| box_err!(e)); if found? { let key = iter.key().to_vec(); last_key = Some(key); @@ -237,8 +238,8 @@ mod tests { use super::*; use crate::{ - coprocessor::{Config, CoprocessorHost}, - store::{CasualMessage, SplitCheckRunner, SplitCheckTask}, + coprocessor::{dispatcher::SchedTask, Config, CoprocessorHost}, + store::{SplitCheckRunner, SplitCheckTask}, }; /// Composes table record and index prefix: `t[table_id]`. @@ -256,7 +257,7 @@ mod tests { .prefix("test_last_key_of_region") .tempdir() .unwrap(); - let engine = new_engine(path.path().to_str().unwrap(), None, ALL_CFS, None).unwrap(); + let engine = new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); @@ -309,7 +310,7 @@ mod tests { .prefix("test_table_check_observer") .tempdir() .unwrap(); - let engine = new_engine(path.path().to_str().unwrap(), None, ALL_CFS, None).unwrap(); + let engine = new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); @@ -325,7 +326,7 @@ mod tests { split_region_on_table: true, // Try to "disable" size split. 
region_max_size: Some(ReadableSize::gb(2)), - region_split_size: ReadableSize::gb(1), + region_split_size: Some(ReadableSize::gb(1)), // Try to "disable" keys split region_max_keys: Some(2000000000), region_split_keys: Some(1000000000), @@ -352,9 +353,9 @@ mod tests { let key = Key::from_raw(&gen_table_prefix(id)); loop { match rx.try_recv() { - Ok((_, CasualMessage::RegionApproximateSize { .. })) - | Ok((_, CasualMessage::RegionApproximateKeys { .. })) => (), - Ok((_, CasualMessage::SplitRegion { split_keys, .. })) => { + Ok(SchedTask::UpdateApproximateSize { .. }) + | Ok(SchedTask::UpdateApproximateKeys { .. }) => (), + Ok(SchedTask::AskSplit { split_keys, .. }) => { assert_eq!(split_keys, vec![key.into_encoded()]); break; } @@ -364,8 +365,8 @@ mod tests { } else { loop { match rx.try_recv() { - Ok((_, CasualMessage::RegionApproximateSize { .. })) - | Ok((_, CasualMessage::RegionApproximateKeys { .. })) => (), + Ok(SchedTask::UpdateApproximateSize { .. }) + | Ok(SchedTask::UpdateApproximateKeys { .. }) => (), Err(mpsc::TryRecvError::Empty) => { break; } diff --git a/components/raftstore/src/coprocessor/split_observer.rs b/components/raftstore/src/coprocessor/split_observer.rs index e763c83a37c..7f844f4b069 100644 --- a/components/raftstore/src/coprocessor/split_observer.rs +++ b/components/raftstore/src/coprocessor/split_observer.rs @@ -240,14 +240,13 @@ mod tests { let observer = SplitObserver; - let resp = observer.pre_propose_admin(&mut ctx, &mut req); // since no split is defined, actual coprocessor won't be invoke. - assert!(resp.is_ok()); + observer.pre_propose_admin(&mut ctx, &mut req).unwrap(); assert!(!req.has_split(), "only split req should be handle."); req = new_split_request(new_row_key(1, 2, 0)); // For compatible reason, split should supported too. - assert!(observer.pre_propose_admin(&mut ctx, &mut req).is_ok()); + observer.pre_propose_admin(&mut ctx, &mut req).unwrap(); // Empty key should be skipped. 
let mut split_keys = vec![vec![]]; @@ -257,7 +256,7 @@ mod tests { req = new_batch_split_request(split_keys.clone()); // Although invalid keys should be skipped, but if all keys are // invalid, errors should be reported. - assert!(observer.pre_propose_admin(&mut ctx, &mut req).is_err()); + observer.pre_propose_admin(&mut ctx, &mut req).unwrap_err(); let mut key = new_row_key(1, 2, 0); let mut expected_key = key[..key.len() - 8].to_vec(); diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index 89648de7731..5deef832723 100644 --- a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -4,7 +4,7 @@ use std::{error::Error as StdError, io, net, result}; use crossbeam::channel::TrySendError; use error_code::{self, ErrorCode, ErrorCodeExt}; -use kvproto::{errorpb, metapb}; +use kvproto::{errorpb, metapb, raft_serverpb}; use protobuf::ProtobufError; use thiserror::Error; use tikv_util::{codec, deadline::DeadlineError}; @@ -58,6 +58,12 @@ pub enum Error { #[error("region {0} is in the recovery progress")] RecoveryInProgress(u64), + #[error("region {0} is in the flashback progress with start_ts {1}")] + FlashbackInProgress(u64, u64), + + #[error("region {0} not prepared the flashback")] + FlashbackNotPrepared(u64), + #[error( "key {} is not in region key range [{}, {}) for region {}", log_wrappers::Value::key(.0), @@ -128,6 +134,15 @@ pub enum Error { #[error("Prepare merge is pending due to unapplied proposals")] PendingPrepareMerge, + + #[error("Region not exist but not tombstone, region: {}, local_state: {:?}", .region_id, .local_state)] + RegionNotRegistered { + region_id: u64, + local_state: raft_serverpb::RegionLocalState, + }, + + #[error("peer is a witness of region {0}")] + IsWitness(u64), } pub type Result = result::Result; @@ -241,6 +256,22 @@ impl From for errorpb::Error { e.set_region_id(region_id); errorpb.set_recovery_in_progress(e); } + Error::FlashbackInProgress(region_id, flashback_start_ts) => { + 
let mut e = errorpb::FlashbackInProgress::default(); + e.set_region_id(region_id); + e.set_flashback_start_ts(flashback_start_ts); + errorpb.set_flashback_in_progress(e); + } + Error::FlashbackNotPrepared(region_id) => { + let mut e = errorpb::FlashbackNotPrepared::default(); + e.set_region_id(region_id); + errorpb.set_flashback_not_prepared(e); + } + Error::IsWitness(region_id) => { + let mut e = errorpb::IsWitness::default(); + e.set_region_id(region_id); + errorpb.set_is_witness(e); + } _ => {} }; @@ -275,6 +306,8 @@ impl ErrorCodeExt for Error { Error::NotLeader(..) => error_code::raftstore::NOT_LEADER, Error::DiskFull(..) => error_code::raftstore::DISK_FULL, Error::RecoveryInProgress(..) => error_code::raftstore::RECOVERY_IN_PROGRESS, + Error::FlashbackInProgress(..) => error_code::raftstore::FLASHBACK_IN_PROGRESS, + Error::FlashbackNotPrepared(..) => error_code::raftstore::FLASHBACK_NOT_PREPARED, Error::StaleCommand => error_code::raftstore::STALE_COMMAND, Error::RegionNotInitialized(_) => error_code::raftstore::REGION_NOT_INITIALIZED, Error::KeyNotInRegion(..) => error_code::raftstore::KEY_NOT_IN_REGION, @@ -295,8 +328,9 @@ impl ErrorCodeExt for Error { Error::DataIsNotReady { .. } => error_code::raftstore::DATA_IS_NOT_READY, Error::DeadlineExceeded => error_code::raftstore::DEADLINE_EXCEEDED, Error::PendingPrepareMerge => error_code::raftstore::PENDING_PREPARE_MERGE, + Error::IsWitness(..) => error_code::raftstore::IS_WITNESS, - Error::Other(_) => error_code::raftstore::UNKNOWN, + Error::Other(_) | Error::RegionNotRegistered { .. 
} => error_code::raftstore::UNKNOWN, } } } diff --git a/components/raftstore/src/lib.rs b/components/raftstore/src/lib.rs index cd50b74dc48..1db5f79d226 100644 --- a/components/raftstore/src/lib.rs +++ b/components/raftstore/src/lib.rs @@ -6,25 +6,31 @@ #![feature(min_specialization)] #![feature(box_patterns)] #![feature(hash_drain_filter)] -#![feature(vec_retain_mut)] +#![feature(let_chains)] +#![feature(assert_matches)] +#![feature(type_alias_impl_trait)] #![recursion_limit = "256"] #[cfg(test)] extern crate test; #[macro_use] extern crate derivative; +#[cfg(feature = "engine_rocks")] +pub mod compacted_event_sender; pub mod coprocessor; pub mod errors; pub mod router; pub mod store; +#[cfg(feature = "engine_rocks")] +pub use self::compacted_event_sender::RaftRouterCompactedEventSender; pub use self::{ coprocessor::{RegionInfo, RegionInfoAccessor, SeekRegionCallback}, errors::{DiscardReason, Error, Result}, }; // `bytes::Bytes` is generated for `bytes` in protobuf. -fn bytes_capacity(b: &bytes::Bytes) -> usize { +pub fn bytes_capacity(b: &bytes::Bytes) -> usize { // NOTE: For deserialized raft messages, `len` equals capacity. // This is used to report memory usage to metrics. b.len() diff --git a/components/raftstore/src/router.rs b/components/raftstore/src/router.rs index 72d2bf8ca2b..3a76a5ad26f 100644 --- a/components/raftstore/src/router.rs +++ b/components/raftstore/src/router.rs @@ -1,24 +1,25 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
-// #[PerformanceCriticalPath] -use std::cell::RefCell; +use std::borrow::Cow; +// #[PerformanceCriticalPath] use crossbeam::channel::TrySendError; use engine_traits::{KvEngine, RaftEngine, Snapshot}; -use kvproto::{raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; +use error_code::ErrorCodeExt; +use kvproto::{metapb, raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; use raft::SnapshotStatus; +use slog_global::warn; use tikv_util::time::ThreadReadId; use crate::{ store::{ - fsm::RaftRouter, - transport::{CasualRouter, ProposalRouter, SignificantRouter, StoreRouter}, + fsm::{ChangeObserver, RaftRouter}, + transport::{CasualRouter, ProposalRouter, SignificantRouter}, Callback, CasualMessage, LocalReader, PeerMsg, RaftCmdExtraOpts, RaftCommand, - SignificantMsg, StoreMsg, + SignificantMsg, StoreMsg, StoreRouter, }, DiscardReason, Error as RaftStoreError, Result as RaftStoreResult, }; - /// Routes messages to the raftstore. pub trait RaftStoreRouter: StoreRouter @@ -116,13 +117,13 @@ where EK: KvEngine, { fn read( - &self, + &mut self, read_id: Option, req: RaftCmdRequest, cb: Callback, ) -> RaftStoreResult<()>; - fn release_snapshot_cache(&self); + fn release_snapshot_cache(&mut self); } #[derive(Clone)] @@ -168,12 +169,20 @@ where } /// A router that routes messages to the raftstore -pub struct ServerRaftStoreRouter { +pub struct ServerRaftStoreRouter +where + EK: KvEngine, + ER: RaftEngine, +{ router: RaftRouter, - local_reader: RefCell, EK>>, + local_reader: LocalReader>, } -impl Clone for ServerRaftStoreRouter { +impl Clone for ServerRaftStoreRouter +where + EK: KvEngine, + ER: RaftEngine, +{ fn clone(&self) -> Self { ServerRaftStoreRouter { router: self.router.clone(), @@ -186,9 +195,8 @@ impl ServerRaftStoreRouter { /// Creates a new router. 
pub fn new( router: RaftRouter, - reader: LocalReader, EK>, + local_reader: LocalReader>, ) -> ServerRaftStoreRouter { - let local_reader = RefCell::new(reader); ServerRaftStoreRouter { router, local_reader, @@ -239,19 +247,17 @@ impl RaftStoreRouter for ServerRaftStoreRouter impl LocalReadRouter for ServerRaftStoreRouter { fn read( - &self, + &mut self, read_id: Option, req: RaftCmdRequest, cb: Callback, ) -> RaftStoreResult<()> { - let mut local_reader = self.local_reader.borrow_mut(); - local_reader.read(read_id, req, cb); + self.local_reader.read(read_id, req, cb); Ok(()) } - fn release_snapshot_cache(&self) { - let mut local_reader = self.local_reader.borrow_mut(); - local_reader.release_snapshot_cache(); + fn release_snapshot_cache(&mut self) { + self.local_reader.release_snapshot_cache(); } } @@ -274,3 +280,172 @@ impl RaftStoreRouter for RaftRouter { batch_system::Router::broadcast_normal(self, msg_gen) } } + +// Because `CasualRouter` needs an generic while `RaftRotuer` doesn't. We have +// to bridge two by manually implementations. Using functions to reduce +// duplicated codes. 
+ +impl crate::coprocessor::StoreHandle for RaftRouter { + fn update_approximate_size(&self, region_id: u64, size: u64) { + if let Err(e) = CasualRouter::send( + self, + region_id, + CasualMessage::RegionApproximateSize { size }, + ) { + warn!( + "failed to send approximate region size"; + "region_id" => region_id, + "err" => %e, + "error_code" => %e.error_code(), + ); + } + } + + fn update_approximate_keys(&self, region_id: u64, keys: u64) { + if let Err(e) = CasualRouter::send( + self, + region_id, + CasualMessage::RegionApproximateKeys { keys }, + ) { + warn!( + "failed to send approximate region keys"; + "region_id" => region_id, + "err" => %e, + "error_code" => %e.error_code(), + ); + } + } + + fn ask_split( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + split_keys: Vec>, + source: Cow<'static, str>, + ) { + if let Err(e) = CasualRouter::send( + self, + region_id, + CasualMessage::SplitRegion { + region_epoch, + split_keys, + callback: Callback::None, + source, + }, + ) { + warn!( + "failed to send ask split"; + "region_id" => region_id, + "err" => %e, + ); + } + } + + fn update_compute_hash_result( + &self, + region_id: u64, + index: u64, + context: Vec, + hash: Vec, + ) { + if let Err(e) = CasualRouter::send( + self, + region_id, + CasualMessage::ComputeHashResult { + index, + context, + hash, + }, + ) { + warn!( + "failed to send hash compute result"; + "region_id" => region_id, + "err" => %e, + ); + } + } + + fn refresh_region_buckets( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + ) { + let _ = CasualRouter::send( + self, + region_id, + CasualMessage::RefreshRegionBuckets { + region_epoch, + buckets, + bucket_ranges, + cb: Callback::None, + }, + ); + } +} + +/// A handle for cdc and pitr to schedule some command back to raftstore. 
+pub trait CdcHandle: Clone + Send +where + EK: KvEngine, +{ + fn capture_change( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + change_observer: ChangeObserver, + callback: Callback, + ) -> RaftStoreResult<()>; + + fn check_leadership( + &self, + region_id: u64, + callback: Callback, + ) -> RaftStoreResult<()>; +} + +/// A wrapper of SignificantRouter that is specialized for implementing +/// CdcHandle. +#[derive(Clone)] +pub struct CdcRaftRouter(pub T); + +impl std::ops::Deref for CdcRaftRouter { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl CdcHandle for CdcRaftRouter +where + EK: KvEngine, + T: SignificantRouter + Send + Clone, +{ + fn capture_change( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + change_observer: ChangeObserver, + callback: Callback, + ) -> RaftStoreResult<()> { + self.0.significant_send( + region_id, + SignificantMsg::CaptureChange { + cmd: change_observer, + region_epoch, + callback, + }, + ) + } + + fn check_leadership( + &self, + region_id: u64, + callback: Callback, + ) -> RaftStoreResult<()> { + self.0 + .significant_send(region_id, SignificantMsg::LeaderCallback(callback)) + } +} diff --git a/components/raftstore/src/store/async_io/mod.rs b/components/raftstore/src/store/async_io/mod.rs index c9b2fad532f..56cc2d576e1 100644 --- a/components/raftstore/src/store/async_io/mod.rs +++ b/components/raftstore/src/store/async_io/mod.rs @@ -1,4 +1,5 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. +pub mod read; pub mod write; pub mod write_router; diff --git a/components/raftstore/src/store/async_io/read.rs b/components/raftstore/src/store/async_io/read.rs new file mode 100644 index 00000000000..006fe0eb24c --- /dev/null +++ b/components/raftstore/src/store/async_io/read.rs @@ -0,0 +1,263 @@ +// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + fmt, + marker::PhantomData, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use engine_traits::{Checkpointer, KvEngine, RaftEngine}; +use fail::fail_point; +use file_system::{IoType, WithIoType}; +use kvproto::raft_serverpb::{PeerState, RaftSnapshotData, RegionLocalState}; +use protobuf::Message; +use raft::{eraftpb::Snapshot, GetEntriesContext}; +use tikv_util::{error, info, time::Instant, worker::Runnable}; + +use crate::store::{ + metrics::{SNAPSHOT_KV_COUNT_HISTOGRAM, SNAPSHOT_SIZE_HISTOGRAM}, + snap::TABLET_SNAPSHOT_VERSION, + util, + worker::metrics::{SNAP_COUNTER, SNAP_HISTOGRAM}, + RaftlogFetchResult, TabletSnapKey, TabletSnapManager, MAX_INIT_ENTRY_COUNT, +}; + +pub enum ReadTask { + FetchLogs { + region_id: u64, + context: GetEntriesContext, + low: u64, + high: u64, + max_size: usize, + tried_cnt: usize, + term: u64, + }, + + // GenTabletSnapshot is used to generate tablet snapshot. + GenTabletSnapshot { + region_id: u64, + to_peer: u64, + tablet: EK, + region_state: RegionLocalState, + last_applied_term: u64, + last_applied_index: u64, + canceled: Arc, + for_balance: bool, + }, +} + +impl fmt::Display for ReadTask { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ReadTask::FetchLogs { + region_id, + context, + low, + high, + max_size, + tried_cnt, + term, + } => write!( + f, + "Fetch Raft Logs [region: {}, low: {}, high: {}, max_size: {}] for sending with context {:?}, tried: {}, term: {}", + region_id, low, high, max_size, context, tried_cnt, term, + ), + ReadTask::GenTabletSnapshot { + region_id, to_peer, .. + } => { + write!(f, "Snapshot gen for {}, to peer {}", region_id, to_peer) + } + } + } +} + +#[derive(Debug)] +pub struct FetchedLogs { + pub context: GetEntriesContext, + pub logs: Box, +} + +pub type GenSnapRes = Option>; + +/// A router for receiving fetched result. 
+pub trait AsyncReadNotifier: Send { + fn notify_logs_fetched(&self, region_id: u64, fetched: FetchedLogs); + fn notify_snapshot_generated(&self, region_id: u64, res: GenSnapRes); +} + +pub struct ReadRunner +where + EK: KvEngine, + ER: RaftEngine, + N: AsyncReadNotifier, +{ + notifier: N, + raft_engine: ER, + sanp_mgr: Option, + _phantom: PhantomData, +} + +impl ReadRunner { + pub fn new(notifier: N, raft_engine: ER) -> ReadRunner { + ReadRunner { + notifier, + raft_engine, + sanp_mgr: None, + _phantom: PhantomData, + } + } + + #[inline] + pub fn set_snap_mgr(&mut self, mgr: TabletSnapManager) { + self.sanp_mgr = Some(mgr); + } + + #[inline] + fn snap_mgr(&self) -> &TabletSnapManager { + self.sanp_mgr.as_ref().unwrap() + } + + fn generate_snap(&self, snap_key: &TabletSnapKey, tablet: EK) -> crate::Result<()> { + let checkpointer_path = self.snap_mgr().tablet_gen_path(snap_key); + if checkpointer_path.exists() { + // TODO: make `delete_snapshot` return error so we can use it here. + // Remove the old checkpoint directly. + encryption::trash_dir_all( + &checkpointer_path, + self.snap_mgr().key_manager().as_deref(), + )?; + } + // Here not checkpoint to a temporary directory first, the temporary directory + // logic already implemented in rocksdb. 
+ let mut checkpointer = tablet.new_checkpointer()?; + checkpointer.create_at(checkpointer_path.as_path(), None, 0)?; + Ok(()) + } +} + +impl Runnable for ReadRunner +where + EK: KvEngine, + ER: RaftEngine, + N: AsyncReadNotifier, +{ + type Task = ReadTask; + fn run(&mut self, task: ReadTask) { + match task { + ReadTask::FetchLogs { + region_id, + low, + high, + max_size, + context, + tried_cnt, + term, + } => { + let _guard = WithIoType::new(IoType::Replication); + let mut ents = + Vec::with_capacity(std::cmp::min((high - low) as usize, MAX_INIT_ENTRY_COUNT)); + let res = self.raft_engine.fetch_entries_to( + region_id, + low, + high, + Some(max_size), + &mut ents, + ); + + let hit_size_limit = res + .as_ref() + .map(|c| (*c as u64) != high - low) + .unwrap_or(false); + fail_point!("worker_async_fetch_raft_log"); + self.notifier.notify_logs_fetched( + region_id, + FetchedLogs { + context, + logs: Box::new(RaftlogFetchResult { + ents: res.map(|_| ents).map_err(|e| e.into()), + low, + max_size: max_size as u64, + hit_size_limit, + tried_cnt, + term, + }), + }, + ); + } + + ReadTask::GenTabletSnapshot { + region_id, + to_peer, + tablet, + region_state, + last_applied_term, + last_applied_index, + canceled, + for_balance, + } => { + SNAP_COUNTER.generate.start.inc(); + if canceled.load(Ordering::Relaxed) { + info!("generate snap is canceled"; "region_id" => region_id); + SNAP_COUNTER.generate.abort.inc(); + return; + } + let start = Instant::now(); + let _io_type_guard = WithIoType::new(if for_balance { + IoType::LoadBalance + } else { + IoType::Replication + }); + // the state should already checked in apply workers. + assert_ne!(region_state.get_state(), PeerState::Tombstone); + let mut snapshot = Snapshot::default(); + // Set snapshot metadata. 
+ snapshot.mut_metadata().set_term(last_applied_term); + snapshot.mut_metadata().set_index(last_applied_index); + let conf_state = util::conf_state_from_region(region_state.get_region()); + snapshot.mut_metadata().set_conf_state(conf_state); + + // Set snapshot data. + let mut snap_data = RaftSnapshotData::default(); + snap_data.set_region(region_state.get_region().clone()); + snap_data.set_version(TABLET_SNAPSHOT_VERSION); + snap_data.mut_meta().set_for_balance(for_balance); + snap_data.set_removed_records(region_state.get_removed_records().into()); + snap_data.set_merged_records(region_state.get_merged_records().into()); + snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); + // create checkpointer. + let snap_key = TabletSnapKey::from_region_snap(region_id, to_peer, &snapshot); + let mut res = None; + let total_size = tablet.get_engine_used_size().unwrap_or(0); + let total_keys = tablet.get_num_keys().unwrap_or(0); + if let Err(e) = self.generate_snap(&snap_key, tablet) { + error!("failed to create checkpointer"; "region_id" => region_id, "error" => %e); + SNAP_COUNTER.generate.fail.inc(); + } else { + let generate_duration_secs = start.saturating_elapsed().as_secs(); + let elapsed = start.saturating_elapsed_secs(); + info!( + "snapshot generated"; + "region_id" => region_id, + "elapsed" => elapsed, + "key" => ?snap_key, + "for_balance" => for_balance, + "total_size" => total_size, + "total_keys" => total_keys, + ); + self.snap_mgr() + .begin_snapshot(snap_key, start, generate_duration_secs); + SNAP_COUNTER.generate.success.inc(); + SNAP_HISTOGRAM.generate.observe(elapsed); + SNAPSHOT_SIZE_HISTOGRAM.observe(total_size as f64); + SNAPSHOT_KV_COUNT_HISTOGRAM.observe(total_keys as f64); + res = Some(Box::new((snapshot, to_peer))) + } + + self.notifier.notify_snapshot_generated(region_id, res); + } + } + } +} diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index c9490738da4..0da8d1546b5 
100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -8,37 +8,46 @@ //! raft db and then invoking callback or sending msgs if any. use std::{ - fmt, + fmt, mem, sync::Arc, thread::{self, JoinHandle}, }; use collections::HashMap; -use crossbeam::channel::{bounded, Receiver, Sender, TryRecvError}; +use crossbeam::channel::TryRecvError; use engine_traits::{ - Engines, KvEngine, PerfContext, PerfContextKind, RaftEngine, RaftLogBatch, WriteBatch, - WriteOptions, + KvEngine, PerfContext, PerfContextKind, RaftEngine, RaftLogBatch, WriteBatch, WriteOptions, }; use error_code::ErrorCodeExt; use fail::fail_point; +use file_system::{set_io_type, IoType}; use kvproto::raft_serverpb::{RaftLocalState, RaftMessage}; +use parking_lot::Mutex; use protobuf::Message; use raft::eraftpb::Entry; +use resource_control::{ + channel::{bounded, Receiver}, + ResourceConsumeType, ResourceController, ResourceMetered, +}; use tikv_util::{ box_err, - config::{Tracker, VersionTrack}, - debug, info, slow_log, thd_name, + config::{ReadableSize, Tracker, VersionTrack}, + debug, info, slow_log, + sys::thread::StdThreadBuildWrapper, + thd_name, time::{duration_to_sec, Instant}, warn, }; +use super::write_router::{SharedSenders, WriteSenders}; use crate::{ store::{ config::Config, fsm::RaftRouter, - local_metrics::{RaftSendMessageMetrics, StoreWriteMetrics}, + local_metrics::{RaftSendMessageMetrics, StoreWriteMetrics, TimeTracker}, metrics::*, transport::Transport, + util, util::LatencyInspector, PeerMsg, }, @@ -49,18 +58,19 @@ const KV_WB_SHRINK_SIZE: usize = 1024 * 1024; const KV_WB_DEFAULT_SIZE: usize = 16 * 1024; const RAFT_WB_SHRINK_SIZE: usize = 10 * 1024 * 1024; const RAFT_WB_DEFAULT_SIZE: usize = 256 * 1024; +const RAFT_WB_SPLIT_SIZE: usize = ReadableSize::gb(1).0 as usize; /// Notify the event to the specified region. 
-pub trait Notifier: Clone + Send + 'static { - fn notify_persisted(&self, region_id: u64, peer_id: u64, ready_number: u64); +pub trait PersistedNotifier: Clone + Send + 'static { + fn notify(&self, region_id: u64, peer_id: u64, ready_number: u64); } -impl Notifier for RaftRouter +impl PersistedNotifier for RaftRouter where EK: KvEngine, ER: RaftEngine, { - fn notify_persisted(&self, region_id: u64, peer_id: u64, ready_number: u64) { + fn notify(&self, region_id: u64, peer_id: u64, ready_number: u64) { if let Err(e) = self.force_send( region_id, PeerMsg::Persisted { @@ -79,7 +89,98 @@ where } } -/// WriteTask contains write tasks which need to be persisted to kv db and raft db. +/// Extra writes besides raft engine. +/// +/// For now, applying snapshot needs to persist some extra states. For v1, +/// these states are written to KvEngine. For v2, they are written to +/// RaftEngine. Although in v2 these states are also written to raft engine, +/// but we have to use `ExtraState` as they should be written as the last +/// updates. +// TODO: perhaps we should always pass states instead of a write batch even +// for v1. 
+pub enum ExtraWrite { + None, + V1(W), + V2(L), +} + +impl ExtraWrite { + #[inline] + pub fn is_empty(&self) -> bool { + match self { + ExtraWrite::None => true, + ExtraWrite::V1(w) => w.is_empty(), + ExtraWrite::V2(l) => l.is_empty(), + } + } + + #[inline] + fn data_size(&self) -> usize { + match self { + ExtraWrite::None => 0, + ExtraWrite::V1(w) => w.data_size(), + ExtraWrite::V2(l) => l.persist_size(), + } + } + + #[inline] + pub fn ensure_v1(&mut self, write_batch: impl FnOnce() -> W) -> &mut W { + if let ExtraWrite::None = self { + *self = ExtraWrite::V1(write_batch()); + } else if let ExtraWrite::V2(_) = self { + unreachable!("v1 and v2 are mixed used"); + } + match self { + ExtraWrite::V1(w) => w, + _ => unreachable!(), + } + } + + #[inline] + pub fn v1_mut(&mut self) -> Option<&mut W> { + if let ExtraWrite::V1(w) = self { + Some(w) + } else { + None + } + } + + #[inline] + pub fn ensure_v2(&mut self, log_batch: impl FnOnce() -> L) -> &mut L { + if let ExtraWrite::None = self { + *self = ExtraWrite::V2(log_batch()); + } else if let ExtraWrite::V1(_) = self { + unreachable!("v1 and v2 are mixed used"); + } + match self { + ExtraWrite::V2(l) => l, + _ => unreachable!(), + } + } + + #[inline] + pub fn merge_v2(&mut self, log_batch: L) { + if let ExtraWrite::None = self { + *self = ExtraWrite::V2(log_batch); + } else if let ExtraWrite::V1(_) = self { + unreachable!("v1 and v2 are mixed used"); + } else if let ExtraWrite::V2(l) = self { + l.merge(log_batch).unwrap(); + } + } + + #[inline] + pub fn v2_mut(&mut self) -> Option<&mut L> { + if let ExtraWrite::V2(l) = self { + Some(l) + } else { + None + } + } +} + +/// WriteTask contains write tasks which need to be persisted to kv db and raft +/// db. 
pub struct WriteTask where EK: KvEngine, @@ -89,13 +190,16 @@ where peer_id: u64, ready_number: u64, pub send_time: Instant, - pub kv_wb: Option, pub raft_wb: Option, - pub entries: Vec, - pub cut_logs: Option<(u64, u64)>, + // called after writing to kvdb and raftdb. + pub persisted_cbs: Vec>, + overwrite_to: Option, + entries: Vec, pub raft_state: Option, + pub extra_write: ExtraWrite, pub messages: Vec, - pub request_times: Vec, + pub trackers: Vec, + pub has_snapshot: bool, } impl WriteTask @@ -109,24 +213,41 @@ where peer_id, ready_number, send_time: Instant::now(), - kv_wb: None, raft_wb: None, + overwrite_to: None, entries: vec![], - cut_logs: None, raft_state: None, + extra_write: ExtraWrite::None, messages: vec![], - request_times: vec![], + trackers: vec![], + persisted_cbs: Vec::new(), + has_snapshot: false, } } pub fn has_data(&self) -> bool { !(self.raft_state.is_none() && self.entries.is_empty() - && self.cut_logs.is_none() - && self.kv_wb.as_ref().map_or(true, |wb| wb.is_empty()) + && self.extra_write.is_empty() && self.raft_wb.as_ref().map_or(true, |wb| wb.is_empty())) } + /// Append continous entries. + /// + /// All existing entries with same index will be overwritten. If + /// `overwrite_to` is set to a larger value, then entries in + /// `[entries.last().get_index(), overwrite_to)` will be deleted. If + /// entries is empty, nothing will be deleted. + pub fn set_append(&mut self, overwrite_to: Option, entries: Vec) { + self.entries = entries; + self.overwrite_to = overwrite_to; + } + + #[inline] + pub fn ready_number(&self) -> u64 { + self.ready_number + } + /// Sanity check for robustness. pub fn valid(&self) -> Result<()> { if self.region_id == 0 || self.peer_id == 0 || self.ready_number == 0 { @@ -137,18 +258,6 @@ where self.ready_number )); } - if let Some(last_index) = self.entries.last().map(|e| e.get_index()) { - if let Some((from, _)) = self.cut_logs { - if from != last_index + 1 { - // Entries are put and deleted in the same writebatch. 
- return Err(box_err!( - "invalid cut logs, last_index {}, cut_logs {:?}", - last_index, - self.cut_logs - )); - } - } - } Ok(()) } @@ -166,6 +275,38 @@ where inspector: Vec, }, Shutdown, + #[cfg(test)] + Pause(std::sync::mpsc::Receiver<()>), +} + +impl ResourceMetered for WriteMsg +where + EK: KvEngine, + ER: RaftEngine, +{ + fn consume_resource(&self, resource_ctl: &Arc) -> Option { + match self { + WriteMsg::WriteTask(t) => { + let mut dominant_group = "".to_owned(); + let mut max_write_bytes = 0; + for entry in &t.entries { + let header = util::get_entry_header(entry); + let group_name = header.get_resource_group_name().to_owned(); + let write_bytes = entry.compute_size() as u64; + resource_ctl.consume( + group_name.as_bytes(), + ResourceConsumeType::IoBytes(write_bytes), + ); + if write_bytes > max_write_bytes { + dominant_group = group_name; + max_write_bytes = write_bytes; + } + } + Some(dominant_group) + } + _ => None, + } + } } impl fmt::Debug for WriteMsg @@ -182,6 +323,45 @@ where ), WriteMsg::Shutdown => write!(fmt, "WriteMsg::Shutdown"), WriteMsg::LatencyInspect { .. } => write!(fmt, "WriteMsg::LatencyInspect"), + #[cfg(test)] + WriteMsg::Pause(_) => write!(fmt, "WriteMsg::Pause"), + } + } +} + +pub enum ExtraBatchWrite { + None, + V1(W), + V2(L), +} + +impl ExtraBatchWrite { + #[inline] + fn clear(&mut self) { + match self { + ExtraBatchWrite::None => {} + ExtraBatchWrite::V1(w) => w.clear(), + // No clear in in `RaftLogBatch`. + ExtraBatchWrite::V2(_) => *self = ExtraBatchWrite::None, + } + } + + /// Merge the extra_write with this batch. + /// + /// If there is any new states inserted, return the size of the state. 
+ fn merge(&mut self, extra_write: &mut ExtraWrite) { + match mem::replace(extra_write, ExtraWrite::None) { + ExtraWrite::None => (), + ExtraWrite::V1(wb) => match self { + ExtraBatchWrite::None => *self = ExtraBatchWrite::V1(wb), + ExtraBatchWrite::V1(kv_wb) => kv_wb.merge(wb).unwrap(), + ExtraBatchWrite::V2(_) => unreachable!("v2 and v1 are mixed used"), + }, + ExtraWrite::V2(lb) => match self { + ExtraBatchWrite::None => *self = ExtraBatchWrite::V2(lb), + ExtraBatchWrite::V1(_) => unreachable!("v2 and v1 are mixed used"), + ExtraBatchWrite::V2(raft_wb) => raft_wb.merge(lb).unwrap(), + }, } } } @@ -192,14 +372,20 @@ where EK: KvEngine, ER: RaftEngine, { - pub kv_wb: EK::WriteBatch, - pub raft_wb: ER::LogBatch, - // Write raft state once for a region everytime writing to disk + // When a single batch becomes too large, we uses multiple batches each containing atomic + // writes. + pub raft_wbs: Vec, + // Write states once for a region everytime writing to disk. + // These states only corresponds to entries inside `raft_wbs.last()`. States for other write + // batches must be inlined early. 
pub raft_states: HashMap, + pub extra_batch_write: ExtraBatchWrite, pub state_size: usize, pub tasks: Vec>, + pub persisted_cbs: Vec>, // region_id -> (peer_id, ready_number) pub readies: HashMap, + pub(crate) raft_wb_split_size: usize, } impl WriteTaskBatch @@ -207,44 +393,64 @@ where EK: KvEngine, ER: RaftEngine, { - fn new(kv_wb: EK::WriteBatch, raft_wb: ER::LogBatch) -> Self { + fn new(raft_wb: ER::LogBatch) -> Self { Self { - kv_wb, - raft_wb, + raft_wbs: vec![raft_wb], raft_states: HashMap::default(), + extra_batch_write: ExtraBatchWrite::None, state_size: 0, tasks: vec![], + persisted_cbs: vec![], readies: HashMap::default(), + raft_wb_split_size: RAFT_WB_SPLIT_SIZE, + } + } + + #[inline] + fn flush_states_to_raft_wb(&mut self) { + let wb = self.raft_wbs.last_mut().unwrap(); + for (region_id, state) in self.raft_states.drain() { + wb.put_raft_state(region_id, &state).unwrap(); + } + self.state_size = 0; + if let ExtraBatchWrite::V2(_) = self.extra_batch_write { + let ExtraBatchWrite::V2(lb) = mem::replace(&mut self.extra_batch_write, ExtraBatchWrite::None) else { unreachable!() }; + wb.merge(lb).unwrap(); } } /// Add write task to this batch - fn add_write_task(&mut self, mut task: WriteTask) { + fn add_write_task(&mut self, raft_engine: &ER, mut task: WriteTask) { if let Err(e) = task.valid() { panic!("task is not valid: {:?}", e); } - if let Some(kv_wb) = task.kv_wb.take() { - self.kv_wb.merge(kv_wb).unwrap(); - } - if let Some(raft_wb) = task.raft_wb.take() { - self.raft_wb.merge(raft_wb).unwrap(); - } - let entries = std::mem::take(&mut task.entries); - self.raft_wb.append(task.region_id, entries).unwrap(); - if let Some((from, to)) = task.cut_logs { - self.raft_wb.cut_logs(task.region_id, from, to); + if self.raft_wb_split_size > 0 + && self.raft_wbs.last().unwrap().persist_size() >= self.raft_wb_split_size + { + self.flush_states_to_raft_wb(); + self.raft_wbs + .push(raft_engine.log_batch(RAFT_WB_DEFAULT_SIZE)); } - if let Some(raft_state) = 
task.raft_state.take() { - if self - .raft_states - .insert(task.region_id, raft_state) - .is_none() - { - self.state_size += std::mem::size_of::(); - } + let raft_wb = self.raft_wbs.last_mut().unwrap(); + if let Some(wb) = task.raft_wb.take() { + raft_wb.merge(wb).unwrap(); + } + raft_wb + .append( + task.region_id, + task.overwrite_to, + std::mem::take(&mut task.entries), + ) + .unwrap(); + + if let Some(raft_state) = task.raft_state.take() + && self.raft_states.insert(task.region_id, raft_state).is_none() + { + self.state_size += std::mem::size_of::(); } + self.extra_batch_write.merge(&mut task.extra_write); if let Some(prev_readies) = self .readies @@ -266,14 +472,17 @@ where ); } } - + for v in task.persisted_cbs.drain(..) { + self.persisted_cbs.push(v); + } self.tasks.push(task); } fn clear(&mut self) { - // raft_wb doesn't have clear interface and it should be consumed by raft db before - self.kv_wb.clear(); + // raft_wb doesn't have clear interface and it should be consumed by raft db + // before self.raft_states.clear(); + self.extra_batch_write.clear(); self.state_size = 0; self.tasks.clear(); self.readies.clear(); @@ -286,22 +495,24 @@ where #[inline] fn get_raft_size(&self) -> usize { - self.state_size + self.raft_wb.persist_size() + self.state_size + + self + .raft_wbs + .iter() + .map(|wb| wb.persist_size()) + .sum::() } fn before_write_to_db(&mut self, metrics: &StoreWriteMetrics) { - // Put raft state to raft writebatch - for (region_id, state) in self.raft_states.drain() { - self.raft_wb.put_raft_state(region_id, &state).unwrap(); - } - self.state_size = 0; + self.flush_states_to_raft_wb(); if metrics.waterfall_metrics { - let now = Instant::now(); - for task in &self.tasks { - for t in &task.request_times { - metrics - .wf_before_write - .observe(duration_to_sec(now.saturating_duration_since(*t))); + let now = std::time::Instant::now(); + for task in &mut self.tasks { + for tracker in &mut task.trackers { + tracker.observe(now, 
&metrics.wf_before_write, |t| { + &mut t.metrics.wf_before_write_nanos + }); + tracker.reset(now); } } } @@ -309,25 +520,31 @@ where fn after_write_to_kv_db(&mut self, metrics: &StoreWriteMetrics) { if metrics.waterfall_metrics { - let now = Instant::now(); + let now = std::time::Instant::now(); for task in &self.tasks { - for t in &task.request_times { - metrics - .wf_kvdb_end - .observe(duration_to_sec(now.saturating_duration_since(*t))); + for tracker in &task.trackers { + tracker.observe(now, &metrics.wf_kvdb_end, |t| { + &mut t.metrics.wf_kvdb_end_nanos + }); } } } } + fn after_write_all(&mut self) { + for hook in mem::take(&mut self.persisted_cbs) { + hook(); + } + } + fn after_write_to_raft_db(&mut self, metrics: &StoreWriteMetrics) { if metrics.waterfall_metrics { - let now = Instant::now(); + let now = std::time::Instant::now(); for task in &self.tasks { - for t in &task.request_times { - metrics - .wf_write_end - .observe(duration_to_sec(now.saturating_duration_since(*t))) + for tracker in &task.trackers { + tracker.observe(now, &metrics.wf_write_end, |t| { + &mut t.metrics.wf_write_end_nanos + }); } } } @@ -338,11 +555,12 @@ pub struct Worker where EK: KvEngine, ER: RaftEngine, - N: Notifier, + N: PersistedNotifier, { store_id: u64, tag: String, - engines: Engines, + raft_engine: ER, + kv_engine: Option, receiver: Receiver>, notifier: N, trans: T, @@ -351,7 +569,7 @@ where raft_write_size_limit: usize, metrics: StoreWriteMetrics, message_metrics: RaftSendMessageMetrics, - perf_context: EK::PerfContext, + perf_context: ER::PerfContext, pending_latency_inspect: Vec<(Instant, Vec)>, } @@ -359,30 +577,28 @@ impl Worker where EK: KvEngine, ER: RaftEngine, - N: Notifier, + N: PersistedNotifier, T: Transport, { pub fn new( store_id: u64, tag: String, - engines: Engines, + raft_engine: ER, + kv_engine: Option, receiver: Receiver>, notifier: N, trans: T, cfg: &Arc>, ) -> Self { - let batch = WriteTaskBatch::new( - 
engines.kv.write_batch_with_cap(KV_WB_DEFAULT_SIZE), - engines.raft.log_batch(RAFT_WB_DEFAULT_SIZE), - ); - let perf_context = engines - .kv - .get_perf_context(cfg.value().perf_level, PerfContextKind::RaftstoreStore); + let batch = WriteTaskBatch::new(raft_engine.log_batch(RAFT_WB_DEFAULT_SIZE)); + let perf_context = + ER::get_perf_context(cfg.value().perf_level, PerfContextKind::RaftstoreStore); let cfg_tracker = cfg.clone().tracker(tag.clone()); Self { store_id, tag, - engines, + raft_engine, + kv_engine, receiver, notifier, trans, @@ -390,7 +606,7 @@ where cfg_tracker, raft_write_size_limit: cfg.value().raft_write_size_limit.0 as usize, metrics: StoreWriteMetrics::new(cfg.value().waterfall_metrics), - message_metrics: Default::default(), + message_metrics: RaftSendMessageMetrics::default(), perf_context, pending_latency_inspect: vec![], } @@ -451,7 +667,7 @@ where "region_id" => task.region_id, "peer_id" => task.peer_id, "ready_number" => task.ready_number, - "kv_wb_size" => task.kv_wb.as_ref().map_or(0, |wb| wb.data_size()), + "extra_write_size" => task.extra_write.data_size(), "raft_wb_size" => task.raft_wb.as_ref().map_or(0, |wb| wb.persist_size()), "entry_count" => task.entries.len(), ); @@ -467,12 +683,16 @@ where } => { self.pending_latency_inspect.push((send_time, inspector)); } + #[cfg(test)] + WriteMsg::Pause(rx) => { + let _ = rx.recv(); + } } false } pub fn handle_write_task(&mut self, task: WriteTask) { - self.batch.add_write_task(task); + self.batch.add_write_task(&self.raft_engine, task); } pub fn write_to_db(&mut self, notify: bool) { @@ -487,53 +707,67 @@ where fail_point!("raft_before_save"); let mut write_kv_time = 0f64; - if !self.batch.kv_wb.is_empty() { - let raft_before_save_kv_on_store_3 = || { - fail_point!("raft_before_save_kv_on_store_3", self.store_id == 3, |_| {}); - }; - raft_before_save_kv_on_store_3(); - let now = Instant::now(); - let mut write_opts = WriteOptions::new(); - write_opts.set_sync(true); - // TODO: Add perf context - 
self.batch.kv_wb.write_opt(&write_opts).unwrap_or_else(|e| { - panic!( - "store {}: {} failed to write to kv engine: {:?}", - self.store_id, self.tag, e - ); - }); - if self.batch.kv_wb.data_size() > KV_WB_SHRINK_SIZE { - self.batch.kv_wb = self.engines.kv.write_batch_with_cap(KV_WB_DEFAULT_SIZE); + if let ExtraBatchWrite::V1(kv_wb) = &mut self.batch.extra_batch_write { + if !kv_wb.is_empty() { + let store_id = self.store_id; + let raft_before_save_kv_on_store_3 = || { + fail_point!("raft_before_save_kv_on_store_3", store_id == 3, |_| {}); + }; + raft_before_save_kv_on_store_3(); + let now = Instant::now(); + let mut write_opts = WriteOptions::new(); + write_opts.set_sync(true); + // TODO: Add perf context + let tag = &self.tag; + kv_wb.write_opt(&write_opts).unwrap_or_else(|e| { + panic!( + "store {}: {} failed to write to kv engine: {:?}", + store_id, tag, e + ); + }); + if kv_wb.data_size() > KV_WB_SHRINK_SIZE { + *kv_wb = self + .kv_engine + .as_ref() + .unwrap() + .write_batch_with_cap(KV_WB_DEFAULT_SIZE); + } + write_kv_time = duration_to_sec(now.saturating_elapsed()); + STORE_WRITE_KVDB_DURATION_HISTOGRAM.observe(write_kv_time); } - write_kv_time = duration_to_sec(now.saturating_elapsed()); - STORE_WRITE_KVDB_DURATION_HISTOGRAM.observe(write_kv_time); + self.batch.after_write_to_kv_db(&self.metrics); } - - self.batch.after_write_to_kv_db(&self.metrics); - fail_point!("raft_between_save"); let mut write_raft_time = 0f64; - if !self.batch.raft_wb.is_empty() { + if !self.batch.raft_wbs[0].is_empty() { fail_point!("raft_before_save_on_store_1", self.store_id == 1, |_| {}); let now = Instant::now(); self.perf_context.start_observe(); - self.engines - .raft - .consume_and_shrink( - &mut self.batch.raft_wb, - true, - RAFT_WB_SHRINK_SIZE, - RAFT_WB_DEFAULT_SIZE, - ) - .unwrap_or_else(|e| { - panic!( - "store {}: {} failed to write to raft engine: {:?}", - self.store_id, self.tag, e - ); - }); - self.perf_context.report_metrics(); + for i in 
0..self.batch.raft_wbs.len() { + self.raft_engine + .consume_and_shrink( + &mut self.batch.raft_wbs[i], + true, + RAFT_WB_SHRINK_SIZE, + RAFT_WB_DEFAULT_SIZE, + ) + .unwrap_or_else(|e| { + panic!( + "store {}: {} failed to write to raft engine: {:?}", + self.store_id, self.tag, e + ); + }); + } + self.batch.raft_wbs.truncate(1); + let trackers: Vec<_> = self + .batch + .tasks + .iter() + .flat_map(|task| task.trackers.iter().flat_map(|t| t.as_tracker_token())) + .collect(); + self.perf_context.report_metrics(&trackers); write_raft_time = duration_to_sec(now.saturating_elapsed()); STORE_WRITE_RAFTDB_DURATION_HISTOGRAM.observe(write_raft_time); } @@ -542,6 +776,13 @@ where self.batch.after_write_to_raft_db(&self.metrics); + fail_point!( + "async_write_before_cb", + !self.batch.persisted_cbs.is_empty(), + |_| () + ); + self.batch.after_write_all(); + fail_point!("raft_before_follower_send"); let mut now = Instant::now(); @@ -574,11 +815,12 @@ where "error_code" => %e.error_code(), ); self.message_metrics.add(msg_type, false); - // If this msg is snapshot, it is unnecessary to send snapshot - // status to this peer because it has already become follower. - // (otherwise the snapshot msg should be sent in store thread other than here) - // Also, the follower don't need flow control, so don't send - // unreachable msg here. + // If this msg is snapshot, it is unnecessary to send + // snapshot status to this peer because it has already + // become follower. (otherwise the snapshot msg should be + // sent in store thread other than here) Also, the follower + // don't need flow control, so don't send unreachable msg + // here. 
} else { self.message_metrics.add(msg_type, true); } @@ -595,8 +837,7 @@ where let mut callback_time = 0f64; if notify { for (region_id, (peer_id, ready_number)) in &self.batch.readies { - self.notifier - .notify_persisted(*region_id, *peer_id, *ready_number); + self.notifier.notify(*region_id, *peer_id, *ready_number); } now = Instant::now(); callback_time = duration_to_sec(now.saturating_duration_since(now2)); @@ -645,100 +886,190 @@ where } } -pub struct StoreWriters +#[derive(Clone)] +pub struct StoreWritersContext where EK: KvEngine, ER: RaftEngine, + T: Transport + 'static, + N: PersistedNotifier, { - writers: Vec>>, - handlers: Vec>, + pub store_id: u64, + pub raft_engine: ER, + pub kv_engine: Option, + pub transfer: T, + pub notifier: N, + pub cfg: Arc>, } -impl StoreWriters +#[derive(Clone)] +pub struct StoreWriters where EK: KvEngine, ER: RaftEngine, { - pub fn new() -> Self { + resource_ctl: Option>, + /// Mailboxes for sending raft messages to async ios. + writers: Arc>>, + /// Background threads for handling asynchronous messages. 
+ handlers: Arc>>>, +} + +impl StoreWriters { + pub fn new(resource_ctl: Option>) -> Self { Self { - writers: vec![], - handlers: vec![], + resource_ctl, + writers: Arc::new(VersionTrack::default()), + handlers: Arc::new(Mutex::new(vec![])), } } +} - pub fn senders(&self) -> &Vec>> { - &self.writers +impl StoreWriters +where + EK: KvEngine, + ER: RaftEngine, +{ + pub fn senders(&self) -> WriteSenders { + WriteSenders::new(self.writers.clone()) } - pub fn spawn( + pub fn spawn( &mut self, store_id: u64, - engines: &Engines, + raft_engine: ER, + kv_engine: Option, notifier: &N, trans: &T, cfg: &Arc>, ) -> Result<()> { let pool_size = cfg.value().store_io_pool_size; - for i in 0..pool_size { - let tag = format!("store-writer-{}", i); - let (tx, rx) = bounded(cfg.value().store_io_notify_capacity); - let mut worker = Worker::new( - store_id, - tag.clone(), - engines.clone(), - rx, - notifier.clone(), - trans.clone(), - cfg, - ); - info!("starting store writer {}", i); - let t = thread::Builder::new().name(thd_name!(tag)).spawn(move || { - worker.run(); - })?; - self.writers.push(tx); - self.handlers.push(t); + if pool_size > 0 { + self.increase_to( + pool_size, + StoreWritersContext { + store_id, + notifier: notifier.clone(), + raft_engine, + kv_engine, + transfer: trans.clone(), + cfg: cfg.clone(), + }, + )?; } Ok(()) } pub fn shutdown(&mut self) { - assert_eq!(self.writers.len(), self.handlers.len()); - for (i, handler) in self.handlers.drain(..).enumerate() { + let mut handlers = self.handlers.lock(); + let writers = self.writers.value().get(); + assert_eq!(writers.len(), handlers.len()); + for (i, handler) in handlers.drain(..).enumerate() { info!("stopping store writer {}", i); - self.writers[i].send(WriteMsg::Shutdown).unwrap(); + writers[i].send(WriteMsg::Shutdown, None).unwrap(); handler.join().unwrap(); } } + + #[inline] + /// Returns the valid size of store writers. 
+ pub fn size(&self) -> usize { + self.writers.value().get().len() + } + + pub fn decrease_to(&mut self, size: usize) -> Result<()> { + // Only update logical version of writers but not destroying the workers, so + // that peers that are still using the writer_id (because there're + // unpersisted tasks) can proceed to finish their tasks. After the peer + // gets rescheduled, it will use a new writer_id within the new + // capacity, specified by refreshed `store-io-pool-size`. + // + // TODO: find an elegant way to effectively free workers. + assert_eq!(self.writers.value().get().len(), self.handlers.lock().len()); + self.writers + .update(move |writers: &mut SharedSenders| -> Result<()> { + assert!(writers.get().len() > size); + Ok(()) + })?; + Ok(()) + } + + pub fn increase_to( + &mut self, + size: usize, + writer_meta: StoreWritersContext, + ) -> Result<()> { + let mut handlers = self.handlers.lock(); + let current_size = self.writers.value().get().len(); + assert_eq!(current_size, handlers.len()); + let resource_ctl = self.resource_ctl.clone(); + self.writers + .update(move |writers: &mut SharedSenders| -> Result<()> { + let mut cached_senders = writers.get(); + for i in current_size..size { + let tag = format!("store-writer-{}", i); + let (tx, rx) = bounded( + resource_ctl.clone(), + writer_meta.cfg.value().store_io_notify_capacity, + ); + let mut worker = Worker::new( + writer_meta.store_id, + tag.clone(), + writer_meta.raft_engine.clone(), + writer_meta.kv_engine.clone(), + rx, + writer_meta.notifier.clone(), + writer_meta.transfer.clone(), + &writer_meta.cfg, + ); + info!("starting store writer {}", i); + let t = + thread::Builder::new() + .name(thd_name!(tag)) + .spawn_wrapper(move || { + set_io_type(IoType::ForegroundWrite); + worker.run(); + })?; + cached_senders.push(tx); + handlers.push(t); + } + writers.set(cached_senders); + Ok(()) + })?; + Ok(()) + } } /// Used for test to write task to kv db and raft db. 
-#[cfg(test)] -pub fn write_to_db_for_test(engines: &Engines, task: WriteTask) -where +pub fn write_to_db_for_test( + engines: &engine_traits::Engines, + task: WriteTask, +) where EK: KvEngine, ER: RaftEngine, { - let mut batch = WriteTaskBatch::new( - engines.kv.write_batch(), - engines.raft.log_batch(RAFT_WB_DEFAULT_SIZE), - ); - batch.add_write_task(task); - batch.before_write_to_db(&StoreWriteMetrics::new(false)); - if !batch.kv_wb.is_empty() { - let mut write_opts = WriteOptions::new(); - write_opts.set_sync(true); - batch.kv_wb.write_opt(&write_opts).unwrap_or_else(|e| { - panic!("test failed to write to kv engine: {:?}", e); - }); - } - if !batch.raft_wb.is_empty() { - engines - .raft - .consume(&mut batch.raft_wb, true) - .unwrap_or_else(|e| { + let mut batch = WriteTaskBatch::new(engines.raft.log_batch(RAFT_WB_DEFAULT_SIZE)); + batch.add_write_task(&engines.raft, task); + let metrics = StoreWriteMetrics::new(false); + batch.before_write_to_db(&metrics); + if let ExtraBatchWrite::V1(kv_wb) = &mut batch.extra_batch_write { + if !kv_wb.is_empty() { + let mut write_opts = WriteOptions::new(); + write_opts.set_sync(true); + kv_wb.write_opt(&write_opts).unwrap_or_else(|e| { + panic!("test failed to write to kv engine: {:?}", e); + }); + } + } + if !batch.raft_wbs[0].is_empty() { + for wb in &mut batch.raft_wbs { + engines.raft.consume(wb, true).unwrap_or_else(|e| { panic!("test failed to write to raft engine: {:?}", e); }); + } } + batch.after_write_to_raft_db(&metrics); + batch.after_write_all(); } #[cfg(test)] diff --git a/components/raftstore/src/store/async_io/write_router.rs b/components/raftstore/src/store/async_io/write_router.rs index 384273a97ad..3669fddd613 100644 --- a/components/raftstore/src/store/async_io/write_router.rs +++ b/components/raftstore/src/store/async_io/write_router.rs @@ -5,6 +5,7 @@ use std::{ mem, + ops::Index, sync::{ atomic::{AtomicUsize, Ordering}, Arc, @@ -12,24 +13,28 @@ use std::{ time::Duration, }; -use 
crossbeam::channel::{Sender, TrySendError}; +use crossbeam::channel::TrySendError; use engine_traits::{KvEngine, RaftEngine}; -use tikv_util::{info, time::Instant}; +use resource_control::channel::Sender; +use tikv_util::{ + config::{Tracker, VersionTrack}, + error, info, safe_panic, + time::Instant, +}; use crate::store::{ async_io::write::WriteMsg, config::Config, fsm::store::PollContext, local_metrics::RaftMetrics, metrics::*, }; -const RETRY_SCHEDULE_MILLISECONS: u64 = 10; +const RETRY_SCHEDULE_MILLISECONDS: u64 = 10; pub trait WriteRouterContext where EK: KvEngine, ER: RaftEngine, { - fn write_senders(&self) -> &Vec>>; - fn io_reschedule_concurrent_count(&self) -> &Arc; + fn write_senders(&self) -> &WriteSenders; fn config(&self) -> &Config; fn raft_metrics(&self) -> &RaftMetrics; } @@ -39,14 +44,10 @@ where EK: KvEngine, ER: RaftEngine, { - fn write_senders(&self) -> &Vec>> { + fn write_senders(&self) -> &WriteSenders { &self.write_senders } - fn io_reschedule_concurrent_count(&self) -> &Arc { - &self.io_reschedule_concurrent_count - } - fn config(&self) -> &Config { &self.cfg } @@ -72,6 +73,9 @@ where last_unpersisted: Option, /// Pending write msgs since rescheduling. pending_write_msgs: Vec>, + /// The scheduling priority of the last msg, only valid when priority + /// scheduling is enabled + last_msg_priority: Option, } impl WriteRouter @@ -87,16 +91,22 @@ where next_writer_id: None, last_unpersisted: None, pending_write_msgs: vec![], + last_msg_priority: None, } } - /// Send write msg to write worker or push into inner buffer and wait for rescheduling. + /// Send write msg to write worker or push into inner buffer and wait for + /// rescheduling. 
pub fn send_write_msg>( &mut self, ctx: &mut C, last_unpersisted: Option, msg: WriteMsg, ) { + if last_unpersisted.is_none() { + // reset when there is no pending write + self.last_msg_priority = None; + } if self.should_send(ctx, last_unpersisted) { self.send(ctx, msg); } else { @@ -105,9 +115,9 @@ where } } - /// If there is some msgs need to be rescheduled, check the new persisted number and - /// sending these msgs to a new write worker if persisted number is greater than - /// `self.last_unpersisted`. + /// If there is some msgs need to be rescheduled, check the new persisted + /// number and sending these msgs to a new write worker if persisted + /// number is greater than `self.last_unpersisted`. pub fn check_new_persisted>( &mut self, ctx: &mut C, @@ -117,8 +127,10 @@ where return; } // The peer must be destroyed after all previous write tasks have been finished. - // So do not worry about a destroyed peer being counted in `io_reschedule_concurrent_count`. - ctx.io_reschedule_concurrent_count() + // So do not worry about a destroyed peer being counted in + // `io_reschedule_concurrent_count`. + ctx.write_senders() + .io_reschedule_concurrent_count .fetch_sub(1, Ordering::SeqCst); STORE_IO_RESCHEDULE_PEER_TOTAL_GAUGE.dec(); @@ -144,10 +156,12 @@ where } } - /// Check if write task can be sent to write worker or pushed into `self.pending_write_msgs`. + /// Check if write task can be sent to write worker or pushed into + /// `self.pending_write_msgs`. /// - /// Returns false if the task should be pushed into `self.pending_write_msgs`. - /// true means the task should be sent to the write worker. + /// Returns false if the task should be pushed into + /// `self.pending_write_msgs`. true means the task should be sent to the + /// write worker. 
fn should_send>( &mut self, ctx: &mut C, @@ -157,13 +171,14 @@ where if self.last_unpersisted.is_some() { return false; } - if ctx.config().store_io_pool_size <= 1 { - self.writer_id = 0; - return true; - } + // Local senders may not be updated when `store_io_pool_size()` has been + // increased by the `ctx.config().update()`, keep the real size until it's + // updated by `poller.begin()`. + let async_io_pool_size = + std::cmp::min(ctx.write_senders().size(), ctx.config().store_io_pool_size); if last_unpersisted.is_none() { // If no previous pending ready, we can randomly select a new writer worker. - self.writer_id = rand::random::() % ctx.config().store_io_pool_size; + self.writer_id = rand::random::() % async_io_pool_size; self.next_retry_time = Instant::now_coarse() + ctx.config().io_reschedule_hotpot_duration.0; self.next_writer_id = None; @@ -180,8 +195,9 @@ where } if self.next_writer_id.is_none() { // The hot write peers should not be rescheduled entirely. - // So it will not be rescheduled if the random id is the same as the original one. - let new_id = rand::random::() % ctx.config().store_io_pool_size; + // So it will not be rescheduled if the random id is the same as the original + // one. + let new_id = rand::random::() % async_io_pool_size; if new_id == self.writer_id { // Reset the time self.next_retry_time = now + ctx.config().io_reschedule_hotpot_duration.0; @@ -191,10 +207,12 @@ where } // This peer should be rescheduled. // Try to add 1 to `io_reschedule_concurrent_count`. - // The `cfg.io_reschedule_concurrent_max_count` is used for controlling the concurrent count - // of rescheduling peer fsm because rescheduling will introduce performance penalty. + // The `cfg.io_reschedule_concurrent_max_count` is used for controlling the + // concurrent count of rescheduling peer fsm because rescheduling will + // introduce performance penalty. 
let success = ctx - .io_reschedule_concurrent_count() + .write_senders() + .io_reschedule_concurrent_count .fetch_update(Ordering::SeqCst, Ordering::Relaxed, |c| { if c < ctx.config().io_reschedule_concurrent_max_count { Some(c + 1) @@ -205,26 +223,32 @@ where .is_ok(); if success { STORE_IO_RESCHEDULE_PEER_TOTAL_GAUGE.inc(); - // Rescheduling succeeds. The task should be pushed into `self.pending_write_msgs`. + // Rescheduling succeeds. The task should be pushed into + // `self.pending_write_msgs`. self.last_unpersisted = last_unpersisted; info!("starts io reschedule"; "tag" => &self.tag); false } else { // Rescheduling fails at this time. Retry 10ms later. // The task should be sent to the original write worker. - self.next_retry_time = now + Duration::from_millis(RETRY_SCHEDULE_MILLISECONS); + self.next_retry_time = now + Duration::from_millis(RETRY_SCHEDULE_MILLISECONDS); true } } - fn send>(&self, ctx: &mut C, msg: WriteMsg) { - match ctx.write_senders()[self.writer_id].try_send(msg) { - Ok(()) => (), + fn send>(&mut self, ctx: &mut C, msg: WriteMsg) { + let sender = &ctx.write_senders()[self.writer_id]; + sender.consume_msg_resource(&msg); + // pass the priority of last msg as low bound to make sure all messages of one + // peer are handled sequentially. + match sender.try_send(msg, self.last_msg_priority) { + // TODO: handle last msg priority properly + Ok(priority) => self.last_msg_priority = priority, Err(TrySendError::Full(msg)) => { let now = Instant::now(); - if ctx.write_senders()[self.writer_id].send(msg).is_err() { + if sender.send(msg, self.last_msg_priority).is_err() { // Write threads are destroyed after store threads during shutdown. 
- panic!("{} failed to send write msg, err: disconnected", self.tag); + safe_panic!("{} failed to send write msg, err: disconnected", self.tag); } ctx.raft_metrics() .write_block_wait @@ -232,44 +256,149 @@ where } Err(TrySendError::Disconnected(_)) => { // Write threads are destroyed after store threads during shutdown. - panic!("{} failed to send write msg, err: disconnected", self.tag); + safe_panic!("{} failed to send write msg, err: disconnected", self.tag); } } } } +/// Safefly shared senders among the controller and raftstore threads. +/// Senders in it can only be accessed by cloning method `senders()`. +/// +/// `Clone` is safe to race with concurrent `Sender.send()` because the +/// `RefCell` field `last_msg_group` in `Sender` is skipped. +#[derive(Clone)] +pub struct SharedSenders(Vec>>); + +impl Default for SharedSenders { + fn default() -> Self { + Self(vec![]) + } +} + +impl SharedSenders { + #[inline] + pub fn get(&self) -> Vec>> { + self.0.clone() + } + + #[inline] + pub fn set(&mut self, senders: Vec>>) { + self.0 = senders; + } +} + +/// All `Sender`s in `SharedSenders` are shared by the global controller +/// thread and raftstore threads. There won't exist concurrent `Sender.send()` +/// calling scenarios among threads on a same `Sender`. +/// On the one hand, th controller thread will not call `Sender.send()` to +/// consume resources to send messages, just updating the size of `Sender`s if +/// `store-io-pool-size` is resized. On the other hand, each raftstore thread +/// just use its local cloned `Sender`s for sending messages and update it at +/// `begin()`, the first stage for processing messages. +/// Therefore, it's safe to manually remain `Send` trait for +/// `SharedSenders`. +/// +/// TODO: use an elegant implementation, such as `Mutex`, to avoid this +/// hack for sharing `Sender`s among multi-threads. +unsafe impl Sync for SharedSenders {} + +/// Senders for asynchronous writes. 
There can be multiple senders, generally +/// you should use `WriteRouter` to decide which sender to be used. +#[derive(Clone)] +pub struct WriteSenders { + senders: Tracker>, + cached_senders: Vec>>, + io_reschedule_concurrent_count: Arc, +} + +impl WriteSenders { + pub fn new(senders: Arc>>) -> Self { + let cached_senders = senders.value().get(); + WriteSenders { + senders: senders.tracker("async writers' tracker".to_owned()), + cached_senders, + io_reschedule_concurrent_count: Arc::default(), + } + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.cached_senders.is_empty() + } + + #[inline] + pub fn size(&self) -> usize { + self.cached_senders.len() + } + + #[inline] + pub fn refresh(&mut self) { + if let Some(senders) = self.senders.any_new() { + self.cached_senders = senders.get(); + } + } +} + +impl Index for WriteSenders { + type Output = Sender>; + + #[inline] + fn index(&self, index: usize) -> &Sender> { + &self.cached_senders[index] + } +} + #[cfg(test)] -mod tests { +pub(crate) mod tests { use std::thread; - use crossbeam::channel::{bounded, Receiver}; - use engine_test::kv::KvTestEngine; + use engine_test::{kv::KvTestEngine, raft::RaftTestEngine}; + use resource_control::channel::{bounded, Receiver}; use tikv_util::config::ReadableDuration; use super::*; + pub struct TestContext { + pub senders: WriteSenders, + pub config: Config, + pub raft_metrics: RaftMetrics, + } + + impl WriteRouterContext for TestContext { + fn write_senders(&self) -> &WriteSenders { + &self.senders + } + + fn config(&self) -> &Config { + &self.config + } + + fn raft_metrics(&self) -> &RaftMetrics { + &self.raft_metrics + } + } + struct TestWriteRouter { - receivers: Vec>>, - senders: Vec>>, - io_reschedule_concurrent_count: Arc, - config: Config, - raft_metrics: RaftMetrics, + receivers: Vec>>, + ctx: TestContext, } impl TestWriteRouter { fn new(config: Config) -> Self { let (mut receivers, mut senders) = (vec![], vec![]); for _ in 0..config.store_io_pool_size { - let 
(tx, rx) = bounded(config.store_io_notify_capacity); + let (tx, rx) = bounded(None, config.store_io_notify_capacity); receivers.push(rx); senders.push(tx); } Self { receivers, - senders, - io_reschedule_concurrent_count: Arc::new(AtomicUsize::new(0)), - config, - raft_metrics: RaftMetrics::new(true), + ctx: TestContext { + senders: WriteSenders::new(Arc::new(VersionTrack::new(SharedSenders(senders)))), + config, + raft_metrics: RaftMetrics::new(true), + }, } } @@ -286,31 +415,17 @@ mod tests { } fn must_same_reschedule_count(&self, count: usize) { - let cnt = self.io_reschedule_concurrent_count.load(Ordering::Relaxed); + let cnt = self + .ctx + .senders + .io_reschedule_concurrent_count + .load(Ordering::Relaxed); if cnt != count { panic!("reschedule count not same, {} != {}", cnt, count); } } } - impl WriteRouterContext for TestWriteRouter { - fn write_senders(&self) -> &Vec>> { - &self.senders - } - - fn io_reschedule_concurrent_count(&self) -> &Arc { - &self.io_reschedule_concurrent_count - } - - fn config(&self) -> &Config { - &self.config - } - - fn raft_metrics(&self) -> &RaftMetrics { - &self.raft_metrics - } - } - #[test] fn test_write_router_no_schedule() { let mut config = Config::new(); @@ -319,10 +434,10 @@ mod tests { config.store_io_pool_size = 4; let mut t = TestWriteRouter::new(config); let mut r = WriteRouter::new("1".to_string()); - r.send_write_msg(&mut t, None, WriteMsg::Shutdown); + r.send_write_msg(&mut t.ctx, None, WriteMsg::Shutdown); let writer_id = r.writer_id; for _ in 1..10 { - r.send_write_msg(&mut t, Some(10), WriteMsg::Shutdown); + r.send_write_msg(&mut t.ctx, Some(10), WriteMsg::Shutdown); thread::sleep(Duration::from_millis(10)); } assert_eq!(writer_id, r.writer_id); @@ -342,7 +457,7 @@ mod tests { let last_time = r.next_retry_time; thread::sleep(Duration::from_millis(10)); // `writer_id` will be chosen randomly due to `last_unpersisted` is None - r.send_write_msg(&mut t, None, WriteMsg::Shutdown); + r.send_write_msg(&mut t.ctx, 
None, WriteMsg::Shutdown); assert!(r.next_retry_time > last_time); assert_eq!(r.next_writer_id, None); assert_eq!(r.last_unpersisted, None); @@ -357,7 +472,7 @@ mod tests { let writer_id = r.writer_id; let timer = Instant::now(); loop { - r.send_write_msg(&mut t, Some(10), WriteMsg::Shutdown); + r.send_write_msg(&mut t.ctx, Some(10), WriteMsg::Shutdown); if let Some(id) = r.next_writer_id { assert!(writer_id != id); assert_eq!(r.last_unpersisted, Some(10)); @@ -375,7 +490,7 @@ mod tests { thread::sleep(Duration::from_millis(10)); } - r.send_write_msg(&mut t, Some(20), WriteMsg::Shutdown); + r.send_write_msg(&mut t.ctx, Some(20), WriteMsg::Shutdown); assert!(r.next_writer_id.is_some()); // `last_unpersisted` should not change assert_eq!(r.last_unpersisted, Some(10)); @@ -384,7 +499,7 @@ mod tests { t.must_same_reschedule_count(1); // No effect due to 9 < `last_unpersisted`(10) - r.check_new_persisted(&mut t, 9); + r.check_new_persisted(&mut t.ctx, 9); assert!(r.next_writer_id.is_some()); assert_eq!(r.last_unpersisted, Some(10)); assert_eq!(r.pending_write_msgs.len(), 2); @@ -392,7 +507,7 @@ mod tests { t.must_same_reschedule_count(1); // Should reschedule and send msg - r.check_new_persisted(&mut t, 10); + r.check_new_persisted(&mut t.ctx, 10); assert_eq!(r.next_writer_id, None); assert_eq!(r.last_unpersisted, None); assert!(r.pending_write_msgs.is_empty()); @@ -400,13 +515,16 @@ mod tests { t.must_same_reschedule_count(0); thread::sleep(Duration::from_millis(10)); - t.io_reschedule_concurrent_count.store(4, Ordering::Relaxed); + t.ctx + .senders + .io_reschedule_concurrent_count + .store(4, Ordering::Relaxed); // Should retry reschedule next time because the limitation of concurrent count. // However it's possible that it will not scheduled due to random // so using loop here. 
let timer = Instant::now(); loop { - r.send_write_msg(&mut t, Some(30), WriteMsg::Shutdown); + r.send_write_msg(&mut t.ctx, Some(30), WriteMsg::Shutdown); t.must_same_msg_count(r.writer_id, 1); if r.next_writer_id.is_some() { assert_eq!(r.last_unpersisted, None); @@ -421,10 +539,13 @@ mod tests { thread::sleep(Duration::from_millis(10)); } - t.io_reschedule_concurrent_count.store(3, Ordering::Relaxed); - thread::sleep(Duration::from_millis(RETRY_SCHEDULE_MILLISECONS + 2)); + t.ctx + .senders + .io_reschedule_concurrent_count + .store(3, Ordering::Relaxed); + thread::sleep(Duration::from_millis(RETRY_SCHEDULE_MILLISECONDS + 2)); // Should reschedule now - r.send_write_msg(&mut t, Some(40), WriteMsg::Shutdown); + r.send_write_msg(&mut t.ctx, Some(40), WriteMsg::Shutdown); assert!(r.next_writer_id.is_some()); assert_eq!(r.last_unpersisted, Some(40)); t.must_same_msg_count(r.writer_id, 0); diff --git a/components/raftstore/src/store/async_io/write_tests.rs b/components/raftstore/src/store/async_io/write_tests.rs index 97d41824a62..24abf24c4fd 100644 --- a/components/raftstore/src/store/async_io/write_tests.rs +++ b/components/raftstore/src/store/async_io/write_tests.rs @@ -1,19 +1,29 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::time::Duration; +use std::{sync::mpsc, time::Duration}; use collections::HashSet; -use crossbeam::channel::unbounded; +use crossbeam::channel::{unbounded, Receiver, Sender}; use engine_test::{kv::KvTestEngine, new_temp_engine, raft::RaftTestEngine}; -use engine_traits::{Mutable, Peekable, RaftEngineReadOnly, WriteBatchExt}; -use kvproto::raft_serverpb::RaftMessage; +use engine_traits::{Engines, Mutable, Peekable, RaftEngineReadOnly, WriteBatchExt}; +use kvproto::{ + raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}, + raft_serverpb::{RaftApplyState, RaftMessage, RegionLocalState}, + resource_manager::{GroupMode, GroupRawResourceSettings, ResourceGroup}, +}; +use resource_control::ResourceGroupManager; use tempfile::Builder; use super::*; use crate::{ - store::{Config, Transport}, + store::{ + async_io::write_router::tests::TestContext, local_metrics::RaftMetrics, + peer_storage::tests::new_entry, Config, Transport, WriteRouter, + }, Result, }; +type TestKvWriteBatch = ::WriteBatch; +type TestRaftLogBatch = ::LogBatch; fn must_have_entries_and_state( raft_engine: &RaftTestEngine, @@ -42,13 +52,6 @@ fn must_have_entries_and_state( } } -fn new_entry(index: u64, term: u64) -> Entry { - let mut e = Entry::default(); - e.set_index(index); - e.set_term(term); - e -} - fn new_raft_state(term: u64, vote: u64, commit: u64, last_index: u64) -> RaftLocalState { let mut raft_state = RaftLocalState::new(); raft_state.mut_hard_state().set_term(term); @@ -63,8 +66,8 @@ struct TestNotifier { tx: Sender<(u64, (u64, u64))>, } -impl Notifier for TestNotifier { - fn notify_persisted(&self, region_id: u64, peer_id: u64, ready_number: u64) { +impl PersistedNotifier for TestNotifier { + fn notify(&self, region_id: u64, peer_id: u64, ready_number: u64) { self.tx.send((region_id, (peer_id, ready_number))).unwrap() } } @@ -126,7 +129,7 @@ fn must_wait_same_notifies( } let timer = Instant::now(); loop { - match notify_rx.recv() { + match notify_rx.recv_timeout(Duration::from_secs(3)) 
{ Ok((region_id, n)) => { if let Some(n2) = notify_map.get(®ion_id) { if n == *n2 { @@ -153,42 +156,32 @@ fn init_write_batch( engines: &Engines, task: &mut WriteTask, ) { - task.kv_wb = Some(engines.kv.write_batch()); + task.extra_write.ensure_v1(|| engines.kv.write_batch()); task.raft_wb = Some(engines.raft.log_batch(0)); } /// Help function for less code /// Option must not be none -fn put_kv(wb: &mut Option<::WriteBatch>, key: &[u8], value: &[u8]) { - wb.as_mut().unwrap().put(key, value).unwrap(); +fn put_kv(wb: Option<&mut TestKvWriteBatch>, key: &[u8], value: &[u8]) { + wb.unwrap().put(key, value).unwrap(); } /// Help function for less code /// Option must not be none -fn delete_kv(wb: &mut Option<::WriteBatch>, key: &[u8]) { - wb.as_mut().unwrap().delete(key).unwrap(); +fn delete_kv(wb: Option<&mut TestKvWriteBatch>, key: &[u8]) { + wb.unwrap().delete(key).unwrap(); } /// Simulate kv puts on raft engine. -fn put_raft_kv(wb: &mut Option<::LogBatch>, key: u64) { - wb.as_mut() - .unwrap() - .append(key, vec![new_entry(key, key)]) +fn put_raft_kv(wb: Option<&mut TestRaftLogBatch>, key: u64) { + wb.unwrap() + .append(key, None, vec![new_entry(key, key)]) .unwrap(); } -fn delete_raft_kv( - engine: &RaftTestEngine, - wb: &mut Option<::LogBatch>, - key: u64, -) { +fn delete_raft_kv(engine: &RaftTestEngine, wb: Option<&mut TestRaftLogBatch>, key: u64) { engine - .clean( - key, - key, - &new_raft_state(key, key, key, key), - wb.as_mut().unwrap(), - ) + .clean(key, key, &new_raft_state(key, key, key, key), wb.unwrap()) .unwrap(); } @@ -210,7 +203,7 @@ struct TestWorker { impl TestWorker { fn new(cfg: &Config, engines: &Engines) -> Self { - let (_, task_rx) = unbounded(); + let (_, task_rx) = resource_control::channel::unbounded(None); let (msg_tx, msg_rx) = unbounded(); let trans = TestTransport { tx: msg_tx }; let (notify_tx, notify_rx) = unbounded(); @@ -219,7 +212,8 @@ impl TestWorker { worker: Worker::new( 1, "writer".to_string(), - engines.clone(), + 
engines.raft.clone(), + Some(engines.kv.clone()), task_rx, notifier, trans, @@ -235,33 +229,51 @@ struct TestWriters { writers: StoreWriters, msg_rx: Receiver, notify_rx: Receiver<(u64, (u64, u64))>, + ctx: TestContext, } impl TestWriters { - fn new(cfg: &Config, engines: &Engines) -> Self { + fn new( + cfg: Config, + engines: &Engines, + resource_manager: Option>, + ) -> Self { let (msg_tx, msg_rx) = unbounded(); let trans = TestTransport { tx: msg_tx }; let (notify_tx, notify_rx) = unbounded(); let notifier = TestNotifier { tx: notify_tx }; - let mut writers = StoreWriters::new(); + let mut writers = StoreWriters::new( + resource_manager + .as_ref() + .map(|m| m.derive_controller("test".into(), false)), + ); writers .spawn( 1, - engines, + engines.raft.clone(), + Some(engines.kv.clone()), ¬ifier, &trans, &Arc::new(VersionTrack::new(cfg.clone())), ) .unwrap(); Self { - writers, msg_rx, notify_rx, + ctx: TestContext { + config: cfg, + raft_metrics: RaftMetrics::new(true), + senders: writers.senders(), + }, + writers, } } - fn write_sender(&self, id: usize) -> &Sender> { - &self.writers.senders()[id] + fn write_sender( + &self, + id: usize, + ) -> resource_control::channel::Sender> { + self.writers.senders()[id].clone() } } @@ -276,8 +288,8 @@ fn test_worker() { let mut task_1 = WriteTask::::new(region_1, 1, 10); init_write_batch(&engines, &mut task_1); - put_kv(&mut task_1.kv_wb, b"kv_k1", b"kv_v1"); - put_raft_kv(&mut task_1.raft_wb, 17); + put_kv(task_1.extra_write.v1_mut(), b"kv_k1", b"kv_v1"); + put_raft_kv(task_1.raft_wb.as_mut(), 17); task_1.entries.append(&mut vec![ new_entry(5, 5), new_entry(6, 5), @@ -287,12 +299,12 @@ fn test_worker() { task_1.raft_state = Some(new_raft_state(5, 123, 6, 8)); task_1.messages.append(&mut vec![RaftMessage::default()]); - t.worker.batch.add_write_task(task_1); + t.worker.batch.add_write_task(&engines.raft, task_1); let mut task_2 = WriteTask::::new(region_2, 2, 15); init_write_batch(&engines, &mut task_2); - put_kv(&mut 
task_2.kv_wb, b"kv_k2", b"kv_v2"); - put_raft_kv(&mut task_2.raft_wb, 27); + put_kv(task_2.extra_write.v1_mut(), b"kv_k2", b"kv_v2"); + put_raft_kv(task_2.raft_wb.as_mut(), 27); task_2 .entries .append(&mut vec![new_entry(20, 15), new_entry(21, 15)]); @@ -301,23 +313,20 @@ fn test_worker() { .messages .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); - t.worker.batch.add_write_task(task_2); + t.worker.batch.add_write_task(&engines.raft, task_2); let mut task_3 = WriteTask::::new(region_1, 1, 11); init_write_batch(&engines, &mut task_3); - put_kv(&mut task_3.kv_wb, b"kv_k3", b"kv_v3"); - put_raft_kv(&mut task_3.raft_wb, 37); - delete_raft_kv(&engines.raft, &mut task_3.raft_wb, 17); - task_3 - .entries - .append(&mut vec![new_entry(6, 6), new_entry(7, 7)]); - task_3.cut_logs = Some((8, 9)); + put_kv(task_3.extra_write.v1_mut(), b"kv_k3", b"kv_v3"); + put_raft_kv(task_3.raft_wb.as_mut(), 37); + delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); + task_3.set_append(Some(9), vec![new_entry(6, 6), new_entry(7, 7)]); task_3.raft_state = Some(new_raft_state(7, 124, 6, 7)); task_3 .messages .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); - t.worker.batch.add_write_task(task_3); + t.worker.batch.add_write_task(&engines.raft, task_3); t.worker.write_to_db(true); @@ -351,6 +360,121 @@ fn test_worker() { must_have_same_count_msg(5, &t.msg_rx); } +#[test] +fn test_worker_split_raft_wb() { + let path = Builder::new().prefix("async-io-worker").tempdir().unwrap(); + let engines = new_temp_engine(&path); + let mut t = TestWorker::new(&Config::default(), &engines); + + let mut run_test = |region_1: u64, region_2: u64, split: (bool, bool)| { + let raft_key_1 = 17 + region_1; + let raft_key_2 = 27 + region_1; + let raft_key_3 = 37 + region_1; + let mut expected_wbs = 1; + + let mut task_1 = WriteTask::::new(region_1, 1, 10); + task_1.raft_wb = Some(engines.raft.log_batch(0)); + let mut apply_state_1 = RaftApplyState::default(); + 
apply_state_1.set_applied_index(10); + let lb = task_1.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(region_1, 10, &apply_state_1).unwrap(); + put_raft_kv(task_1.raft_wb.as_mut(), raft_key_1); + task_1.entries.append(&mut vec![ + new_entry(5, 5), + new_entry(6, 5), + new_entry(7, 5), + new_entry(8, 5), + ]); + task_1.raft_state = Some(new_raft_state(5, 123, 6, 8)); + t.worker.batch.add_write_task(&engines.raft, task_1); + + let mut task_2 = WriteTask::::new(region_2, 2, 15); + task_2.raft_wb = Some(engines.raft.log_batch(0)); + let mut apply_state_2 = RaftApplyState::default(); + apply_state_2.set_applied_index(16); + let lb = task_2.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(region_2, 16, &apply_state_2).unwrap(); + put_raft_kv(task_2.raft_wb.as_mut(), raft_key_2); + task_2 + .entries + .append(&mut vec![new_entry(20, 15), new_entry(21, 15)]); + task_2.raft_state = Some(new_raft_state(15, 234, 20, 21)); + if split.0 { + expected_wbs += 1; + t.worker.batch.raft_wb_split_size = 1; + } else { + t.worker.batch.raft_wb_split_size = 0; + } + t.worker.batch.add_write_task(&engines.raft, task_2); + + let mut task_3 = WriteTask::::new(region_1, 1, 11); + task_3.raft_wb = Some(engines.raft.log_batch(0)); + let mut apply_state_3 = RaftApplyState::default(); + apply_state_3.set_applied_index(25); + let lb = task_3.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(region_1, 25, &apply_state_3).unwrap(); + put_raft_kv(task_3.raft_wb.as_mut(), raft_key_3); + delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), raft_key_1); + task_3.set_append(Some(9), vec![new_entry(6, 6), new_entry(7, 7)]); + task_3.raft_state = Some(new_raft_state(7, 124, 6, 7)); + if split.1 { + expected_wbs += 1; + t.worker.batch.raft_wb_split_size = 1; + } else { + t.worker.batch.raft_wb_split_size = 0; + } + t.worker.batch.add_write_task(&engines.raft, task_3); + + assert_eq!(t.worker.batch.raft_wbs.len(), expected_wbs); + 
t.worker.write_to_db(true); + assert_eq!(t.worker.batch.raft_wbs.len(), 1); + + must_have_same_notifies(vec![(region_1, (1, 11)), (region_2, (2, 15))], &t.notify_rx); + + assert_eq!(test_raft_kv(&engines.raft, raft_key_1), false); + assert_eq!(test_raft_kv(&engines.raft, raft_key_2), true); + assert_eq!(test_raft_kv(&engines.raft, raft_key_3), true); + + must_have_entries_and_state( + &engines.raft, + vec![ + ( + region_1, + vec![new_entry(5, 5), new_entry(6, 6), new_entry(7, 7)], + new_raft_state(7, 124, 6, 7), + ), + ( + region_2, + vec![new_entry(20, 15), new_entry(21, 15)], + new_raft_state(15, 234, 20, 21), + ), + ], + ); + assert_eq!( + engines.raft.get_apply_state(region_1, 25).unwrap(), + Some(RaftApplyState { + applied_index: 25, + ..Default::default() + }) + ); + assert_eq!( + engines.raft.get_apply_state(region_2, 16).unwrap(), + Some(RaftApplyState { + applied_index: 16, + ..Default::default() + }) + ); + }; + + let mut first_region = 1; + for a in [true, false] { + for b in [true, false] { + run_test(first_region, first_region + 1, (a, b)); + first_region += 10; + } + } +} + #[test] fn test_basic_flow() { let region_1 = 1; @@ -360,12 +484,12 @@ fn test_basic_flow() { let engines = new_temp_engine(&path); let mut cfg = Config::default(); cfg.store_io_pool_size = 2; - let mut t = TestWriters::new(&cfg, &engines); + let mut t = TestWriters::new(cfg, &engines, None); let mut task_1 = WriteTask::::new(region_1, 1, 10); init_write_batch(&engines, &mut task_1); - put_kv(&mut task_1.kv_wb, b"kv_k1", b"kv_v1"); - put_raft_kv(&mut task_1.raft_wb, 17); + put_kv(task_1.extra_write.v1_mut(), b"kv_k1", b"kv_v1"); + put_raft_kv(task_1.raft_wb.as_mut(), 17); task_1 .entries .append(&mut vec![new_entry(5, 5), new_entry(6, 5), new_entry(7, 5)]); @@ -374,12 +498,14 @@ fn test_basic_flow() { .messages .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); - t.write_sender(0).send(WriteMsg::WriteTask(task_1)).unwrap(); + t.write_sender(0) + 
.send(WriteMsg::WriteTask(task_1), None) + .unwrap(); let mut task_2 = WriteTask::::new(2, 2, 20); init_write_batch(&engines, &mut task_2); - put_kv(&mut task_2.kv_wb, b"kv_k2", b"kv_v2"); - put_raft_kv(&mut task_2.raft_wb, 27); + put_kv(task_2.extra_write.v1_mut(), b"kv_k2", b"kv_v2"); + put_raft_kv(task_2.raft_wb.as_mut(), 27); task_2 .entries .append(&mut vec![new_entry(50, 12), new_entry(51, 13)]); @@ -388,22 +514,25 @@ fn test_basic_flow() { .messages .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); - t.write_sender(1).send(WriteMsg::WriteTask(task_2)).unwrap(); + t.write_sender(1) + .send(WriteMsg::WriteTask(task_2), None) + .unwrap(); let mut task_3 = WriteTask::::new(region_1, 1, 15); init_write_batch(&engines, &mut task_3); - put_kv(&mut task_3.kv_wb, b"kv_k3", b"kv_v3"); - delete_kv(&mut task_3.kv_wb, b"kv_k1"); - put_raft_kv(&mut task_3.raft_wb, 37); - delete_raft_kv(&engines.raft, &mut task_3.raft_wb, 17); - task_3.entries.append(&mut vec![new_entry(6, 6)]); - task_3.cut_logs = Some((7, 8)); + put_kv(task_3.extra_write.v1_mut(), b"kv_k3", b"kv_v3"); + delete_kv(task_3.extra_write.v1_mut(), b"kv_k1"); + put_raft_kv(task_3.raft_wb.as_mut(), 37); + delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); + task_3.set_append(Some(8), vec![new_entry(6, 6)]); task_3.raft_state = Some(new_raft_state(6, 345, 6, 6)); task_3 .messages .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); - t.write_sender(0).send(WriteMsg::WriteTask(task_3)).unwrap(); + t.write_sender(0) + .send(WriteMsg::WriteTask(task_3), None) + .unwrap(); must_wait_same_notifies(vec![(region_1, (1, 15)), (region_2, (2, 20))], &t.notify_rx); @@ -432,7 +561,207 @@ fn test_basic_flow() { ], ); + must_have_same_count_msg(6, &t.msg_rx); + t.writers.shutdown(); +} + +#[test] +fn test_basic_flow_with_states() { + let region_1 = 1; + let region_2 = 2; + + let path = Builder::new() + .prefix("async-io-basic-states") + .tempdir() + .unwrap(); + let engines = 
new_temp_engine(&path); + let mut cfg = Config::default(); + cfg.store_io_pool_size = 2; + let mut t = TestWriters::new(cfg, &engines, None); + + let mut task_1 = WriteTask::::new(region_1, 1, 10); + task_1.raft_wb = Some(engines.raft.log_batch(0)); + let mut apply_state_1 = RaftApplyState::default(); + apply_state_1.applied_index = 2; + let mut region_state_1 = RegionLocalState::default(); + region_state_1 + .mut_region() + .mut_region_epoch() + .set_version(3); + let lb = task_1.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(region_1, 2, &apply_state_1).unwrap(); + lb.put_region_state(region_1, 2, ®ion_state_1).unwrap(); + put_raft_kv(task_1.raft_wb.as_mut(), 17); + task_1 + .entries + .append(&mut vec![new_entry(5, 5), new_entry(6, 5), new_entry(7, 5)]); + task_1.raft_state = Some(new_raft_state(5, 234, 6, 7)); + task_1 + .messages + .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); + + t.write_sender(0) + .send(WriteMsg::WriteTask(task_1), None) + .unwrap(); + + let mut task_2 = WriteTask::::new(2, 2, 20); + task_2.raft_wb = Some(engines.raft.log_batch(0)); + let mut apply_state_2 = RaftApplyState::default(); + apply_state_2.applied_index = 30; + let lb = task_2.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(2, 30, &apply_state_2).unwrap(); + put_raft_kv(task_2.raft_wb.as_mut(), 27); + task_2 + .entries + .append(&mut vec![new_entry(50, 12), new_entry(51, 13)]); + task_2.raft_state = Some(new_raft_state(13, 567, 49, 51)); + task_2 + .messages + .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); + + t.write_sender(1) + .send(WriteMsg::WriteTask(task_2), None) + .unwrap(); + + let mut task_3 = WriteTask::::new(region_1, 1, 15); + task_3.raft_wb = Some(engines.raft.log_batch(0)); + let mut apply_state_3 = RaftApplyState::default(); + apply_state_3.applied_index = 5; + let lb = task_3.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(region_1, 5, 
&apply_state_3).unwrap(); + put_raft_kv(task_3.raft_wb.as_mut(), 37); + delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); + task_3.set_append(Some(8), vec![new_entry(6, 6)]); + task_3.raft_state = Some(new_raft_state(6, 345, 6, 6)); + task_3 + .messages + .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); + + t.write_sender(0) + .send(WriteMsg::WriteTask(task_3), None) + .unwrap(); + + must_wait_same_notifies(vec![(region_1, (1, 15)), (region_2, (2, 20))], &t.notify_rx); + + assert_eq!(test_raft_kv(&engines.raft, 17), false); + assert_eq!(test_raft_kv(&engines.raft, 27), true); + assert_eq!(test_raft_kv(&engines.raft, 37), true); + + must_have_entries_and_state( + &engines.raft, + vec![ + ( + region_1, + vec![new_entry(5, 5), new_entry(6, 6)], + new_raft_state(6, 345, 6, 6), + ), + ( + region_2, + vec![new_entry(50, 12), new_entry(51, 13)], + new_raft_state(13, 567, 49, 51), + ), + ], + ); + assert_eq!( + engines.raft.get_apply_state(region_1, 5).unwrap().unwrap(), + apply_state_3 + ); + assert_eq!( + engines.raft.get_apply_state(region_2, 30).unwrap().unwrap(), + apply_state_2 + ); + assert_eq!( + engines.raft.get_region_state(region_1, 2).unwrap().unwrap(), + region_state_1 + ); + assert_eq!(engines.raft.get_region_state(region_2, 1).unwrap(), None); + must_have_same_count_msg(6, &t.msg_rx); t.writers.shutdown(); } + +#[test] +fn test_resource_group() { + let region_1 = 1; + let region_2 = 2; + + let resource_manager = Arc::new(ResourceGroupManager::default()); + let get_group = |name: &str, read_tokens: u64, write_tokens: u64| -> ResourceGroup { + let mut group = ResourceGroup::new(); + group.set_name(name.to_string()); + group.set_mode(GroupMode::RawMode); + let mut resource_setting = GroupRawResourceSettings::new(); + resource_setting + .mut_cpu() + .mut_settings() + .set_fill_rate(read_tokens); + resource_setting + .mut_io_write() + .mut_settings() + .set_fill_rate(write_tokens); + group.set_raw_resource_settings(resource_setting); 
+ group + }; + resource_manager.add_resource_group(get_group("group1", 10, 10)); + resource_manager.add_resource_group(get_group("group2", 100, 100)); + + let path = Builder::new().prefix("async-io-basic").tempdir().unwrap(); + let engines = new_temp_engine(&path); + let mut cfg = Config::default(); + cfg.store_io_pool_size = 1; + + let mut t = TestWriters::new(cfg, &engines, Some(resource_manager)); + + let (tx, rx) = mpsc::sync_channel(0); + t.write_sender(0).send(WriteMsg::Pause(rx), None).unwrap(); + + let mut r = WriteRouter::new("1".to_string()); + let mut task_1 = WriteTask::::new(region_1, 1, 10); + init_write_batch(&engines, &mut task_1); + put_raft_kv(task_1.raft_wb.as_mut(), 17); + let entries = vec![new_entry(5, 5), new_entry(6, 5), new_entry(7, 5)]; + let mut entries = entries + .into_iter() + .map(|mut e| { + let mut req = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_resource_group_name("group1".to_owned()); + req.set_header(header); + e.set_data(req.write_to_bytes().unwrap().into()); + e + }) + .collect(); + task_1.entries.append(&mut entries); + task_1.raft_state = Some(new_raft_state(5, 234, 6, 7)); + task_1 + .messages + .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); + r.send_write_msg(&mut t.ctx, None, WriteMsg::WriteTask(task_1)); + + let mut r = WriteRouter::new("2".to_string()); + let mut task_2 = WriteTask::::new(region_2, 2, 20); + init_write_batch(&engines, &mut task_2); + put_raft_kv(task_2.raft_wb.as_mut(), 27); + let entries = vec![new_entry(50, 12), new_entry(51, 13)]; + let mut entries = entries + .into_iter() + .map(|mut e| { + let mut req = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_resource_group_name("group2".to_owned()); + req.set_header(header); + e.set_data(req.write_to_bytes().unwrap().into()); + e + }) + .collect(); + task_2.entries.append(&mut entries); + task_2.raft_state = Some(new_raft_state(13, 567, 49, 51)); + 
task_2 + .messages + .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); + r.send_write_msg(&mut t.ctx, None, WriteMsg::WriteTask(task_2)); + + tx.send(()).unwrap(); + must_wait_same_notifies(vec![(region_1, (1, 10)), (region_2, (2, 20))], &t.notify_rx); +} diff --git a/components/raftstore/src/store/bootstrap.rs b/components/raftstore/src/store/bootstrap.rs index 12fb238dce8..249ae4b704f 100644 --- a/components/raftstore/src/store/bootstrap.rs +++ b/components/raftstore/src/store/bootstrap.rs @@ -5,13 +5,10 @@ use kvproto::{ metapb, raft_serverpb::{RaftLocalState, RegionLocalState, StoreIdent}, }; -use tikv_util::{box_err, box_try}; +use tikv_util::{box_err, box_try, store::new_peer}; -use super::{ - peer_storage::{ - write_initial_apply_state, write_initial_raft_state, INIT_EPOCH_CONF_VER, INIT_EPOCH_VER, - }, - util::new_peer, +use super::peer_storage::{ + write_initial_apply_state, write_initial_raft_state, INIT_EPOCH_CONF_VER, INIT_EPOCH_VER, }; use crate::Result; @@ -34,7 +31,7 @@ fn is_range_empty( end_key: &[u8], ) -> Result { let mut count: u32 = 0; - engine.scan_cf(cf, start_key, end_key, false, |_, _| { + engine.scan(cf, start_key, end_key, false, |_, _| { count += 1; Ok(false) })?; @@ -44,8 +41,8 @@ fn is_range_empty( // Bootstrap the store, the DB for this store must be empty and has no data. // -// FIXME: ER typaram should just be impl KvEngine, but RaftEngine doesn't support -// the `is_range_empty` query yet. +// FIXME: ER typaram should just be impl KvEngine, but RaftEngine doesn't +// support the `is_range_empty` query yet. 
pub fn bootstrap_store( engines: &Engines, cluster_id: u64, @@ -136,21 +133,17 @@ mod tests { fn test_bootstrap() { let path = Builder::new().prefix("var").tempdir().unwrap(); let raft_path = path.path().join("raft"); - let kv_engine = engine_test::kv::new_engine( - path.path().to_str().unwrap(), - None, - &[CF_DEFAULT, CF_RAFT], - None, - ) - .unwrap(); + let kv_engine = + engine_test::kv::new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT, CF_RAFT]) + .unwrap(); let raft_engine = engine_test::raft::new_engine(raft_path.to_str().unwrap(), None).unwrap(); let engines = Engines::new(kv_engine.clone(), raft_engine.clone()); let region = initial_region(1, 1, 1); - assert!(bootstrap_store(&engines, 1, 1).is_ok()); - assert!(bootstrap_store(&engines, 1, 1).is_err()); + bootstrap_store(&engines, 1, 1).unwrap(); + bootstrap_store(&engines, 1, 1).unwrap_err(); - assert!(prepare_bootstrap_cluster(&engines, ®ion).is_ok()); + prepare_bootstrap_cluster(&engines, ®ion).unwrap(); assert!( kv_engine .get_value(keys::PREPARE_BOOTSTRAP_KEY) @@ -171,8 +164,8 @@ mod tests { ); assert!(raft_engine.get_raft_state(1).unwrap().is_some()); - assert!(clear_prepare_bootstrap_key(&engines).is_ok()); - assert!(clear_prepare_bootstrap_cluster(&engines, 1).is_ok()); + clear_prepare_bootstrap_key(&engines).unwrap(); + clear_prepare_bootstrap_cluster(&engines, 1).unwrap(); assert!( is_range_empty( &kv_engine, diff --git a/components/raftstore/src/store/compaction_guard.rs b/components/raftstore/src/store/compaction_guard.rs index dc5690a2b34..efee09be906 100644 --- a/components/raftstore/src/store/compaction_guard.rs +++ b/components/raftstore/src/store/compaction_guard.rs @@ -47,8 +47,8 @@ impl CompactionGuardGeneratorFactory

{ } } -// Update to implement engine_traits::SstPartitionerFactory instead once we move to use abstracted -// ColumnFamilyOptions in src/config.rs. +// Update to implement engine_traits::SstPartitionerFactory instead once we move +// to use abstracted CfOptions in src/config.rs. impl SstPartitionerFactory for CompactionGuardGeneratorFactory

{ @@ -59,9 +59,9 @@ impl SstPartitionerFactory } fn create_partitioner(&self, context: &SstPartitionerContext<'_>) -> Option { - // create_partitioner can be called in RocksDB while holding db_mutex. It can block - // other operations on RocksDB. To avoid such caces, we defer region info query to - // the first time should_partition is called. + // create_partitioner can be called in RocksDB while holding db_mutex. It can + // block other operations on RocksDB. To avoid such cases, we defer + // region info query to the first time should_partition is called. Some(CompactionGuardGenerator { cf_name: self.cf_name, smallest_key: context.smallest_key.to_vec(), @@ -195,15 +195,15 @@ impl SstPartitioner for CompactionGuardGenerator

{ #[cfg(test)] mod tests { - use std::{str, sync::Arc}; + use std::str; use engine_rocks::{ - raw::{BlockBasedOptions, ColumnFamilyOptions, DBCompressionType, DBOptions}, - raw_util::{new_engine_opt, CFOptions}, - RocksEngine, RocksSstPartitionerFactory, RocksSstReader, + raw::{BlockBasedOptions, DBCompressionType}, + util::new_engine_opt, + RocksCfOptions, RocksDbOptions, RocksEngine, RocksSstPartitionerFactory, RocksSstReader, }; use engine_traits::{ - CompactExt, Iterator, MiscExt, SeekKey, SstReader, SyncMutable, CF_DEFAULT, + CompactExt, IterOptions, Iterator, MiscExt, RefIterable, SstReader, SyncMutable, CF_DEFAULT, }; use keys::DATA_PREFIX_KEY; use kvproto::metapb::Region; @@ -369,7 +369,7 @@ mod tests { fn new_test_db(provider: MockRegionInfoProvider) -> (RocksEngine, TempDir) { let temp_dir = TempDir::new().unwrap(); - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = RocksCfOptions::default(); cf_opts.set_target_file_size_base(MAX_OUTPUT_FILE_SIZE); cf_opts.set_sst_partitioner_factory(RocksSstPartitionerFactory( CompactionGuardGeneratorFactory::new(CF_DEFAULT, provider, MIN_OUTPUT_FILE_SIZE) @@ -385,26 +385,25 @@ mod tests { DBCompressionType::No, DBCompressionType::No, ]); - // Make block size small to make sure current_output_file_size passed to SstPartitioner - // is accurate. + // Make block size small to make sure current_output_file_size passed to + // SstPartitioner is accurate. 
let mut block_based_opts = BlockBasedOptions::new(); block_based_opts.set_block_size(100); cf_opts.set_block_based_table_factory(&block_based_opts); - let db = RocksEngine::from_db(Arc::new( - new_engine_opt( - temp_dir.path().to_str().unwrap(), - DBOptions::new(), - vec![CFOptions::new(CF_DEFAULT, cf_opts)], - ) - .unwrap(), - )); + let db = new_engine_opt( + temp_dir.path().to_str().unwrap(), + RocksDbOptions::default(), + vec![(CF_DEFAULT, cf_opts)], + ) + .unwrap(); (db, temp_dir) } fn collect_keys(path: &str) -> Vec> { - let mut sst_reader = RocksSstReader::open(path).unwrap().iter(); - let mut valid = sst_reader.seek(SeekKey::Start).unwrap(); + let reader = RocksSstReader::open(path).unwrap(); + let mut sst_reader = reader.iter(IterOptions::default()).unwrap(); + let mut valid = sst_reader.seek_to_first().unwrap(); let mut ret = vec![]; while valid { ret.push(sst_reader.key().to_owned()); @@ -441,26 +440,26 @@ mod tests { assert_eq!(b"z", DATA_PREFIX_KEY); // Create two overlapping SST files then force compaction. - // Region "a" will share a SST file with region "b", since region "a" is too small. - // Region "c" will be splitted into two SSTs, since its size is larger than - // target_file_size_base. + // Region "a" will share a SST file with region "b", since region "a" is too + // small. Region "c" will be splitted into two SSTs, since its size is + // larger than target_file_size_base. 
let value = vec![b'v'; 1024]; db.put(b"za1", b"").unwrap(); db.put(b"zb1", &value).unwrap(); db.put(b"zc1", &value).unwrap(); - db.flush(true /*sync*/).unwrap(); + db.flush_cfs(&[], true /* wait */).unwrap(); db.put(b"zb2", &value).unwrap(); db.put(b"zc2", &value).unwrap(); db.put(b"zc3", &value).unwrap(); db.put(b"zc4", &value).unwrap(); db.put(b"zc5", &value).unwrap(); db.put(b"zc6", &value).unwrap(); - db.flush(true /*sync*/).unwrap(); - db.compact_range( - CF_DEFAULT, None, /*start_key*/ - None, /*end_key*/ - false, /*exclusive_manual*/ - 1, /*max_subcompactions*/ + db.flush_cfs(&[], true /* wait */).unwrap(); + db.compact_range_cf( + CF_DEFAULT, None, // start_key + None, // end_key + false, // exclusive_manual + 1, // max_subcompactions ) .unwrap(); diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 87b299d4cbb..aabf173e674 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -37,7 +37,8 @@ with_prefix!(prefix_store "store-"); #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct Config { - // minimizes disruption when a partitioned node rejoins the cluster by using a two phase election. + // minimizes disruption when a partitioned node rejoins the cluster by using a two phase + // election. #[online_config(skip)] pub prevote: bool, #[online_config(skip)] @@ -67,6 +68,9 @@ pub struct Config { pub raft_log_compact_sync_interval: ReadableDuration, // Interval to gc unnecessary raft log. pub raft_log_gc_tick_interval: ReadableDuration, + // Interval to request voter_replicated_index for gc unnecessary raft log, + // if the leader has not initiated gc for a long time. + pub request_voter_replicated_index_interval: ReadableDuration, // A threshold to gc stale raft log, must >= 1. pub raft_log_gc_threshold: u64, // When entry count exceed this value, gc will be forced trigger. 
@@ -120,12 +124,13 @@ pub struct Config { /// the peer is considered to be down and is reported to PD. pub max_peer_down_duration: ReadableDuration, - /// If the leader of a peer is missing for longer than max_leader_missing_duration, - /// the peer would ask pd to confirm whether it is valid in any region. - /// If the peer is stale and is not valid in any region, it will destroy itself. + /// If the leader of a peer is missing for longer than + /// max_leader_missing_duration, the peer would ask pd to confirm + /// whether it is valid in any region. If the peer is stale and is not + /// valid in any region, it will destroy itself. pub max_leader_missing_duration: ReadableDuration, - /// Similar to the max_leader_missing_duration, instead it will log warnings and - /// try to alert monitoring systems, if there is any. + /// Similar to the max_leader_missing_duration, instead it will log warnings + /// and try to alert monitoring systems, if there is any. pub abnormal_leader_missing_duration: ReadableDuration, pub peer_stale_state_check_interval: ReadableDuration, @@ -135,6 +140,17 @@ pub struct Config { #[online_config(skip)] pub snap_apply_batch_size: ReadableSize, + // used to periodically check whether schedule pending applies in region runner + #[doc(hidden)] + #[online_config(skip)] + pub region_worker_tick_interval: ReadableDuration, + + // used to periodically check whether we should delete a stale peer's range in + // region runner + #[doc(hidden)] + #[online_config(skip)] + pub clean_stale_ranges_tick: usize, + // Interval (ms) to check region whether the data is consistent. pub consistency_check_interval: ReadableDuration, @@ -156,11 +172,11 @@ pub struct Config { #[online_config(hidden)] pub right_derive_when_split: bool, - /// This setting can only ensure conf remove will not be proposed by the peer - /// being removed. But it can't guarantee the remove is applied when the target - /// is not leader. 
That means we always need to check if it's working as expected - /// when a leader applies a self-remove conf change. Keep the configuration only - /// for convenient test. + /// This setting can only ensure conf remove will not be proposed by the + /// peer being removed. But it can't guarantee the remove is applied + /// when the target is not leader. That means we always need to check if + /// it's working as expected when a leader applies a self-remove conf + /// change. Keep the configuration only for convenient test. #[cfg(any(test, feature = "testexport"))] pub allow_remove_leader: bool, @@ -190,7 +206,6 @@ pub struct Config { pub store_batch_system: BatchSystemConfig, /// If it is 0, it means io tasks are handled in store threads. - #[online_config(skip)] pub store_io_pool_size: usize, #[online_config(skip)] @@ -205,6 +220,14 @@ pub struct Config { pub dev_assert: bool, #[online_config(hidden)] pub apply_yield_duration: ReadableDuration, + /// yield the fsm when apply flushed data size exceeds this threshold. + /// the yield is check after commit, so the actual handled messages can be + /// bigger than the configed value. + // NOTE: the default value is much smaller than the default max raft batch msg size(0.2 + // * raft_entry_max_size), this is intentional because in the common case, a raft entry + // is unlikely to exceed this threshold, but in case when raftstore is the bottleneck, + // we still allow big raft batch for better throughput. + pub apply_yield_write_size: ReadableSize, #[serde(with = "perf_level_serde")] #[online_config(skip)] @@ -213,9 +236,10 @@ pub struct Config { #[doc(hidden)] #[online_config(skip)] /// Disable this feature by set to 0, logic will be removed in other pr. - /// When TiKV memory usage reaches `memory_usage_high_water` it will try to limit memory - /// increasing. For raftstore layer entries will be evicted from entry cache, if they - /// utilize memory more than `evict_cache_on_memory_ratio` * total. 
+ /// When TiKV memory usage reaches `memory_usage_high_water` it will try to + /// limit memory increasing. For raftstore layer entries will be evicted + /// from entry cache, if they utilize memory more than + /// `evict_cache_on_memory_ratio` * total. /// /// Set it to 0 can disable cache evict. // By default it's 0.2. So for different system memory capacity, cache evict happens: @@ -226,13 +250,14 @@ pub struct Config { pub cmd_batch: bool, - /// When the count of concurrent ready exceeds this value, command will not be proposed - /// until the previous ready has been persisted. + /// When the count of concurrent ready exceeds this value, command will not + /// be proposed until the previous ready has been persisted. /// If `cmd_batch` is 0, this config will have no effect. /// If it is 0, it means no limit. pub cmd_batch_concurrent_ready_max_count: usize, - /// When the size of raft db writebatch exceeds this value, write will be triggered. + /// When the size of raft db writebatch exceeds this value, write will be + /// triggered. pub raft_write_size_limit: ReadableSize, pub waterfall_metrics: bool, @@ -256,7 +281,8 @@ pub struct Config { #[serde(skip_serializing)] #[online_config(skip)] pub region_split_size: ReadableSize, - // Deprecated! The time to clean stale peer safely can be decided based on RocksDB snapshot sequence number. + // Deprecated! The time to clean stale peer safely can be decided based on RocksDB snapshot + // sequence number. #[doc(hidden)] #[serde(skip_serializing)] #[online_config(skip)] @@ -265,19 +291,57 @@ pub struct Config { // Interval to inspect the latency of raftstore for slow store detection. 
pub inspect_interval: ReadableDuration, + // The unsensitive(increase it to reduce sensitiveness) of the cause-trend detection + pub slow_trend_unsensitive_cause: f64, + // The unsensitive(increase it to reduce sensitiveness) of the result-trend detection + pub slow_trend_unsensitive_result: f64, + // Interval to report min resolved ts, if it is zero, it means disabled. pub report_min_resolved_ts_interval: ReadableDuration, - /// Interval to check whether to reactivate in-memory pessimistic lock after being disabled - /// before transferring leader. + /// Interval to check whether to reactivate in-memory pessimistic lock after + /// being disabled before transferring leader. pub reactive_memory_lock_tick_interval: ReadableDuration, /// Max tick count before reactivating in-memory pessimistic lock. pub reactive_memory_lock_timeout_tick: usize, // Interval of scheduling a tick to report region buckets. pub report_region_buckets_tick_interval: ReadableDuration, + /// Interval to check long uncommitted proposals. + #[doc(hidden)] + pub check_long_uncommitted_interval: ReadableDuration, + /// Base threshold of long uncommitted proposal. + #[doc(hidden)] + pub long_uncommitted_base_threshold: ReadableDuration, + + /// Max duration for the entry cache to be warmed up. + /// Set it to 0 to disable warmup. + pub max_entry_cache_warmup_duration: ReadableDuration, + #[doc(hidden)] pub max_snapshot_file_raw_size: ReadableSize, + + pub unreachable_backoff: ReadableDuration, + + #[doc(hidden)] + #[serde(skip_serializing)] + #[online_config(hidden)] + // Interval to check peers availability info. + pub check_peers_availability_interval: ReadableDuration, + + #[doc(hidden)] + #[serde(skip_serializing)] + #[online_config(hidden)] + // Interval to check if need to request snapshot. + pub check_request_snapshot_interval: ReadableDuration, + + /// Make raftstore v1 learners compatible with raftstore v2 by: + /// * Recving tablet snapshot from v2. 
+ /// * Responsing GcPeerRequest from v2. + #[doc(hidden)] + #[online_config(hidden)] + #[serde(alias = "enable-partitioned-raft-kv-compatible-learner")] + pub enable_v2_compatible_learner: bool, } impl Default for Config { @@ -296,6 +360,7 @@ impl Default for Config { raft_entry_max_size: ReadableSize::mb(8), raft_log_compact_sync_interval: ReadableDuration::secs(2), raft_log_gc_tick_interval: ReadableDuration::secs(3), + request_voter_replicated_index_interval: ReadableDuration::minutes(5), raft_log_gc_threshold: 50, raft_log_gc_count_limit: None, raft_log_gc_size_limit: None, @@ -321,6 +386,12 @@ impl Default for Config { peer_stale_state_check_interval: ReadableDuration::minutes(5), leader_transfer_max_log_lag: 128, snap_apply_batch_size: ReadableSize::mb(10), + region_worker_tick_interval: if cfg!(feature = "test") { + ReadableDuration::millis(200) + } else { + ReadableDuration::millis(1000) + }, + clean_stale_ranges_tick: if cfg!(feature = "test") { 1 } else { 10 }, lock_cf_compact_interval: ReadableDuration::minutes(10), lock_cf_compact_bytes_threshold: ReadableSize::mb(256), // Disable consistency check by default as it will hurt performance. @@ -345,6 +416,7 @@ impl Default for Config { hibernate_regions: true, dev_assert: false, apply_yield_duration: ReadableDuration::millis(500), + apply_yield_write_size: ReadableSize::kb(32), perf_level: PerfLevel::Uninitialized, evict_cache_on_memory_ratio: 0.0, cmd_batch: true, @@ -356,17 +428,36 @@ impl Default for Config { raft_msg_flush_interval: ReadableDuration::micros(250), reactive_memory_lock_tick_interval: ReadableDuration::secs(2), reactive_memory_lock_timeout_tick: 5, + check_long_uncommitted_interval: ReadableDuration::secs(10), + /// In some cases, such as rolling upgrade, some regions' commit log + /// duration can be 12 seconds. Before #13078 is merged, + /// the commit log duration can be 2.8 minutes. So maybe + /// 20s is a relatively reasonable base threshold. 
Generally, + /// the log commit duration is less than 1s. Feel free to adjust + /// this config :) + long_uncommitted_base_threshold: ReadableDuration::secs(20), + max_entry_cache_warmup_duration: ReadableDuration::secs(1), // They are preserved for compatibility check. region_max_size: ReadableSize(0), region_split_size: ReadableSize(0), clean_stale_peer_delay: ReadableDuration::minutes(0), inspect_interval: ReadableDuration::millis(500), - report_min_resolved_ts_interval: ReadableDuration::millis(0), + // The param `slow_trend_unsensitive_cause == 2.0` can yield good results, + // make it `10.0` to reduce a bit sensitiveness because SpikeFilter is disabled + slow_trend_unsensitive_cause: 10.0, + slow_trend_unsensitive_result: 0.5, + report_min_resolved_ts_interval: ReadableDuration::secs(1), check_leader_lease_interval: ReadableDuration::secs(0), renew_leader_lease_advance_duration: ReadableDuration::secs(0), report_region_buckets_tick_interval: ReadableDuration::secs(10), max_snapshot_file_raw_size: ReadableSize::mb(100), + unreachable_backoff: ReadableDuration::secs(10), + // TODO: make its value reasonable + check_peers_availability_interval: ReadableDuration::secs(30), + // TODO: make its value reasonable + check_request_snapshot_interval: ReadableDuration::minutes(1), + enable_v2_compatible_learner: false, } } } @@ -376,6 +467,24 @@ impl Config { Config::default() } + pub fn new_raft_config(&self, peer_id: u64, applied_index: u64) -> raft::Config { + raft::Config { + id: peer_id, + election_tick: self.raft_election_timeout_ticks, + heartbeat_tick: self.raft_heartbeat_ticks, + min_election_tick: self.raft_min_election_timeout_ticks, + max_election_tick: self.raft_max_election_timeout_ticks, + max_size_per_msg: self.raft_max_size_per_msg.0, + max_inflight_msgs: self.raft_max_inflight_msgs, + applied: applied_index, + check_quorum: true, + skip_bcast_commit: true, + pre_vote: self.prevote, + max_committed_size_per_ready: ReadableSize::mb(16).0, + 
..Default::default() + } + } + pub fn raft_store_max_leader_lease(&self) -> TimeDuration { TimeDuration::from_std(self.raft_store_max_leader_lease.0).unwrap() } @@ -404,6 +513,11 @@ impl Config { self.raft_log_gc_size_limit.unwrap() } + #[inline] + pub fn warmup_entry_cache_enabled(&self) -> bool { + self.max_entry_cache_warmup_duration.0 != Duration::from_secs(0) + } + pub fn region_split_check_diff(&self) -> ReadableSize { self.region_split_check_diff.unwrap() } @@ -460,8 +574,8 @@ impl Config { )); } - // The adjustment of this value is related to the number of regions, usually 16384 is - // already a large enough value + // The adjustment of this value is related to the number of regions, usually + // 16384 is already a large enough value if self.raft_max_inflight_msgs == 0 || self.raft_max_inflight_msgs > 16384 { return Err(box_err!( "raft max inflight msgs should be greater than 0 and less than or equal to 16384" @@ -490,7 +604,7 @@ impl Config { let election_timeout = self.raft_base_tick_interval.as_millis() * self.raft_election_timeout_ticks as u64; - let lease = self.raft_store_max_leader_lease.as_millis() as u64; + let lease = self.raft_store_max_leader_lease.as_millis(); if election_timeout < lease { return Err(box_err!( "election timeout {} ms is less than lease {} ms", @@ -499,7 +613,7 @@ impl Config { )); } - let tick = self.raft_base_tick_interval.as_millis() as u64; + let tick = self.raft_base_tick_interval.as_millis(); if lease > election_timeout - tick { return Err(box_err!( "lease {} ms should not be greater than election timeout {} ms - 1 tick({} ms)", @@ -513,7 +627,7 @@ impl Config { return Err(box_err!("raftstore.merge-check-tick-interval can't be 0.")); } - let stale_state_check = self.peer_stale_state_check_interval.as_millis() as u64; + let stale_state_check = self.peer_stale_state_check_interval.as_millis(); if stale_state_check < election_timeout * 2 { return Err(box_err!( "peer stale state check interval {} ms is less than election 
timeout x 2 {} ms", @@ -528,7 +642,7 @@ impl Config { )); } - let abnormal_leader_missing = self.abnormal_leader_missing_duration.as_millis() as u64; + let abnormal_leader_missing = self.abnormal_leader_missing_duration.as_millis(); if abnormal_leader_missing < stale_state_check { return Err(box_err!( "abnormal leader missing {} ms is less than peer stale state check interval {} ms", @@ -537,7 +651,7 @@ impl Config { )); } - let max_leader_missing = self.max_leader_missing_duration.as_millis() as u64; + let max_leader_missing = self.max_leader_missing_duration.as_millis(); if max_leader_missing < abnormal_leader_missing { return Err(box_err!( "max leader missing {} ms is less than abnormal leader missing {} ms", @@ -563,7 +677,7 @@ impl Config { // prevent mistakenly inputting too large values, the max limit is made // according to the cpu quota * 10. Notice 10 is only an estimate, not an // empirical value. - let limit = SysQuota::cpu_cores_quota() as usize * 10; + let limit = (SysQuota::cpu_cores_quota() * 10.0) as usize; if self.apply_batch_system.pool_size == 0 || self.apply_batch_system.pool_size > limit { return Err(box_err!( "apply-pool-size should be greater than 0 and less than or equal to: {}", @@ -728,6 +842,9 @@ impl Config { CONFIG_RAFTSTORE_GAUGE .with_label_values(&["raft_log_gc_tick_interval"]) .set(self.raft_log_gc_tick_interval.as_secs_f64()); + CONFIG_RAFTSTORE_GAUGE + .with_label_values(&["request_voter_replicated_index_interval"]) + .set(self.request_voter_replicated_index_interval.as_secs_f64()); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["raft_log_gc_threshold"]) .set(self.raft_log_gc_threshold as f64); @@ -840,6 +957,9 @@ impl Config { CONFIG_RAFTSTORE_GAUGE .with_label_values(&["local_read_batch_size"]) .set(self.local_read_batch_size as f64); + CONFIG_RAFTSTORE_GAUGE + .with_label_values(&["apply_yield_write_size"]) + .set(self.apply_yield_write_size.0 as f64); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["apply_max_batch_size"]) 
.set(self.apply_batch_system.max_batch_size() as f64); @@ -946,8 +1066,25 @@ impl ConfigManager for RaftstoreConfigManager { ) -> std::result::Result<(), Box> { { let change = change.clone(); - self.config - .update(move |cfg: &mut Config| cfg.update(change)); + self.config.update(move |cfg: &mut Config| { + // Currently, it's forbidden to modify the write mode either from `async` to + // `sync` or from `sync` to `async`. + if let Some(ConfigValue::Usize(resized_io_size)) = change.get("store_io_pool_size") + { + if cfg.store_io_pool_size == 0 && *resized_io_size > 0 { + return Err( + "SYNC mode, not allowed to resize the size of store-io-pool-size" + .into(), + ); + } else if cfg.store_io_pool_size > 0 && *resized_io_size == 0 { + return Err( + "ASYNC mode, not allowed to be set to SYNC mode by resizing store-io-pool-size to 0" + .into(), + ); + } + } + cfg.update(change) + })?; } if let Some(ConfigValue::Module(raft_batch_system_change)) = change.get("store_batch_system") @@ -959,6 +1096,12 @@ impl ConfigManager for RaftstoreConfigManager { { self.schedule_config_change(RaftStoreBatchComponent::Apply, apply_batch_system_change); } + if let Some(ConfigValue::Usize(resized_io_size)) = change.get("store_io_pool_size") { + let resize_io_task = RefreshConfigTask::ScaleWriters(*resized_io_size); + if let Err(e) = self.scheduler.schedule(resize_io_task) { + error!("raftstore configuration manager schedule to resize store-io-pool-size work task failed"; "err"=> ?e); + } + } info!( "raftstore config changed"; "change" => ?change, @@ -975,7 +1118,7 @@ mod tests { #[test] fn test_config_validate() { - let split_size = ReadableSize::mb(coprocessor::config::SPLIT_SIZE_MB); + let split_size = coprocessor::config::SPLIT_SIZE; let mut cfg = Config::new(); cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!( @@ -988,12 +1131,14 @@ mod tests { ); cfg.raft_heartbeat_ticks = 0; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + 
cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_election_timeout_ticks = 10; cfg.raft_heartbeat_ticks = 10; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_min_election_timeout_ticks = 5; @@ -1006,100 +1151,112 @@ mod tests { cfg.validate(split_size, false, ReadableSize(0)).unwrap(); cfg.raft_heartbeat_ticks = 11; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_log_gc_threshold = 0; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_log_gc_size_limit = Some(ReadableSize(0)); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_log_gc_size_limit = None; - assert!( - cfg.validate(ReadableSize(20), false, ReadableSize(0)) - .is_ok() - ); + cfg.validate(ReadableSize(20), false, ReadableSize(0)) + .unwrap(); assert_eq!(cfg.raft_log_gc_size_limit, Some(ReadableSize(15))); cfg = Config::new(); cfg.raft_base_tick_interval = ReadableDuration::secs(1); cfg.raft_election_timeout_ticks = 10; cfg.raft_store_max_leader_lease = ReadableDuration::secs(20); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_log_gc_count_limit = Some(100); cfg.merge_max_log_gap = 110; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_log_gc_count_limit = None; - assert!( - cfg.validate(ReadableSize::mb(1), false, ReadableSize(0)) - .is_ok() - ); + 
cfg.validate(ReadableSize::mb(1), false, ReadableSize(0)) + .unwrap(); assert_eq!(cfg.raft_log_gc_count_limit, Some(768)); cfg = Config::new(); cfg.merge_check_tick_interval = ReadableDuration::secs(0); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_base_tick_interval = ReadableDuration::secs(1); cfg.raft_election_timeout_ticks = 10; cfg.peer_stale_state_check_interval = ReadableDuration::secs(5); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.peer_stale_state_check_interval = ReadableDuration::minutes(2); cfg.abnormal_leader_missing_duration = ReadableDuration::minutes(1); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.abnormal_leader_missing_duration = ReadableDuration::minutes(2); cfg.max_leader_missing_duration = ReadableDuration::minutes(1); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.local_read_batch_size = 0; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.apply_batch_system.max_batch_size = Some(0); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.apply_batch_system.pool_size = 0; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.store_batch_system.max_batch_size = Some(0); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + 
cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.store_batch_system.pool_size = 0; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.apply_batch_system.max_batch_size = Some(10241); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.store_batch_system.max_batch_size = Some(10241); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.hibernate_regions = true; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_ok()); + cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!(cfg.store_batch_system.max_batch_size, Some(256)); assert_eq!(cfg.apply_batch_system.max_batch_size, Some(256)); cfg = Config::new(); cfg.hibernate_regions = false; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_ok()); + cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!(cfg.store_batch_system.max_batch_size, Some(1024)); assert_eq!(cfg.apply_batch_system.max_batch_size, Some(256)); @@ -1107,62 +1264,69 @@ mod tests { cfg.hibernate_regions = true; cfg.store_batch_system.max_batch_size = Some(123); cfg.apply_batch_system.max_batch_size = Some(234); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_ok()); + cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!(cfg.store_batch_system.max_batch_size, Some(123)); assert_eq!(cfg.apply_batch_system.max_batch_size, Some(234)); cfg = Config::new(); cfg.future_poll_size = 0; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.snap_generator_pool_size = 0; - assert!(cfg.validate(split_size, 
false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_base_tick_interval = ReadableDuration::secs(1); cfg.raft_election_timeout_ticks = 11; cfg.raft_store_max_leader_lease = ReadableDuration::secs(11); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.hibernate_regions = true; cfg.max_peer_down_duration = ReadableDuration::minutes(5); cfg.peer_stale_state_check_interval = ReadableDuration::minutes(5); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_ok()); + cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!(cfg.max_peer_down_duration, ReadableDuration::minutes(10)); cfg = Config::new(); cfg.raft_max_size_per_msg = ReadableSize(0); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg.raft_max_size_per_msg = ReadableSize::gb(64); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg.raft_max_size_per_msg = ReadableSize::gb(3); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_ok()); + cfg.validate(split_size, false, ReadableSize(0)).unwrap(); cfg = Config::new(); cfg.raft_entry_max_size = ReadableSize(0); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg.raft_entry_max_size = ReadableSize::mb(3073); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg.raft_entry_max_size = ReadableSize::gb(3); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_ok()); + cfg.validate(split_size, false, ReadableSize(0)).unwrap(); cfg = Config::new(); - assert!(cfg.validate(split_size, false, 
ReadableSize(0)).is_ok()); + cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!(cfg.region_split_check_diff(), split_size / 16); cfg = Config::new(); - assert!(cfg.validate(split_size, true, split_size / 8).is_ok()); + cfg.validate(split_size, true, split_size / 8).unwrap(); assert_eq!(cfg.region_split_check_diff(), split_size / 16); cfg = Config::new(); - assert!(cfg.validate(split_size, true, split_size / 20).is_ok()); + cfg.validate(split_size, true, split_size / 20).unwrap(); assert_eq!(cfg.region_split_check_diff(), split_size / 20); cfg = Config::new(); cfg.region_split_check_diff = Some(ReadableSize(1)); - assert!(cfg.validate(split_size, true, split_size / 20).is_ok()); + cfg.validate(split_size, true, split_size / 20).unwrap(); assert_eq!(cfg.region_split_check_diff(), ReadableSize(1)); } } diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs new file mode 100644 index 00000000000..f5226961a6c --- /dev/null +++ b/components/raftstore/src/store/entry_storage.rs @@ -0,0 +1,1866 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains the implementation of the `EntryStorage`, which covers +//! a subset of raft storage. This module will be shared between raftstore v1 +//! and v2. 
+ +use std::{ + cell::{Cell, RefCell}, + cmp, + collections::VecDeque, + mem, + ops::Range, + sync::{Arc, Mutex}, + time::Duration, +}; + +use collections::HashMap; +use engine_traits::{KvEngine, RaftEngine, RAFT_LOG_MULTI_GET_CNT}; +use fail::fail_point; +use kvproto::{ + metapb, + raft_serverpb::{RaftApplyState, RaftLocalState}, +}; +use protobuf::Message; +use raft::{prelude::*, util::limit_size, GetEntriesContext, StorageError, INVALID_INDEX}; +use tikv_alloc::TraceEvent; +use tikv_util::{box_err, debug, error, info, time::Instant, warn, worker::Scheduler}; + +use super::{ + metrics::*, peer_storage::storage_error, WriteTask, MEMTRACE_ENTRY_CACHE, RAFT_INIT_LOG_INDEX, + RAFT_INIT_LOG_TERM, +}; +use crate::{bytes_capacity, store::ReadTask, Result}; + +const MAX_ASYNC_FETCH_TRY_CNT: usize = 3; +const SHRINK_CACHE_CAPACITY: usize = 64; +const ENTRY_MEM_SIZE: usize = mem::size_of::(); + +pub const MAX_WARMED_UP_CACHE_KEEP_TIME: Duration = Duration::from_secs(10); +pub const MAX_INIT_ENTRY_COUNT: usize = 1024; + +#[inline] +pub fn first_index(state: &RaftApplyState) -> u64 { + state.get_truncated_state().get_index() + 1 +} + +#[inline] +pub fn last_index(state: &RaftLocalState) -> u64 { + state.get_last_index() +} + +/// Committed entries sent to apply threads. +#[derive(Clone)] +pub struct CachedEntries { + pub range: Range, + // Entries and dangle size for them. `dangle` means not in entry cache. + entries: Arc, usize)>>, +} + +impl CachedEntries { + pub fn new(entries: Vec) -> Self { + assert!(!entries.is_empty()); + let start = entries.first().map(|x| x.index).unwrap(); + let end = entries.last().map(|x| x.index).unwrap() + 1; + let range = Range { start, end }; + CachedEntries { + entries: Arc::new(Mutex::new((entries, 0))), + range, + } + } + + pub fn iter_entries(&self, mut f: impl FnMut(&Entry)) { + let entries = self.entries.lock().unwrap(); + for entry in &entries.0 { + f(entry); + } + } + + /// Take cached entries and dangle size for them. 
`dangle` means not in + /// entry cache. + pub fn take_entries(&self) -> (Vec, usize) { + mem::take(&mut *self.entries.lock().unwrap()) + } +} + +struct EntryCache { + // The last index of persisted entry. + // It should be equal to `RaftLog::persisted`. + persisted: u64, + cache: VecDeque, + trace: VecDeque, + hit: Cell, + miss: Cell, + #[cfg(test)] + size_change_cb: Option>, +} + +impl EntryCache { + fn first_index(&self) -> Option { + self.cache.front().map(|e| e.get_index()) + } + + fn fetch_entries_to( + &self, + begin: u64, + end: u64, + mut fetched_size: u64, + max_size: u64, + ents: &mut Vec, + ) { + if begin >= end { + return; + } + assert!(!self.cache.is_empty()); + let cache_low = self.cache.front().unwrap().get_index(); + let start_idx = begin.checked_sub(cache_low).unwrap() as usize; + let limit_idx = end.checked_sub(cache_low).unwrap() as usize; + + let mut end_idx = start_idx; + self.cache + .iter() + .skip(start_idx) + .take_while(|e| { + let cur_idx = end_idx as u64 + cache_low; + assert_eq!(e.get_index(), cur_idx); + let m = u64::from(e.compute_size()); + fetched_size += m; + if fetched_size == m { + end_idx += 1; + fetched_size <= max_size && end_idx < limit_idx + } else if fetched_size <= max_size { + end_idx += 1; + end_idx < limit_idx + } else { + false + } + }) + .count(); + // Cache either is empty or contains latest log. Hence we don't need to fetch + // log from rocksdb anymore. 
+ assert!(end_idx == limit_idx || fetched_size > max_size); + let (first, second) = tikv_util::slices_in_range(&self.cache, start_idx, end_idx); + ents.extend_from_slice(first); + ents.extend_from_slice(second); + } + + fn append(&mut self, region_id: u64, peer_id: u64, entries: &[Entry]) { + if !entries.is_empty() { + let mut mem_size_change = 0; + let old_capacity = self.cache.capacity(); + mem_size_change += self.append_impl(region_id, peer_id, entries); + let new_capacity = self.cache.capacity(); + mem_size_change += Self::cache_vec_mem_size_change(new_capacity, old_capacity); + mem_size_change += self.shrink_if_necessary(); + self.flush_mem_size_change(mem_size_change); + } + } + + /// Push entries to the left of the cache. + /// + /// When cache is not empty, the index of the last entry in entries + /// should be equal to `cache first index - 1`. When cache is + /// empty, it should be equal to the store's last index. Otherwise, + /// append new entries may fail due to unexpected hole. 
+ fn prepend(&mut self, entries: Vec) { + let mut mem_size_change = 0; + let old_capacity = self.cache.capacity(); + for e in entries.into_iter().rev() { + mem_size_change += (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64; + self.cache.push_front(e); + } + let new_capacity = self.cache.capacity(); + mem_size_change += Self::cache_vec_mem_size_change(new_capacity, old_capacity); + mem_size_change += self.shrink_if_necessary(); + self.flush_mem_size_change(mem_size_change); + } + + fn append_impl(&mut self, region_id: u64, peer_id: u64, entries: &[Entry]) -> i64 { + let mut mem_size_change = 0; + + if let Some(cache_last_index) = self.cache.back().map(|e| e.get_index()) { + let first_index = entries[0].get_index(); + if cache_last_index >= first_index { + let cache_len = self.cache.len(); + let truncate_to = cache_len + .checked_sub((cache_last_index - first_index + 1) as usize) + .unwrap_or_default(); + let trunc_to_idx = self.cache[truncate_to].index; + for e in self.cache.drain(truncate_to..) { + mem_size_change -= + (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64; + } + if let Some(cached) = self.trace.back() { + // Only committed entries can be traced, and only uncommitted entries + // can be truncated. So there won't be any overlaps. + let cached_last = cached.range.end - 1; + assert!(cached_last < trunc_to_idx); + } + } else if cache_last_index + 1 < first_index { + panic!( + "[region {}] {} unexpected hole: {} < {}", + region_id, peer_id, cache_last_index, first_index + ); + } + } + + for e in entries { + self.cache.push_back(e.to_owned()); + mem_size_change += (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64; + } + // In the past, the entry cache will be truncated if its size exceeds a certain + // number. However, after introducing async write io, the entry must stay in + // cache if it's not persisted to raft db because the raft-rs may need to read + // entries.(e.g. 
leader sends MsgAppend to followers) + + mem_size_change + } + + pub fn entry(&self, idx: u64) -> Option<&Entry> { + let cache_low = self.cache.front()?.get_index(); + if idx >= cache_low { + Some(&self.cache[(idx - cache_low) as usize]) + } else { + None + } + } + + /// Compact all entries whose indexes are less than `idx`. + pub fn compact_to(&mut self, mut idx: u64) -> u64 { + if idx > self.persisted + 1 { + // Only the persisted entries can be compacted + idx = self.persisted + 1; + } + + let mut mem_size_change = 0; + + // Clean cached entries which have been already sent to apply threads. For + // example, if entries [1, 10), [10, 20), [20, 30) are sent to apply threads and + // `compact_to(15)` is called, only [20, 30) will still be kept in cache. + let old_trace_cap = self.trace.capacity(); + while let Some(cached_entries) = self.trace.pop_front() { + if cached_entries.range.start >= idx { + self.trace.push_front(cached_entries); + let trace_len = self.trace.len(); + let trace_cap = self.trace.capacity(); + if trace_len < SHRINK_CACHE_CAPACITY && trace_cap > SHRINK_CACHE_CAPACITY { + self.trace.shrink_to(SHRINK_CACHE_CAPACITY); + } + break; + } + let (_, dangle_size) = cached_entries.take_entries(); + mem_size_change -= dangle_size as i64; + idx = cmp::max(cached_entries.range.end, idx); + } + let new_trace_cap = self.trace.capacity(); + mem_size_change += Self::trace_vec_mem_size_change(new_trace_cap, old_trace_cap); + + let cache_first_idx = self.first_index().unwrap_or(u64::MAX); + if cache_first_idx >= idx { + self.flush_mem_size_change(mem_size_change); + assert!(mem_size_change <= 0); + return -mem_size_change as u64; + } + + let cache_last_idx = self.cache.back().unwrap().get_index(); + // Use `cache_last_idx + 1` to make sure cache can be cleared completely if + // necessary. 
+ let compact_to = (cmp::min(cache_last_idx + 1, idx) - cache_first_idx) as usize; + for e in self.cache.drain(..compact_to) { + mem_size_change -= (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64 + } + + mem_size_change += self.shrink_if_necessary(); + self.flush_mem_size_change(mem_size_change); + assert!(mem_size_change <= 0); + -mem_size_change as u64 + } + + fn total_mem_size(&self) -> i64 { + let data_size: i64 = self + .cache + .iter() + .map(|e| (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64) + .sum(); + let cache_vec_size = Self::cache_vec_mem_size_change(self.cache.capacity(), 0); + let trace_vec_size = Self::trace_vec_mem_size_change(self.trace.capacity(), 0); + data_size + cache_vec_size + trace_vec_size + } + + fn cache_vec_mem_size_change(new_capacity: usize, old_capacity: usize) -> i64 { + ENTRY_MEM_SIZE as i64 * (new_capacity as i64 - old_capacity as i64) + } + + fn trace_vec_mem_size_change(new_capacity: usize, old_capacity: usize) -> i64 { + mem::size_of::() as i64 * (new_capacity as i64 - old_capacity as i64) + } + + fn flush_mem_size_change(&self, mem_size_change: i64) { + #[cfg(test)] + if let Some(size_change_cb) = self.size_change_cb.as_ref() { + size_change_cb(mem_size_change); + } + let event = if mem_size_change > 0 { + TraceEvent::Add(mem_size_change as usize) + } else { + TraceEvent::Sub(-mem_size_change as usize) + }; + MEMTRACE_ENTRY_CACHE.trace(event); + RAFT_ENTRIES_CACHES_GAUGE.add(mem_size_change); + } + + fn flush_stats(&self) { + let hit = self.hit.replace(0); + RAFT_ENTRY_FETCHES.hit.inc_by(hit); + let miss = self.miss.replace(0); + RAFT_ENTRY_FETCHES.miss.inc_by(miss); + } + + #[inline] + fn is_empty(&self) -> bool { + self.cache.is_empty() + } + + fn trace_cached_entries(&mut self, entries: CachedEntries) { + let dangle_size = { + let mut guard = entries.entries.lock().unwrap(); + + let last_idx = guard.0.last().map(|e| e.index).unwrap(); + let cache_front = match self.cache.front().map(|e| e.index) 
{ + Some(i) => i, + None => u64::MAX, + }; + + let dangle_range = if last_idx < cache_front { + // All entries are not in entry cache. + 0..guard.0.len() + } else if let Ok(i) = guard.0.binary_search_by(|e| e.index.cmp(&cache_front)) { + // Some entries are in entry cache. + 0..i + } else { + // All entries are in entry cache. + 0..0 + }; + + let mut size = 0; + for e in &guard.0[dangle_range] { + size += bytes_capacity(&e.data) + bytes_capacity(&e.context); + } + guard.1 = size; + size + }; + + let old_capacity = self.trace.capacity(); + self.trace.push_back(entries); + let new_capacity = self.trace.capacity(); + let diff = Self::trace_vec_mem_size_change(new_capacity, old_capacity); + + self.flush_mem_size_change(diff + dangle_size as i64); + } + + fn shrink_if_necessary(&mut self) -> i64 { + if self.cache.len() < SHRINK_CACHE_CAPACITY && self.cache.capacity() > SHRINK_CACHE_CAPACITY + { + let old_capacity = self.cache.capacity(); + self.cache.shrink_to_fit(); + let new_capacity = self.cache.capacity(); + return Self::cache_vec_mem_size_change(new_capacity, old_capacity); + } + 0 + } + + fn update_persisted(&mut self, persisted: u64) { + self.persisted = persisted; + } +} + +impl Default for EntryCache { + fn default() -> Self { + let entry_cache = EntryCache { + persisted: 0, + cache: Default::default(), + trace: Default::default(), + hit: Cell::new(0), + miss: Cell::new(0), + #[cfg(test)] + size_change_cb: None, + }; + entry_cache.flush_mem_size_change(entry_cache.total_mem_size()); + entry_cache + } +} + +impl Drop for EntryCache { + fn drop(&mut self) { + let mem_size_change = self.total_mem_size(); + self.flush_mem_size_change(-mem_size_change); + self.flush_stats(); + } +} + +#[derive(Debug)] +pub enum RaftlogFetchState { + // The Instant records the start time of the fetching. 
+ Fetching(Instant), + Fetched(Box), +} + +#[derive(Debug, PartialEq)] +pub struct RaftlogFetchResult { + pub ents: raft::Result>, + // because entries may be empty, so store the original low index that the task issued + pub low: u64, + // the original max size that the task issued + pub max_size: u64, + // if the ents hit max_size + pub hit_size_limit: bool, + // the times that async fetch have already tried + pub tried_cnt: usize, + // the term when the task issued + pub term: u64, +} + +#[derive(Default)] +struct AsyncFetchStats { + async_fetch: Cell, + sync_fetch: Cell, + fallback_fetch: Cell, + fetch_invalid: Cell, + fetch_unused: Cell, +} + +impl AsyncFetchStats { + fn flush_stats(&mut self) { + RAFT_ENTRY_FETCHES + .async_fetch + .inc_by(self.async_fetch.replace(0)); + RAFT_ENTRY_FETCHES + .sync_fetch + .inc_by(self.sync_fetch.replace(0)); + RAFT_ENTRY_FETCHES + .fallback_fetch + .inc_by(self.fallback_fetch.replace(0)); + RAFT_ENTRY_FETCHES + .fetch_invalid + .inc_by(self.fetch_invalid.replace(0)); + RAFT_ENTRY_FETCHES + .fetch_unused + .inc_by(self.fetch_unused.replace(0)); + } +} + +fn validate_states( + region_id: u64, + raft_engine: &ER, + raft_state: &mut RaftLocalState, + apply_state: &RaftApplyState, +) -> Result<()> { + let last_index = raft_state.get_last_index(); + let mut commit_index = raft_state.get_hard_state().get_commit(); + let recorded_commit_index = apply_state.get_commit_index(); + let state_str = || -> String { + format!( + "region {}, raft state {:?}, apply state {:?}", + region_id, raft_state, apply_state + ) + }; + // The commit index of raft state may be less than the recorded commit index. + // If so, forward the commit index. 
+ if commit_index < recorded_commit_index { + let entry = raft_engine.get_entry(region_id, recorded_commit_index)?; + if entry.map_or(true, |e| e.get_term() != apply_state.get_commit_term()) { + return Err(box_err!( + "log at recorded commit index [{}] {} doesn't exist, may lose data, {}", + apply_state.get_commit_term(), + recorded_commit_index, + state_str() + )); + } + info!("updating commit index"; "region_id" => region_id, "old" => commit_index, "new" => recorded_commit_index); + commit_index = recorded_commit_index; + } + // Invariant: applied index <= max(commit index, recorded commit index) + if apply_state.get_applied_index() > commit_index { + return Err(box_err!( + "applied index > max(commit index, recorded commit index), {}", + state_str() + )); + } + // Invariant: max(commit index, recorded commit index) <= last index + if commit_index > last_index { + return Err(box_err!( + "max(commit index, recorded commit index) > last index, {}", + state_str() + )); + } + // Since the entries must be persisted before applying, the term of raft state + // should also be persisted. So it should be greater than the commit term of + // apply state. 
+ if raft_state.get_hard_state().get_term() < apply_state.get_commit_term() { + return Err(box_err!( + "term of raft state < commit term of apply state, {}", + state_str() + )); + } + + raft_state.mut_hard_state().set_commit(commit_index); + + Ok(()) +} + +pub fn init_last_term( + raft_engine: &ER, + region: &metapb::Region, + raft_state: &RaftLocalState, + apply_state: &RaftApplyState, +) -> Result { + let last_idx = raft_state.get_last_index(); + if last_idx == 0 { + return Ok(0); + } else if last_idx == RAFT_INIT_LOG_INDEX { + return Ok(RAFT_INIT_LOG_TERM); + } else if last_idx == apply_state.get_truncated_state().get_index() { + return Ok(apply_state.get_truncated_state().get_term()); + } else { + assert!(last_idx > RAFT_INIT_LOG_INDEX); + } + let entry = raft_engine.get_entry(region.get_id(), last_idx)?; + match entry { + None => Err(box_err!( + "[region {}] entry at {} doesn't exist, may lose data.", + region.get_id(), + last_idx + )), + Some(e) => Ok(e.get_term()), + } +} + +pub fn init_applied_term( + raft_engine: &ER, + region: &metapb::Region, + apply_state: &RaftApplyState, +) -> Result { + if apply_state.applied_index == RAFT_INIT_LOG_INDEX { + return Ok(RAFT_INIT_LOG_TERM); + } + let truncated_state = apply_state.get_truncated_state(); + if apply_state.applied_index == truncated_state.get_index() { + return Ok(truncated_state.get_term()); + } + + match raft_engine.get_entry(region.get_id(), apply_state.applied_index)? { + Some(e) => Ok(e.term), + None => Err(box_err!( + "[region {}] entry at apply index {} doesn't exist, may lose data.", + region.get_id(), + apply_state.applied_index + )), + } +} + +/// When a peer(follower) receives a TransferLeaderMsg, it enters the +/// CacheWarmupState. When the peer becomes leader or it doesn't +/// become leader before a deadline, it exits the state. 
+#[derive(Clone, Debug)]
+pub struct CacheWarmupState {
+    // `(low, high)` pair handed to `new_with_range`; both halves are
+    // `INVALID_INDEX` when the state is built through `new()`.
+    range: (u64, u64),
+    // Sticky flag, set by `check_task_timeout`.
+    is_task_timeout: bool,
+    // Sticky flag, set by `check_stale`.
+    is_stale: bool,
+    // Moment this state was created; `elapsed()` is measured from here.
+    started_at: Instant,
+}
+
+impl CacheWarmupState {
+    /// Builds a state whose range is `(INVALID_INDEX, INVALID_INDEX)`.
+    pub fn new() -> Self {
+        Self::new_with_range(INVALID_INDEX, INVALID_INDEX)
+    }
+
+    /// Builds a state for the range `(low, high)` with both flags cleared
+    /// and the clock started at the time of the call.
+    pub fn new_with_range(low: u64, high: u64) -> Self {
+        Self {
+            started_at: Instant::now(),
+            is_stale: false,
+            is_task_timeout: false,
+            range: (low, high),
+        }
+    }
+
+    /// Returns the `(low, high)` pair this state was created with.
+    pub fn range(&self) -> (u64, u64) {
+        self.range
+    }
+
+    /// Time spent in this state so far.
+    pub fn elapsed(&self) -> Duration {
+        self.started_at.saturating_elapsed()
+    }
+
+    /// Returns the timeout flag as last recorded by `check_task_timeout`;
+    /// the clock is not consulted here.
+    pub fn is_task_timeout(&self) -> bool {
+        self.is_task_timeout
+    }
+
+    /// Flags the warmup task as timed out once more than `duration` has
+    /// elapsed, then returns the flag.
+    ///
+    /// The flag is sticky: after it is set, later calls return `true`
+    /// without re-reading the clock, and the timeout counter is bumped only
+    /// by the call that sets it.
+    pub fn check_task_timeout(&mut self, duration: Duration) -> bool {
+        if !self.is_task_timeout && self.elapsed() > duration {
+            WARM_UP_ENTRY_CACHE_COUNTER.timeout.inc();
+            self.is_task_timeout = true;
+        }
+        self.is_task_timeout
+    }
+
+    /// Flags this state as stale once more than `duration` has elapsed,
+    /// then returns the flag. Like the timeout flag, it is sticky.
+    pub fn check_stale(&mut self, duration: Duration) -> bool {
+        // Must stay the first statement: the fail point's closure yields
+        // `true`, presumably as an early return when the fail point is
+        // enabled (per the `fail` crate; confirm against its docs).
+        fail_point!("entry_cache_warmed_up_state_is_stale", |_| true);
+        if !self.is_stale && self.elapsed() > duration {
+            self.is_stale = true;
+        }
+        self.is_stale
+    }
+}
+
+impl Default for CacheWarmupState {
+    /// Equivalent to `CacheWarmupState::new()`.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// A subset of `PeerStorage` that focus on accessing log entries.
+pub struct EntryStorage { + region_id: u64, + peer_id: u64, + raft_engine: ER, + cache: EntryCache, + raft_state: RaftLocalState, + apply_state: RaftApplyState, + last_term: u64, + applied_term: u64, + read_scheduler: Scheduler>, + raftlog_fetch_stats: AsyncFetchStats, + async_fetch_results: RefCell>, + cache_warmup_state: Option, +} + +impl EntryStorage { + pub fn new( + peer_id: u64, + raft_engine: ER, + mut raft_state: RaftLocalState, + apply_state: RaftApplyState, + region: &metapb::Region, + read_scheduler: Scheduler>, + ) -> Result { + if let Err(e) = validate_states(region.id, &raft_engine, &mut raft_state, &apply_state) { + return Err(box_err!( + "[region {}] {} validate state fail: {:?}", + region.id, + peer_id, + e + )); + } + let last_term = init_last_term(&raft_engine, region, &raft_state, &apply_state)?; + let applied_term = init_applied_term(&raft_engine, region, &apply_state)?; + Ok(Self { + region_id: region.id, + peer_id, + raft_engine, + cache: EntryCache::default(), + raft_state, + apply_state, + last_term, + applied_term, + read_scheduler, + raftlog_fetch_stats: AsyncFetchStats::default(), + async_fetch_results: RefCell::new(HashMap::default()), + cache_warmup_state: None, + }) + } + + fn check_range(&self, low: u64, high: u64) -> raft::Result<()> { + if low > high { + return Err(storage_error(format!( + "low: {} is greater that high: {}", + low, high + ))); + } else if low <= self.truncated_index() { + return Err(raft::Error::Store(StorageError::Compacted)); + } else if high > self.last_index() + 1 { + return Err(storage_error(format!( + "entries' high {} is out of bound lastindex {}", + high, + self.last_index() + ))); + } + Ok(()) + } + + pub fn clean_async_fetch_res(&mut self, low: u64) { + self.async_fetch_results.borrow_mut().remove(&low); + } + + // Update the async fetch result. + // None indicates cleanning the fetched result. 
+ pub fn update_async_fetch_res(&mut self, low: u64, res: Option>) { + // If it's in fetching, don't clean the async fetch result. + if let Some(RaftlogFetchState::Fetching(_)) = self.async_fetch_results.borrow().get(&low) { + if res.is_none() { + return; + } + } + + match res { + Some(res) => { + match self + .async_fetch_results + .borrow_mut() + .insert(low, RaftlogFetchState::Fetched(res)) + { + Some(RaftlogFetchState::Fetching(start)) => { + RAFT_ENTRY_FETCHES_TASK_DURATION_HISTOGRAM + .observe(start.saturating_elapsed_secs()); + } + Some(RaftlogFetchState::Fetched(prev)) => { + info!( + "unconsumed async fetch res"; + "region_id" => self.region_id, + "peer_id" => self.peer_id, + "res" => ?prev, + "low" => low, + ); + } + _ => { + warn!( + "unknown async fetch res"; + "region_id" => self.region_id, + "peer_id" => self.peer_id, + "low" => low, + ); + } + } + } + None => { + let prev = self.async_fetch_results.borrow_mut().remove(&low); + if prev.is_some() { + self.raftlog_fetch_stats.fetch_unused.update(|m| m + 1); + } + } + } + } + + fn async_fetch( + &self, + region_id: u64, + low: u64, + high: u64, + max_size: u64, + context: GetEntriesContext, + buf: &mut Vec, + ) -> raft::Result { + if let Some(RaftlogFetchState::Fetching(_)) = self.async_fetch_results.borrow().get(&low) { + // already an async fetch in flight + return Err(raft::Error::Store( + raft::StorageError::LogTemporarilyUnavailable, + )); + } + + let tried_cnt = if let Some(RaftlogFetchState::Fetched(res)) = + self.async_fetch_results.borrow_mut().remove(&low) + { + assert_eq!(res.low, low); + let mut ents = res.ents?; + let first = ents.first().map(|e| e.index).unwrap(); + assert_eq!(first, res.low); + let last = ents.last().map(|e| e.index).unwrap(); + + if last + 1 >= high { + // async fetch res covers [low, high) + ents.truncate((high - first) as usize); + assert_eq!(ents.last().map(|e| e.index).unwrap(), high - 1); + if max_size < res.max_size { + limit_size(&mut ents, Some(max_size)); + } + 
let count = ents.len(); + buf.append(&mut ents); + fail_point!("on_async_fetch_return"); + return Ok(count); + } else if res.hit_size_limit && max_size <= res.max_size { + // async fetch res doesn't cover [low, high) due to hit size limit + if max_size < res.max_size { + limit_size(&mut ents, Some(max_size)); + }; + let count = ents.len(); + buf.append(&mut ents); + return Ok(count); + } else if last + RAFT_LOG_MULTI_GET_CNT > high - 1 + && res.tried_cnt + 1 == MAX_ASYNC_FETCH_TRY_CNT + { + let mut fetched_size = ents.iter().fold(0, |acc, e| acc + e.compute_size() as u64); + if max_size <= fetched_size { + limit_size(&mut ents, Some(max_size)); + let count = ents.len(); + buf.append(&mut ents); + return Ok(count); + } + + // the count of left entries isn't too large, fetch the remaining entries + // synchronously one by one + for idx in last + 1..high { + let ent = self.raft_engine.get_entry(region_id, idx)?; + match ent { + None => { + return Err(raft::Error::Store(raft::StorageError::Unavailable)); + } + Some(ent) => { + let size = ent.compute_size() as u64; + if fetched_size + size > max_size { + break; + } else { + fetched_size += size; + ents.push(ent); + } + } + } + } + let count = ents.len(); + buf.append(&mut ents); + return Ok(count); + } + info!( + "async fetch invalid"; + "region_id" => self.region_id, + "peer_id" => self.peer_id, + "first" => first, + "last" => last, + "low" => low, + "high" => high, + "max_size" => max_size, + "res_max_size" => res.max_size, + ); + // low index or max size is changed, the result is not fit for the current + // range, so refetch again. 
+ self.raftlog_fetch_stats.fetch_invalid.update(|m| m + 1); + res.tried_cnt + 1 + } else { + 1 + }; + + // the first/second try: get [low, high) asynchronously + // the third try: + // - if term and low are matched: use result of [low, persisted) and get + // [persisted, high) synchronously + // - else: get [low, high) synchronously + if tried_cnt >= MAX_ASYNC_FETCH_TRY_CNT { + // even the larger range is invalid again, fallback to fetch in sync way + self.raftlog_fetch_stats.fallback_fetch.update(|m| m + 1); + let count = self.raft_engine.fetch_entries_to( + region_id, + low, + high, + Some(max_size as usize), + buf, + )?; + return Ok(count); + } + + self.raftlog_fetch_stats.async_fetch.update(|m| m + 1); + self.async_fetch_results + .borrow_mut() + .insert(low, RaftlogFetchState::Fetching(Instant::now_coarse())); + self.read_scheduler + .schedule(ReadTask::FetchLogs { + region_id, + context, + low, + high, + max_size: (max_size as usize), + tried_cnt, + term: self.hard_state().get_term(), + }) + .unwrap(); + Err(raft::Error::Store( + raft::StorageError::LogTemporarilyUnavailable, + )) + } + + pub fn entries( + &self, + low: u64, + high: u64, + max_size: u64, + context: GetEntriesContext, + ) -> raft::Result> { + self.check_range(low, high)?; + let mut ents = + Vec::with_capacity(std::cmp::min((high - low) as usize, MAX_INIT_ENTRY_COUNT)); + if low == high { + return Ok(ents); + } + let cache_low = self.cache.first_index().unwrap_or(u64::MAX); + if high <= cache_low { + self.cache.miss.update(|m| m + 1); + return if context.can_async() { + self.async_fetch(self.region_id, low, high, max_size, context, &mut ents)?; + Ok(ents) + } else { + self.raftlog_fetch_stats.sync_fetch.update(|m| m + 1); + self.raft_engine.fetch_entries_to( + self.region_id, + low, + high, + Some(max_size as usize), + &mut ents, + )?; + Ok(ents) + }; + } + let begin_idx = if low < cache_low { + self.cache.miss.update(|m| m + 1); + let fetched_count = if context.can_async() { + 
self.async_fetch(self.region_id, low, cache_low, max_size, context, &mut ents)? + } else { + self.raftlog_fetch_stats.sync_fetch.update(|m| m + 1); + self.raft_engine.fetch_entries_to( + self.region_id, + low, + cache_low, + Some(max_size as usize), + &mut ents, + )? + }; + if fetched_count < (cache_low - low) as usize { + // Less entries are fetched than expected. + return Ok(ents); + } + cache_low + } else { + low + }; + self.cache.hit.update(|h| h + 1); + let fetched_size = ents.iter().fold(0, |acc, e| acc + e.compute_size()); + self.cache + .fetch_entries_to(begin_idx, high, fetched_size as u64, max_size, &mut ents); + Ok(ents) + } + + pub fn term(&self, idx: u64) -> raft::Result { + if idx == self.truncated_index() { + return Ok(self.truncated_term()); + } + self.check_range(idx, idx + 1)?; + if self.truncated_term() == self.last_term || idx == self.last_index() { + return Ok(self.last_term); + } + if let Some(e) = self.cache.entry(idx) { + Ok(e.get_term()) + } else { + Ok(self + .raft_engine + .get_entry(self.region_id, idx) + .unwrap() + .unwrap_or_else(|| { + panic!( + "region_id={}, peer_id={}, idx={idx}", + self.region_id, self.peer_id + ) + }) + .get_term()) + } + } + + #[inline] + pub fn set_truncated_index(&mut self, index: u64) { + self.apply_state.mut_truncated_state().set_index(index) + } + + #[inline] + pub fn set_truncated_term(&mut self, term: u64) { + self.apply_state.mut_truncated_state().set_term(term) + } + + #[inline] + pub fn first_index(&self) -> u64 { + first_index(&self.apply_state) + } + + #[inline] + pub fn last_index(&self) -> u64 { + last_index(&self.raft_state) + } + + #[inline] + pub fn last_term(&self) -> u64 { + self.last_term + } + + #[inline] + pub fn set_last_term(&mut self, term: u64) { + self.last_term = term; + } + + #[inline] + pub fn set_applied_term(&mut self, applied_term: u64) { + self.applied_term = applied_term; + } + + #[inline] + pub fn applied_term(&self) -> u64 { + self.applied_term + } + + #[inline] + pub fn 
raft_state(&self) -> &RaftLocalState { + &self.raft_state + } + + #[inline] + pub fn raft_state_mut(&mut self) -> &mut RaftLocalState { + &mut self.raft_state + } + + #[inline] + pub fn applied_index(&self) -> u64 { + self.apply_state.get_applied_index() + } + + #[inline] + pub fn set_apply_state(&mut self, apply_state: RaftApplyState) { + self.apply_state = apply_state; + } + + #[inline] + pub fn apply_state(&self) -> &RaftApplyState { + &self.apply_state + } + + #[inline] + pub fn apply_state_mut(&mut self) -> &mut RaftApplyState { + &mut self.apply_state + } + + #[inline] + pub fn commit_index(&self) -> u64 { + self.raft_state.get_hard_state().get_commit() + } + + #[inline] + pub fn set_commit_index(&mut self, commit: u64) { + assert!(commit >= self.commit_index()); + self.raft_state.mut_hard_state().set_commit(commit); + } + + #[inline] + pub fn hard_state(&self) -> &HardState { + self.raft_state.get_hard_state() + } + + #[inline] + pub fn truncated_index(&self) -> u64 { + self.apply_state.get_truncated_state().get_index() + } + + #[inline] + pub fn truncated_term(&self) -> u64 { + self.apply_state.get_truncated_state().get_term() + } + + // Append the given entries to the raft log using previous last index or + // self.last_index. + pub fn append(&mut self, entries: Vec, task: &mut WriteTask) { + if entries.is_empty() { + return; + } + debug!( + "append entries"; + "region_id" => self.region_id, + "peer_id" => self.peer_id, + "count" => entries.len(), + ); + let prev_last_index = self.raft_state.get_last_index(); + + let (last_index, last_term) = { + let e = entries.last().unwrap(); + (e.get_index(), e.get_term()) + }; + + self.cache.append(self.region_id, self.peer_id, &entries); + + // Delete any previously appended log entries which never committed. 
+ task.set_append(Some(prev_last_index + 1), entries); + + self.raft_state.set_last_index(last_index); + self.last_term = last_term; + } + + pub fn entry_cache_warmup_state(&self) -> &Option { + &self.cache_warmup_state + } + + pub fn entry_cache_warmup_state_mut(&mut self) -> &mut Option { + &mut self.cache_warmup_state + } + + pub fn clear_entry_cache_warmup_state(&mut self) { + self.cache_warmup_state = None; + } + + /// Trigger a task to warm up the entry cache. + /// + /// This will ensure the range [low..=last_index] are loaded into + /// cache. Return the high index of the warmup range if a task is + /// successfully triggered. + pub fn async_warm_up_entry_cache(&mut self, low: u64) -> Option { + let high = if let Some(first_index) = self.entry_cache_first_index() { + if low >= first_index { + // Already warmed up. + self.cache_warmup_state = Some(CacheWarmupState::new()); + return None; + } + // Partially warmed up. + first_index + } else { + self.last_index() + 1 + }; + + // Fetch entries [low, high) to trigger an async fetch task in background. + self.cache_warmup_state = Some(CacheWarmupState::new_with_range(low, high)); + match self.entries(low, high, u64::MAX, GetEntriesContext::empty(true)) { + Ok(_) => { + // This should not happen, but it's OK :) + debug_assert!(false, "entries should not have been fetched"); + error!("entries are fetched unexpectedly during warming up"); + None + } + Err(raft::Error::Store(raft::StorageError::LogTemporarilyUnavailable)) => { + WARM_UP_ENTRY_CACHE_COUNTER.started.inc(); + Some(high) + } + Err(e) => { + error!( + "fetching entries met unexpected error during warming up"; + "err" => ?e, + ); + None + } + } + } + + /// Warm up entry cache if the result is valid. + /// + /// Return true when the warmup operation succeed within the timeout. 
+ pub fn maybe_warm_up_entry_cache(&mut self, res: RaftlogFetchResult) -> bool { + let low = res.low; + // Warm up the entry cache if the low and high index are + // exactly the same as the warmup range. + let state = self.entry_cache_warmup_state().as_ref().unwrap(); + let range = state.range(); + let is_task_timeout = state.is_task_timeout(); + + if range.0 != low { + return false; + } + + match res.ents { + Ok(mut entries) => { + let last_entry_index = entries.last().map(|e| e.index); + if let Some(index) = last_entry_index { + // Generally speaking, when the res.low is the same as the warmup + // range start, the fetch result is exactly used for warmup. + // As the low index of each async_fetch task is different. + // There should exist only one exception. A async fetch task + // with same low index is triggered before the warmup task. + if index + 1 >= range.1 { + let is_valid = if let Some(first_index) = self.entry_cache_first_index() { + range.1 == first_index + } else { + range.1 == self.last_index() + 1 + }; + assert!(is_valid, "the warmup range should still be valid"); + entries.truncate((range.1 - range.0) as usize); + self.cache.prepend(entries); + WARM_UP_ENTRY_CACHE_COUNTER.finished.inc(); + fail_point!("on_entry_cache_warmed_up"); + return !is_task_timeout; + } + } + warn!( + "warm up the entry cache failed"; + "region_id" => self.region_id, + "peer_id" => self.peer_id, + "last_entry_index" => last_entry_index.unwrap_or(0), + "expected_high" => range.1, + ); + } + Err(e) => { + warn!( + "warm up the entry cache failed"; + "region_id" => self.region_id, + "peer_id" => self.peer_id, + "err" => ?e, + ); + } + } + false + } + + pub fn compact_entry_cache(&mut self, idx: u64) { + let mut can_compact = true; + if let Some(state) = self.entry_cache_warmup_state_mut() { + if state.check_stale(MAX_WARMED_UP_CACHE_KEEP_TIME) { + self.clear_entry_cache_warmup_state(); + } else { + can_compact = false; + } + } + if can_compact { + self.cache.compact_to(idx); + } 
+ } + + #[inline] + pub fn is_entry_cache_empty(&self) -> bool { + self.cache.is_empty() + } + + #[inline] + pub fn entry_cache_first_index(&self) -> Option { + self.cache.first_index() + } + + /// Evict entries from the cache. + pub fn evict_entry_cache(&mut self, half: bool) { + if !self.is_entry_cache_empty() { + let cache = &mut self.cache; + let cache_len = cache.cache.len(); + let drain_to = if half { cache_len / 2 } else { cache_len - 1 }; + let idx = cache.cache[drain_to].index; + let mem_size_change = cache.compact_to(idx + 1); + RAFT_ENTRIES_EVICT_BYTES.inc_by(mem_size_change); + } else if !half { + let cache = &mut self.cache; + let mem_size_change = cache.compact_to(u64::MAX); + RAFT_ENTRIES_EVICT_BYTES.inc_by(mem_size_change); + } + } + + #[inline] + pub fn flush_entry_cache_metrics(&mut self) { + // NOTE: memory usage of entry cache is flushed realtime. + self.cache.flush_stats(); + self.raftlog_fetch_stats.flush_stats(); + } + + pub fn raft_engine(&self) -> &ER { + &self.raft_engine + } + + pub fn update_cache_persisted(&mut self, persisted: u64) { + self.cache.update_persisted(persisted); + } + + pub fn trace_cached_entries(&mut self, entries: CachedEntries) { + self.cache.trace_cached_entries(entries); + } + + pub fn clear(&mut self) { + self.cache = EntryCache::default(); + } + + pub fn read_scheduler(&self) -> Scheduler> { + self.read_scheduler.clone() + } +} + +#[cfg(test)] +pub mod tests { + use std::sync::mpsc; + + use engine_test::{kv::KvTestEngine, raft::RaftTestEngine}; + use engine_traits::RaftEngineReadOnly; + use protobuf::Message; + use raft::{GetEntriesContext, StorageError}; + use tempfile::Builder; + use tikv_util::worker::{dummy_scheduler, LazyWorker, Worker}; + + use super::*; + use crate::store::peer_storage::tests::{append_ents, new_entry, new_storage_from_ents}; + + impl EntryCache { + fn new_with_cb(cb: impl Fn(i64) + Send + 'static) -> Self { + let entry_cache = EntryCache { + persisted: 0, + cache: Default::default(), + 
trace: Default::default(), + hit: Cell::new(0), + miss: Cell::new(0), + size_change_cb: Some(Box::new(cb) as Box), + }; + entry_cache.flush_mem_size_change(entry_cache.total_mem_size()); + entry_cache + } + } + + pub fn validate_cache(store: &EntryStorage, exp_ents: &[Entry]) { + assert_eq!(store.cache.cache, exp_ents); + for e in exp_ents { + let entry = store + .raft_engine + .get_entry(store.region_id, e.get_index()) + .unwrap() + .unwrap(); + assert_eq!(entry, *e); + } + } + + #[test] + fn test_storage_cache_size_change() { + let new_padded_entry = |index: u64, term: u64, pad_len: usize| { + let mut e = new_entry(index, term); + e.data = vec![b'x'; pad_len].into(); + e + }; + + // Test the initial data structure size. + let (tx, rx) = mpsc::sync_channel(8); + let mut cache = EntryCache::new_with_cb(move |c: i64| tx.send(c).unwrap()); + assert_eq!(rx.try_recv().unwrap(), 896); + + cache.append( + 0, + 0, + &[new_padded_entry(101, 1, 1), new_padded_entry(102, 1, 2)], + ); + assert_eq!(rx.try_recv().unwrap(), 3); + + cache.prepend(vec![new_padded_entry(100, 1, 1)]); + assert_eq!(rx.try_recv().unwrap(), 1); + cache.persisted = 100; + cache.compact_to(101); + assert_eq!(rx.try_recv().unwrap(), -1); + + // Test size change for one overlapped entry. + cache.append(0, 0, &[new_padded_entry(102, 2, 3)]); + assert_eq!(rx.try_recv().unwrap(), 1); + + // Test size change for all overlapped entries. + cache.append( + 0, + 0, + &[new_padded_entry(101, 3, 4), new_padded_entry(102, 3, 5)], + ); + assert_eq!(rx.try_recv().unwrap(), 5); + + cache.append(0, 0, &[new_padded_entry(103, 3, 6)]); + assert_eq!(rx.try_recv().unwrap(), 6); + + // Test trace a dangle entry. + let cached_entries = CachedEntries::new(vec![new_padded_entry(100, 1, 1)]); + cache.trace_cached_entries(cached_entries); + assert_eq!(rx.try_recv().unwrap(), 1); + + // Test trace an entry which is still in cache. 
+ let cached_entries = CachedEntries::new(vec![new_padded_entry(102, 3, 5)]); + cache.trace_cached_entries(cached_entries); + assert_eq!(rx.try_recv().unwrap(), 0); + + // Test compare `cached_last` with `trunc_to_idx` in `EntryCache::append_impl`. + cache.append(0, 0, &[new_padded_entry(103, 4, 7)]); + assert_eq!(rx.try_recv().unwrap(), 1); + + // Test compact one traced dangle entry and one entry in cache. + cache.persisted = 101; + cache.compact_to(102); + assert_eq!(rx.try_recv().unwrap(), -5); + + // Test compact the last traced dangle entry. + cache.persisted = 102; + cache.compact_to(103); + assert_eq!(rx.try_recv().unwrap(), -5); + + // Test compact all entries. + cache.persisted = 103; + cache.compact_to(104); + assert_eq!(rx.try_recv().unwrap(), -7); + + drop(cache); + assert_eq!(rx.try_recv().unwrap(), -896); + } + + #[test] + fn test_storage_cache_entry() { + let mut cache = EntryCache::default(); + let ents = vec![ + new_entry(3, 3), + new_entry(4, 4), + new_entry(5, 4), + new_entry(6, 6), + ]; + cache.append(0, 0, &ents); + assert!(cache.entry(1).is_none()); + assert!(cache.entry(2).is_none()); + for e in &ents { + assert_eq!(e, cache.entry(e.get_index()).unwrap()); + } + let res = panic_hook::recover_safe(|| cache.entry(7)); + res.unwrap_err(); + } + + #[test] + fn test_async_fetch() { + let ents = vec![ + new_entry(2, 2), + new_entry(3, 3), + new_entry(4, 4), + new_entry(5, 5), + new_entry(6, 6), + ]; + + let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); + let region_worker = Worker::new("snap-manager").lazy_build("snap-manager"); + let region_scheduler = region_worker.scheduler(); + let (dummy_scheduler, _rx) = dummy_scheduler(); + + let mut store = new_storage_from_ents(region_scheduler, dummy_scheduler, &td, &ents); + + let max_u64 = u64::max_value(); + let mut tests = vec![ + // already compacted + ( + 3, + 7, + max_u64, + 1, + RaftlogFetchResult { + ents: Err(raft::Error::Store(StorageError::Compacted)), + low: 3, + 
max_size: max_u64, + hit_size_limit: false, + tried_cnt: 1, + term: 1, + }, + Err(raft::Error::Store(StorageError::Compacted)), + vec![], + ), + // fetch partial entries due to max size limit + ( + 3, + 7, + 30, + 1, + RaftlogFetchResult { + ents: Ok(ents[1..4].to_vec()), + low: 3, + max_size: 30, + hit_size_limit: true, + tried_cnt: 1, + term: 1, + }, + Ok(3), + ents[1..4].to_vec(), + ), + // fetch all entries + ( + 2, + 7, + max_u64, + 1, + RaftlogFetchResult { + ents: Ok(ents.clone()), + low: 2, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: 1, + term: 1, + }, + Ok(5), + ents.clone(), + ), + // high is smaller than before + ( + 3, + 5, + max_u64, + 1, + RaftlogFetchResult { + ents: Ok(ents[1..].to_vec()), + low: 3, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: 1, + term: 1, + }, + Ok(2), + ents[1..3].to_vec(), + ), + // high is larger than before, second try + ( + 3, + 7, + max_u64, + 1, + RaftlogFetchResult { + ents: Ok(ents[1..4].to_vec()), + low: 3, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: 1, + term: 1, + }, + Err(raft::Error::Store(StorageError::LogTemporarilyUnavailable)), + vec![], + ), + // high is larger than before, thrid try + ( + 3, + 7, + max_u64, + 1, + RaftlogFetchResult { + ents: Ok(ents[1..4].to_vec()), + low: 3, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: 2, + term: 1, + }, + Ok(4), + ents[1..].to_vec(), + ), + // max size is smaller than before + ( + 2, + 7, + 10, + 1, + RaftlogFetchResult { + ents: Ok(ents.clone()), + low: 2, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: 1, + term: 1, + }, + Ok(2), + ents[..2].to_vec(), + ), + // max size is larger than before but with lower high + ( + 2, + 5, + 40, + 1, + RaftlogFetchResult { + ents: Ok(ents.clone()), + low: 2, + max_size: 30, + hit_size_limit: false, + tried_cnt: 1, + term: 1, + }, + Ok(3), + ents[..3].to_vec(), + ), + // low index is smaller than before + ( + 2, + 7, + max_u64, + 1, + RaftlogFetchResult { + ents: 
Err(raft::Error::Store(StorageError::Compacted)), + low: 3, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: 1, + term: 1, + }, + Err(raft::Error::Store(StorageError::LogTemporarilyUnavailable)), + vec![], + ), + // low index is larger than before + ( + 4, + 7, + max_u64, + 1, + RaftlogFetchResult { + ents: Ok(vec![]), + low: 3, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: 1, + term: 1, + }, + Err(raft::Error::Store(StorageError::LogTemporarilyUnavailable)), + vec![], + ), + // hit tried several lmit + ( + 3, + 7, + max_u64, + 1, + RaftlogFetchResult { + ents: Ok(ents[1..4].to_vec()), + low: 3, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: MAX_ASYNC_FETCH_TRY_CNT, + term: 1, + }, + Ok(4), + ents[1..5].to_vec(), + ), + // term is changed + ( + 3, + 7, + max_u64, + 2, + RaftlogFetchResult { + ents: Ok(ents[1..4].to_vec()), + low: 3, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: MAX_ASYNC_FETCH_TRY_CNT, + term: 1, + }, + Ok(4), + ents[1..5].to_vec(), + ), + ]; + + for (i, (lo, hi, maxsize, term, async_res, expected_res, expected_ents)) in + tests.drain(..).enumerate() + { + if async_res.low != lo { + store.clean_async_fetch_res(lo); + } else { + store.update_async_fetch_res(lo, Some(Box::new(async_res))); + } + let mut ents = vec![]; + store.raft_state.mut_hard_state().set_term(term); + let res = store.async_fetch( + store.get_region_id(), + lo, + hi, + maxsize, + GetEntriesContext::empty(true), + &mut ents, + ); + if res != expected_res { + panic!("#{}: expect result {:?}, got {:?}", i, expected_res, res); + } + if ents != expected_ents { + panic!("#{}: expect ents {:?}, got {:?}", i, expected_ents, ents); + } + } + } + + #[test] + fn test_storage_append() { + let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; + let mut tests = vec![ + ( + vec![new_entry(4, 6), new_entry(5, 6)], + vec![new_entry(4, 6), new_entry(5, 6)], + ), + ( + vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 5)], + 
vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 5)], + ), + // truncate the existing entries and append + (vec![new_entry(4, 5)], vec![new_entry(4, 5)]), + // direct append + ( + vec![new_entry(6, 5)], + vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 5)], + ), + ]; + for (i, (entries, wentries)) in tests.drain(..).enumerate() { + let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); + let worker = LazyWorker::new("snap-manager"); + let sched = worker.scheduler(); + let (dummy_scheduler, _) = dummy_scheduler(); + let mut store = new_storage_from_ents(sched, dummy_scheduler, &td, &ents); + append_ents(&mut store, &entries); + let li = store.last_index().unwrap(); + let actual_entries = store + .entries(4, li + 1, u64::max_value(), GetEntriesContext::empty(false)) + .unwrap(); + if actual_entries != wentries { + panic!("#{}: want {:?}, got {:?}", i, wentries, actual_entries); + } + } + } + + #[test] + fn test_storage_cache_fetch() { + let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; + let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); + let worker = LazyWorker::new("snap-manager"); + let sched = worker.scheduler(); + let (dummy_scheduler, _) = dummy_scheduler(); + let mut store = new_storage_from_ents(sched, dummy_scheduler, &td, &ents); + store.cache.cache.clear(); + // empty cache should fetch data from rocksdb directly. + let mut res = store + .entries(4, 6, u64::max_value(), GetEntriesContext::empty(false)) + .unwrap(); + assert_eq!(*res, ents[1..]); + + let entries = vec![new_entry(6, 5), new_entry(7, 5)]; + append_ents(&mut store, &entries); + validate_cache(&store, &entries); + + // direct cache access + res = store + .entries(6, 8, u64::max_value(), GetEntriesContext::empty(false)) + .unwrap(); + assert_eq!(res, entries); + + // size limit should be supported correctly. 
+ res = store + .entries(4, 8, 0, GetEntriesContext::empty(false)) + .unwrap(); + assert_eq!(res, vec![new_entry(4, 4)]); + let mut size: u64 = ents[1..].iter().map(|e| u64::from(e.compute_size())).sum(); + res = store + .entries(4, 8, size, GetEntriesContext::empty(false)) + .unwrap(); + let mut exp_res = ents[1..].to_vec(); + assert_eq!(res, exp_res); + for e in &entries { + size += u64::from(e.compute_size()); + exp_res.push(e.clone()); + res = store + .entries(4, 8, size, GetEntriesContext::empty(false)) + .unwrap(); + assert_eq!(res, exp_res); + } + + // range limit should be supported correctly. + for low in 4..9 { + for high in low..9 { + let res = store + .entries(low, high, u64::max_value(), GetEntriesContext::empty(false)) + .unwrap(); + assert_eq!(*res, exp_res[low as usize - 4..high as usize - 4]); + } + } + } + + #[test] + fn test_storage_cache_update() { + let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; + let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); + let worker = LazyWorker::new("snap-manager"); + let sched = worker.scheduler(); + let (dummy_scheduler, _) = dummy_scheduler(); + let mut store = new_storage_from_ents(sched, dummy_scheduler, &td, &ents); + store.cache.cache.clear(); + + // initial cache + let mut entries = vec![new_entry(6, 5), new_entry(7, 5)]; + append_ents(&mut store, &entries); + validate_cache(&store, &entries); + + // rewrite + entries = vec![new_entry(6, 6), new_entry(7, 6)]; + append_ents(&mut store, &entries); + validate_cache(&store, &entries); + store.cache.prepend(vec![new_entry(6, 5)]); + + // rewrite old entry + entries = vec![new_entry(5, 6), new_entry(6, 6)]; + append_ents(&mut store, &entries); + validate_cache(&store, &entries); + + // partial rewrite + entries = vec![new_entry(6, 7), new_entry(7, 7)]; + append_ents(&mut store, &entries); + let mut exp_res = vec![new_entry(5, 6), new_entry(6, 7), new_entry(7, 7)]; + validate_cache(&store, &exp_res); + + // direct append + 
entries = vec![new_entry(8, 7), new_entry(9, 7)]; + append_ents(&mut store, &entries); + exp_res.extend_from_slice(&entries); + validate_cache(&store, &exp_res); + + // rewrite middle + entries = vec![new_entry(7, 8)]; + append_ents(&mut store, &entries); + exp_res.truncate(2); + exp_res.push(new_entry(7, 8)); + validate_cache(&store, &exp_res); + + // compact to min(5 + 1, 7) + store.cache.persisted = 5; + store.compact_entry_cache(7); + exp_res = vec![new_entry(6, 7), new_entry(7, 8)]; + validate_cache(&store, &exp_res); + + // compact to min(7 + 1, 7) + store.cache.persisted = 7; + store.compact_entry_cache(7); + exp_res = vec![new_entry(7, 8)]; + validate_cache(&store, &exp_res); + // compact all + store.compact_entry_cache(8); + validate_cache(&store, &[]); + // invalid compaction should be ignored. + store.compact_entry_cache(6); + } + + #[test] + fn test_async_warm_up_entry_cache() { + let ents = vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 6)]; + + let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); + let region_worker = Worker::new("snap-manager").lazy_build("snap-manager"); + let region_scheduler = region_worker.scheduler(); + let (dummy_scheduler, _rx) = dummy_scheduler(); + + let mut store = new_storage_from_ents(region_scheduler, dummy_scheduler, &td, &ents); + store.cache.compact_to(6); + assert_eq!(store.entry_cache_first_index().unwrap(), 6); + + // The return value should be None when it is already warmed up. + assert!(store.async_warm_up_entry_cache(6).is_none()); + + // The high index should be equal to the entry_cache_first_index. + assert_eq!(store.async_warm_up_entry_cache(5).unwrap(), 6); + + store.cache.compact_to(7); // Clean cache. + // The high index should be equal to the last_index + 1. 
+ assert_eq!(store.async_warm_up_entry_cache(5).unwrap(), 7); + } + + #[test] + fn test_warmup_entry_cache() { + let ents = vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 6)]; + + let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); + let region_worker = Worker::new("snap-manager").lazy_build("snap-manager"); + let region_scheduler = region_worker.scheduler(); + let (dummy_scheduler, _rx) = dummy_scheduler(); + let mut store = new_storage_from_ents(region_scheduler, dummy_scheduler, &td, &ents); + store.cache.compact_to(6); + store.cache_warmup_state = Some(CacheWarmupState::new_with_range(5, 6)); + + let res = RaftlogFetchResult { + ents: Ok(ents[1..3].to_vec()), + low: 5, + max_size: u64::MAX, + hit_size_limit: false, + tried_cnt: MAX_ASYNC_FETCH_TRY_CNT, + term: 1, + }; + store.maybe_warm_up_entry_cache(res); + // Cache should be warmed up. + assert_eq!(store.entry_cache_first_index().unwrap(), 5); + } +} diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index a7c534ff823..d1ba6d4e774 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -9,6 +9,7 @@ use std::{ cmp::{Ord, Ordering as CmpOrdering}, collections::VecDeque, fmt::{self, Debug, Formatter}, + io::BufRead, mem, ops::{Deref, DerefMut, Range as StdRange}, sync::{ @@ -28,27 +29,29 @@ use batch_system::{ use collections::{HashMap, HashMapEntry, HashSet}; use crossbeam::channel::{TryRecvError, TrySendError}; use engine_traits::{ - DeleteStrategy, KvEngine, Mutable, PerfContext, PerfContextKind, RaftEngine, - RaftEngineReadOnly, Range as EngineRange, Snapshot, SstMetaInfo, WriteBatch, ALL_CFS, - CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, + util::SequenceNumber, DeleteStrategy, KvEngine, Mutable, PerfContext, PerfContextKind, + RaftEngine, RaftEngineReadOnly, Range as EngineRange, Snapshot, SstMetaInfo, WriteBatch, + ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use 
fail::fail_point; use kvproto::{ import_sstpb::SstMeta, kvrpcpb::ExtraOp as TxnExtraOp, - metapb::{PeerRole, Region, RegionEpoch}, + metapb::{self, PeerRole, Region, RegionEpoch}, raft_cmdpb::{ AdminCmdType, AdminRequest, AdminResponse, ChangePeerRequest, CmdType, CommitMergeRequest, - RaftCmdRequest, RaftCmdResponse, Request, + RaftCmdRequest, RaftCmdResponse, Request, SplitRequest, SwitchWitnessRequest, }, raft_serverpb::{MergeState, PeerState, RaftApplyState, RaftTruncatedState, RegionLocalState}, }; -use pd_client::{new_bucket_stats, BucketMeta, BucketStat}; +use pd_client::{BucketMeta, BucketStat}; use prometheus::local::LocalHistogram; +use protobuf::{wire_format::WireType, CodedInputStream, Message}; use raft::eraftpb::{ ConfChange, ConfChangeType, ConfChangeV2, Entry, EntryType, Snapshot as RaftSnapshot, }; use raft_proto::ConfChangeI; +use resource_control::{ResourceConsumeType, ResourceController, ResourceMetered}; use smallvec::{smallvec, SmallVec}; use sst_importer::SstImporter; use tikv_alloc::trace::TraceEvent; @@ -59,57 +62,57 @@ use tikv_util::{ memory::HeapSize, mpsc::{loose_bounded, LooseBoundedSender, Receiver}, safe_panic, slow_log, + store::{find_peer, find_peer_by_id, find_peer_mut, is_learner, remove_peer}, time::{duration_to_sec, Instant}, warn, worker::Scheduler, Either, MustConsumeVec, }; use time::Timespec; +use tracker::GLOBAL_TRACKERS; use uuid::Builder as UuidBuilder; use self::memtrace::*; use super::metrics::*; use crate::{ bytes_capacity, - coprocessor::{Cmd, CmdBatch, CmdObserveInfo, CoprocessorHost, ObserveHandle, ObserveLevel}, + coprocessor::{ + ApplyCtxInfo, Cmd, CmdBatch, CmdObserveInfo, CoprocessorHost, ObserveHandle, ObserveLevel, + RegionState, + }, store::{ cmd_resp, + entry_storage::{self, CachedEntries}, fsm::RaftPollerBuilder, local_metrics::RaftMetrics, memory::*, metrics::*, - msg::{Callback, PeerMsg, ReadResponse, SignificantMsg}, + msg::{Callback, ErrorCallback, PeerMsg, ReadResponse, SignificantMsg}, peer::Peer, - 
peer_storage::{self, write_initial_apply_state, write_peer_state, CachedEntries}, - util, + peer_storage::{write_initial_apply_state, write_peer_state}, util::{ - admin_cmd_epoch_lookup, check_region_epoch, compare_region_epoch, is_learner, - ChangePeerI, ConfChangeKind, KeysInfoFormatter, LatencyInspector, + self, admin_cmd_epoch_lookup, check_flashback_state, check_req_region_epoch, + compare_region_epoch, ChangePeerI, ConfChangeKind, KeysInfoFormatter, LatencyInspector, }, - Config, RegionSnapshot, RegionTask, + Config, RegionSnapshot, RegionTask, WriteCallback, }, Error, Result, }; -const DEFAULT_APPLY_WB_SIZE: usize = 4 * 1024; -const APPLY_WB_SHRINK_SIZE: usize = 1024 * 1024; -const SHRINK_PENDING_CMD_QUEUE_CAP: usize = 64; -const MAX_APPLY_BATCH_SIZE: usize = 64 * 1024 * 1024; +// These consts are shared in both v1 and v2. +pub const DEFAULT_APPLY_WB_SIZE: usize = 4 * 1024; +pub const APPLY_WB_SHRINK_SIZE: usize = 1024 * 1024; +pub const SHRINK_PENDING_CMD_QUEUE_CAP: usize = 64; +pub const MAX_APPLY_BATCH_SIZE: usize = 64 * 1024 * 1024; -pub struct PendingCmd -where - S: Snapshot, -{ +pub struct PendingCmd { pub index: u64, pub term: u64, - pub cb: Option>, + pub cb: Option, } -impl PendingCmd -where - S: Snapshot, -{ - fn new(index: u64, term: u64, cb: Callback) -> PendingCmd { +impl PendingCmd { + fn new(index: u64, term: u64, cb: C) -> PendingCmd { PendingCmd { index, term, @@ -118,10 +121,7 @@ where } } -impl Drop for PendingCmd -where - S: Snapshot, -{ +impl Drop for PendingCmd { fn drop(&mut self) { if self.cb.is_some() { safe_panic!( @@ -133,10 +133,7 @@ where } } -impl Debug for PendingCmd -where - S: Snapshot, -{ +impl Debug for PendingCmd { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!( f, @@ -148,30 +145,26 @@ where } } -impl HeapSize for PendingCmd {} +impl HeapSize for PendingCmd {} /// Commands waiting to be committed and applied. 
#[derive(Debug)] -pub struct PendingCmdQueue -where - S: Snapshot, -{ - normals: VecDeque>, - conf_change: Option>, +pub struct PendingCmdQueue { + normals: VecDeque>, + conf_change: Option>, + compacts: VecDeque>, } -impl PendingCmdQueue -where - S: Snapshot, -{ - fn new() -> PendingCmdQueue { +impl PendingCmdQueue { + fn new() -> PendingCmdQueue { PendingCmdQueue { normals: VecDeque::new(), conf_change: None, + compacts: VecDeque::new(), } } - fn pop_normal(&mut self, index: u64, term: u64) -> Option> { + fn pop_normal(&mut self, index: u64, term: u64) -> Option> { self.normals.pop_front().and_then(|cmd| { if self.normals.capacity() > SHRINK_PENDING_CMD_QUEUE_CAP && self.normals.len() < SHRINK_PENDING_CMD_QUEUE_CAP @@ -186,20 +179,37 @@ where }) } - fn append_normal(&mut self, cmd: PendingCmd) { + fn append_normal(&mut self, cmd: PendingCmd) { self.normals.push_back(cmd); } - fn take_conf_change(&mut self) -> Option> { + fn take_conf_change(&mut self) -> Option> { // conf change will not be affected when changing between follower and leader, // so there is no need to check term. self.conf_change.take() } // TODO: seems we don't need to separate conf change from normal entries. 
- fn set_conf_change(&mut self, cmd: PendingCmd) { + fn set_conf_change(&mut self, cmd: PendingCmd) { self.conf_change = Some(cmd); } + + fn push_compact(&mut self, cmd: PendingCmd) { + self.compacts.push_back(cmd); + } + + fn pop_compact(&mut self, index: u64) -> Option> { + let mut front = None; + while self.compacts.front().map_or(false, |c| c.index < index) { + front = self.compacts.pop_front(); + front.as_mut().unwrap().cb.take().unwrap(); + } + front + } + + fn has_compact(&mut self) -> bool { + !self.compacts.is_empty() + } } #[derive(Default, Debug)] @@ -243,12 +253,20 @@ impl Range { } } +#[derive(Default, Debug)] +pub struct SwitchWitness { + pub index: u64, + pub switches: Vec, + pub region: Region, +} + #[derive(Debug)] pub enum ExecResult { ChangePeer(ChangePeer), CompactLog { state: RaftTruncatedState, first_index: u64, + has_pending: bool, }, SplitRegion { regions: Vec, @@ -288,9 +306,20 @@ pub enum ExecResult { TransferLeader { term: u64, }, + SetFlashbackState { + region: Region, + }, + BatchSwitchWitness(SwitchWitness), + // The raftstore thread will use it to update the internal state of `PeerFsm`. If it is + // `true`, when the raftstore detects that the raft log has not been gc for a long time, + // the raftstore thread will actively pull the `voter_replicated_index` from the leader + // and try to compact pending gc. If false, raftstore does not do any additional + // processing. + HasPendingCompactCmd(bool), } /// The possible returned value when applying logs. +#[derive(Debug)] pub enum ApplyResult { None, Yield, @@ -386,20 +415,29 @@ where perf_context: EK::PerfContext, yield_duration: Duration, + yield_msg_size: u64, store_id: u64, /// region_id -> (peer_id, is_splitting) /// Used for handling race between splitting and creating new peer. - /// An uninitialized peer can be replaced to the one from splitting iff they are exactly the same peer. 
+ /// An uninitialized peer can be replaced to the one from splitting iff they + /// are exactly the same peer. pending_create_peers: Arc>>, - /// We must delete the ingested file before calling `callback` so that any ingest-request reaching this - /// peer could see this update if leader had changed. We must also delete them after the applied-index - /// has been persisted to kvdb because this entry may replay because of panic or power-off, which - /// happened before `WriteBatch::write` and after `SstImporter::delete`. We shall make sure that - /// this entry will never apply again at first, then we can delete the ssts files. + /// We must delete the ingested file before calling `callback` so that any + /// ingest-request reaching this peer could see this update if leader + /// had changed. We must also delete them after the applied-index + /// has been persisted to kvdb because this entry may replay because of + /// panic or power-off, which happened before `WriteBatch::write` and + /// after `SstImporter::delete`. We shall make sure that this entry will + /// never apply again at first, then we can delete the ssts files. delete_ssts: Vec, + /// A self-defined engine may be slow to ingest ssts. + /// It may move some elements of `delete_ssts` into `pending_delete_ssts` to + /// delay deletion. Otherwise we may lost data. + pending_delete_ssts: Vec, + /// The priority of this Handler. priority: Priority, /// Whether to yield high-latency operation to low-priority handler. @@ -414,6 +452,19 @@ where apply_time: LocalHistogram, key_buffer: Vec, + + // Whether to disable WAL. + disable_wal: bool, + + /// A general apply progress for a delegate is: + /// `prepare_for` -> `commit` [-> `commit` ...] -> `finish_for`. + /// Sometimes an `ApplyRes` is created with an applied_index, but data + /// before the applied index is still not written to kvdb. Let's call the + /// `ApplyRes` uncommitted. Data will finally be written to kvdb in + /// `flush`. 
+ uncommitted_res_count: usize, + + enable_v2_compatible_learner: bool, } impl ApplyContext @@ -434,13 +485,14 @@ where priority: Priority, ) -> ApplyContext { let kv_wb = engine.write_batch_with_cap(DEFAULT_APPLY_WB_SIZE); + ApplyContext { tag, timer: None, host, importer, region_scheduler, - engine: engine.clone(), + engine, router, notifier, kv_wb, @@ -453,9 +505,11 @@ where committed_count: 0, sync_log_hint: false, use_delete_range: cfg.use_delete_range, - perf_context: engine.get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply), + perf_context: EK::get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply), yield_duration: cfg.apply_yield_duration.0, + yield_msg_size: cfg.apply_yield_write_size.0, delete_ssts: vec![], + pending_delete_ssts: vec![], store_id, pending_create_peers, priority, @@ -465,6 +519,9 @@ where apply_wait: APPLY_TASK_WAIT_TIME_HISTOGRAM.local(), apply_time: APPLY_TIME_HISTOGRAM.local(), key_buffer: Vec::with_capacity(1024), + disable_wal: false, + uncommitted_res_count: 0, + enable_v2_compatible_learner: cfg.enable_v2_compatible_learner, } } @@ -478,13 +535,14 @@ where .push_batch(&delegate.observe_info, delegate.region.get_id()); } - /// Commits all changes have done for delegate. `persistent` indicates whether - /// write the changes into rocksdb. + /// Commits all changes have done for delegate. `persistent` indicates + /// whether write the changes into rocksdb. /// - /// This call is valid only when it's between a `prepare_for` and `finish_for`. + /// This call is valid only when it's between a `prepare_for` and + /// `finish_for`. 
pub fn commit(&mut self, delegate: &mut ApplyDelegate) { if delegate.last_flush_applied_index < delegate.apply_state.get_applied_index() { - delegate.write_apply_state(self.kv_wb_mut()); + delegate.maybe_write_apply_state(self); } self.commit_opt(delegate, true); } @@ -492,7 +550,9 @@ where fn commit_opt(&mut self, delegate: &mut ApplyDelegate, persistent: bool) { delegate.update_metrics(self); if persistent { - self.write_to_db(); + if let (_, Some(seqno)) = self.write_to_db() { + delegate.unfinished_write_seqno.push(seqno); + } self.prepare_for(delegate); delegate.last_flush_applied_index = delegate.apply_state.get_applied_index() } @@ -502,8 +562,9 @@ where /// Writes all the changes into RocksDB. /// If it returns true, all pending writes are persisted in engines. - pub fn write_to_db(&mut self) -> bool { - let need_sync = self.sync_log_hint; + pub fn write_to_db(&mut self) -> (bool, Option) { + let need_sync = self.sync_log_hint && !self.disable_wal; + let mut seqno = None; // There may be put and delete requests after ingest request in the same fsm. // To guarantee the correct order, we must ingest the pending_sst first, and // then persist the kv write batch to engine. 
@@ -520,19 +581,36 @@ where self.pending_ssts = vec![]; } if !self.kv_wb_mut().is_empty() { + self.perf_context.start_observe(); let mut write_opts = engine_traits::WriteOptions::new(); write_opts.set_sync(need_sync); - self.kv_wb().write_opt(&write_opts).unwrap_or_else(|e| { + write_opts.set_disable_wal(self.disable_wal); + if self.disable_wal { + let sn = SequenceNumber::pre_write(); + seqno = Some(sn); + } + let seq = self.kv_wb_mut().write_opt(&write_opts).unwrap_or_else(|e| { panic!("failed to write to engine: {:?}", e); }); - self.perf_context.report_metrics(); + if let Some(seqno) = seqno.as_mut() { + seqno.post_write(seq) + } + let trackers: Vec<_> = self + .applied_batch + .cb_batch + .iter() + .flat_map(|(cb, _)| cb.write_trackers()) + .flat_map(|trackers| trackers.as_tracker_token()) + .collect(); + self.perf_context.report_metrics(&trackers); self.sync_log_hint = false; let data_size = self.kv_wb().data_size(); if data_size > APPLY_WB_SHRINK_SIZE { // Control the memory usage for the WriteBatch. self.kv_wb = self.engine.write_batch_with_cap(DEFAULT_APPLY_WB_SIZE); } else { - // Clear data, reuse the WriteBatch, this can reduce memory allocations and deallocations. + // Clear data, reuse the WriteBatch, this can reduce memory allocations and + // deallocations. self.kv_wb_mut().clear(); } self.kv_wb_last_bytes = 0; @@ -552,23 +630,28 @@ where batch_max_level, mut cb_batch, } = mem::replace(&mut self.applied_batch, ApplyCallbackBatch::new()); - // Call it before invoking callback for preventing Commit is executed before Prewrite is observed. + // Call it before invoking callback for preventing Commit is executed before + // Prewrite is observed. self.host .on_flush_applied_cmd_batch(batch_max_level, cmd_batch, &self.engine); // Invoke callbacks - let now = Instant::now(); + let now = std::time::Instant::now(); for (cb, resp) in cb_batch.drain(..) 
{ - if let Some(times) = cb.get_request_times() { - for t in times { - self.apply_time - .observe(duration_to_sec(now.saturating_duration_since(*t))); - } + for tracker in cb.write_trackers() { + tracker.observe(now, &self.apply_time, |t| &mut t.metrics.apply_time_nanos); } cb.invoke_with_response(resp); } self.apply_time.flush(); self.apply_wait.flush(); - need_sync + let res_count = self.uncommitted_res_count; + self.uncommitted_res_count = 0; + if let Some(seqno) = seqno { + for res in self.apply_res.iter_mut().rev().take(res_count) { + res.write_seqno.push(seqno); + } + } + (need_sync, seqno) } /// Finishes `Apply`s for the delegate. @@ -577,18 +660,31 @@ where delegate: &mut ApplyDelegate, results: VecDeque>, ) { - if !delegate.pending_remove { - delegate.write_apply_state(self.kv_wb_mut()); + if self.host.pre_persist(&delegate.region, true, None) { + if !delegate.pending_remove { + delegate.maybe_write_apply_state(self); + } + self.commit_opt(delegate, false); + } else { + debug!("do not persist when finish_for"; + "region" => ?delegate.region, + "tag" => &delegate.tag, + ); } - self.commit_opt(delegate, false); self.apply_res.push(ApplyRes { region_id: delegate.region_id(), apply_state: delegate.apply_state.clone(), + write_seqno: mem::take(&mut delegate.unfinished_write_seqno), exec_res: results, - metrics: delegate.metrics.clone(), - applied_index_term: delegate.applied_index_term, + metrics: mem::take(&mut delegate.metrics), + applied_term: delegate.applied_term, bucket_stat: delegate.buckets.clone().map(Box::new), }); + if !self.kv_wb().is_empty() { + // Pending writes not flushed, need to set seqno to following ApplyRes later + // after flushing + self.uncommitted_res_count += 1; + } } pub fn delta_bytes(&self) -> u64 { @@ -623,15 +719,16 @@ where // take raft log gc for example, we write kv WAL first, then write raft WAL, // if power failure happen, raft WAL may synced to disk, but kv WAL may not. // so we use sync-log flag here. 
- let is_synced = self.write_to_db(); + let (is_synced, _) = self.write_to_db(); if !self.apply_res.is_empty() { + fail_point!("before_nofity_apply_res"); let apply_res = mem::take(&mut self.apply_res); self.notifier.notify(apply_res); } let elapsed = t.saturating_elapsed(); - STORE_APPLY_LOG_HISTOGRAM.observe(duration_to_sec(elapsed) as f64); + STORE_APPLY_LOG_HISTOGRAM.observe(duration_to_sec(elapsed)); for mut inspector in std::mem::take(&mut self.pending_latency_inspect) { inspector.record_apply_process(elapsed); inspector.finish(); @@ -649,7 +746,7 @@ where } /// Calls the callback of `cmd` when the Region is removed. -fn notify_region_removed(region_id: u64, peer_id: u64, mut cmd: PendingCmd) { +fn notify_region_removed(region_id: u64, peer_id: u64, mut cmd: PendingCmd) { debug!( "region is removed, notify commands"; "region_id" => region_id, @@ -660,10 +757,10 @@ fn notify_region_removed(region_id: u64, peer_id: u64, mut cmd: PendingCmd) { +pub fn notify_req_region_removed(region_id: u64, cb: impl ErrorCallback) { let region_not_found = Error::RegionNotFound(region_id); let resp = cmd_resp::new_error(region_not_found); - cb.invoke_with_response(resp); + cb.report_error(resp); } /// Calls the callback of `cmd` when it can not be processed further. 
@@ -671,7 +768,7 @@ fn notify_stale_command( region_id: u64, peer_id: u64, term: u64, - mut cmd: PendingCmd, + mut cmd: PendingCmd, ) { info!( "command is stale, skip"; @@ -683,15 +780,15 @@ fn notify_stale_command( notify_stale_req(term, cmd.cb.take().unwrap()); } -pub fn notify_stale_req(term: u64, cb: Callback) { +pub fn notify_stale_req(term: u64, cb: impl ErrorCallback) { let resp = cmd_resp::err_resp(Error::StaleCommand, term); - cb.invoke_with_response(resp); + cb.report_error(resp); } -pub fn notify_stale_req_with_msg(term: u64, msg: String, cb: Callback) { +pub fn notify_stale_req_with_msg(term: u64, msg: String, cb: impl ErrorCallback) { let mut resp = cmd_resp::err_resp(Error::StaleCommand, term); resp.mut_header().mut_error().set_message(msg); - cb.invoke_with_response(resp); + cb.report_error(resp); } /// Checks if a write is needed to be issued before handling the command. @@ -738,9 +835,9 @@ fn has_high_latency_operation(cmd: &RaftCmdRequest) -> bool { fn should_sync_log(cmd: &RaftCmdRequest) -> bool { if cmd.has_admin_request() { if cmd.get_admin_request().get_cmd_type() == AdminCmdType::CompactLog { - // We do not need to sync WAL before compact log, because this request will send a msg to - // raft_gc_log thread to delete the entries before this index instead of deleting them in - // apply thread directly. + // We do not need to sync WAL before compact log, because this request will send + // a msg to raft_gc_log thread to delete the entries before this + // index instead of deleting them in apply thread directly. return false; } return true; @@ -758,6 +855,43 @@ fn should_sync_log(cmd: &RaftCmdRequest) -> bool { false } +fn can_witness_skip(entry: &Entry) -> bool { + // need to handle ConfChange entry type + if entry.get_entry_type() != EntryType::EntryNormal { + return false; + } + + // HACK: check admin request field in serialized data from `RaftCmdRequest` + // without deserializing all. 
It's done by checking the existence of the + // field number of `admin_request`. + // See the encoding in `write_to_with_cached_sizes()` of `RaftCmdRequest` in + // `raft_cmdpb.rs` for reference. + let mut is = CodedInputStream::from_bytes(entry.get_data()); + if is.eof().unwrap() { + return true; + } + let (mut field_number, wire_type) = is.read_tag_unpack().unwrap(); + // Header field is of number 1 + if field_number == 1 { + if wire_type != WireType::WireTypeLengthDelimited { + panic!("unexpected wire type"); + } + let len = is.read_raw_varint32().unwrap(); + // skip parsing the content of `Header` + is.consume(len as usize); + // read next field number + (field_number, _) = is.read_tag_unpack().unwrap(); + } + + // `Requests` field is of number 2 and `AdminRequest` field is of number 3. + // - If the next field is 2, there must be no admin request as in one + // `RaftCmdRequest`, either requests or admin_request is filled. + // - If the next field is 3, it's exactly an admin request. + // - If the next field is others, neither requests nor admin_request is filled, + // so there is no admin request. + field_number != 3 +} + /// A struct that stores the state related to Merge. /// /// When executing a `CommitMerge`, the source peer may have not applied @@ -768,9 +902,9 @@ fn should_sync_log(cmd: &RaftCmdRequest) -> bool { /// this struct. /// TODO: check whether generator/coroutine is a good choice in this case. struct WaitSourceMergeState { - /// A flag that indicates whether the source peer has applied to the required - /// index. If the source peer is ready, this flag should be set to the region id - /// of source peer. + /// A flag that indicates whether the source peer has applied to the + /// required index. If the source peer is ready, this flag should be set + /// to the region id of source peer. logs_up_to_date: Arc, } @@ -837,48 +971,56 @@ pub struct ApplyDelegate where EK: KvEngine, { - /// The ID of the peer. - id: u64, /// The term of the Region. 
term: u64, /// The Region information of the peer. region: Region, + /// The Peer information. + peer: metapb::Peer, /// Peer_tag, "[region region_id] peer_id". tag: String, /// If the delegate should be stopped from polling. - /// A delegate can be stopped in conf change, merge or requested by destroy message. + /// A delegate can be stopped in conf change, merge or requested by destroy + /// message. stopped: bool, /// The start time of the current round to execute commands. handle_start: Option, - /// Set to true when removing itself because of `ConfChangeType::RemoveNode`, and then - /// any following committed logs in same Ready should be applied failed. + /// Set to true when removing itself because of + /// `ConfChangeType::RemoveNode`, and then any following committed logs + /// in same Ready should be applied failed. pending_remove: bool, + /// Indicates whether the peer is waiting data. See more in `Peer`. + wait_data: bool, + /// The commands waiting to be committed and applied - pending_cmds: PendingCmdQueue, + pending_cmds: PendingCmdQueue>, /// The counter of pending request snapshots. See more in `Peer`. pending_request_snapshot_count: Arc, - /// Indicates the peer is in merging, if that compact log won't be performed. + /// Indicates the peer is in merging, if that compact log won't be + /// performed. is_merging: bool, /// Records the epoch version after the last merge. last_merge_version: u64, yield_state: Option>, - /// A temporary state that keeps track of the progress of the source peer state when - /// CommitMerge is unable to be executed. + /// A temporary state that keeps track of the progress of the source peer + /// state when CommitMerge is unable to be executed. wait_merge_state: Option, // ID of last region that reports ready. ready_source_region_id: u64, - /// TiKV writes apply_state to KV RocksDB, in one write batch together with kv data. + /// TiKV writes apply_state to KV RocksDB, in one write batch together with + /// kv data. 
/// - /// If we write it to Raft RocksDB, apply_state and kv data (Put, Delete) are in - /// separate WAL file. When power failure, for current raft log, apply_index may synced - /// to file, but KV data may not synced to file, so we will lose data. + /// If we write it to Raft RocksDB, apply_state and kv data (Put, Delete) + /// are in separate WAL file. When power failure, for current raft log, + /// apply_index may synced to file, but KV data may not synced to file, + /// so we will lose data. apply_state: RaftApplyState, /// The term of the raft log at applied index. - applied_index_term: u64, + applied_term: u64, /// The latest flushed applied index. last_flush_applied_index: u64, @@ -888,8 +1030,9 @@ where /// The local metrics, and it will be flushed periodically. metrics: ApplyMetrics, - /// Priority in batch system. When applying some commands which have high latency, - /// we decrease the priority of current fsm to reduce the impact on other normal commands. + /// Priority in batch system. When applying some commands which have high + /// latency, we decrease the priority of current fsm to reduce the + /// impact on other normal commands. priority: Priority, /// To fetch Raft entries for applying if necessary. 
@@ -899,6 +1042,8 @@ where trace: ApplyMemoryTrace, buckets: Option, + + unfinished_write_seqno: Vec, } impl ApplyDelegate @@ -907,13 +1052,14 @@ where { fn from_registration(reg: Registration) -> ApplyDelegate { ApplyDelegate { - id: reg.id, tag: format!("[region {}] {}", reg.region.get_id(), reg.id), + peer: find_peer_by_id(®.region, reg.id).unwrap().clone(), region: reg.region, pending_remove: false, + wait_data: false, last_flush_applied_index: reg.apply_state.get_applied_index(), apply_state: reg.apply_state, - applied_index_term: reg.applied_index_term, + applied_term: reg.applied_term, term: reg.term, stopped: false, handle_start: None, @@ -931,6 +1077,7 @@ where raft_engine: reg.raft_engine, trace: ApplyMemoryTrace::default(), buckets: None, + unfinished_write_seqno: vec![], } } @@ -939,10 +1086,11 @@ where } pub fn id(&self) -> u64 { - self.id + self.peer.get_id() } - /// Handles all the committed_entries, namely, applies the committed entries. + /// Handles all the committed_entries, namely, applies the committed + /// entries. fn handle_raft_committed_entries( &mut self, apply_ctx: &mut ApplyContext, @@ -952,9 +1100,9 @@ where return; } apply_ctx.prepare_for(self); - // If we send multiple ConfChange commands, only first one will be proposed correctly, - // others will be saved as a normal entry with no data, so we must re-propose these - // commands again. + // If we send multiple ConfChange commands, only first one will be proposed + // correctly, others will be saved as a normal entry with no data, so we + // must re-propose these commands again. 
apply_ctx.committed_count += committed_entries_drainer.len(); let mut results = VecDeque::new(); while let Some(entry) = committed_entries_drainer.next() { @@ -966,16 +1114,18 @@ where let expect_index = self.apply_state.get_applied_index() + 1; if expect_index != entry.get_index() { panic!( - "{} expect index {}, but got {}", + "{} expect index {}, but got {}, ctx {}", self.tag, expect_index, - entry.get_index() + entry.get_index(), + apply_ctx.tag, ); } - // NOTE: before v5.0, `EntryType::EntryConfChangeV2` entry is handled by `unimplemented!()`, - // which can break compatibility (i.e. old version tikv running on data written by new version tikv), - // but PD will reject old version tikv join the cluster, so this should not happen. + // NOTE: before v5.0, `EntryType::EntryConfChangeV2` entry is handled by + // `unimplemented!()`, which can break compatibility (i.e. old version tikv + // running on data written by new version tikv), but PD will reject old version + // tikv join the cluster, so this should not happen. let res = match entry.get_entry_type() { EntryType::EntryNormal => self.handle_raft_entry_normal(apply_ctx, &entry), EntryType::EntryConfChange | EntryType::EntryConfChangeV2 => { @@ -985,7 +1135,13 @@ where match res { ApplyResult::None => {} - ApplyResult::Res(res) => results.push_back(res), + ApplyResult::Res(res) => { + results.push_back(res); + if self.wait_data { + apply_ctx.committed_count -= committed_entries_drainer.len(); + break; + } + } ApplyResult::Yield | ApplyResult::WaitMergeSource(_) => { // Both cancel and merge will yield current processing. 
apply_ctx.committed_count -= committed_entries_drainer.len() + 1; @@ -1033,6 +1189,13 @@ where }); } + fn maybe_write_apply_state(&self, apply_ctx: &mut ApplyContext) { + let can_write = apply_ctx.host.pre_write_apply_state(&self.region); + if can_write { + self.write_apply_state(apply_ctx.kv_wb_mut()); + } + } + fn handle_raft_entry_normal( &mut self, apply_ctx: &mut ApplyContext, @@ -1049,50 +1212,71 @@ where let data = entry.get_data(); if !data.is_empty() { - let cmd = util::parse_data_at(data, index, &self.tag); - - if apply_ctx.yield_high_latency_operation && has_high_latency_operation(&cmd) { - self.priority = Priority::Low; - } - let mut has_unflushed_data = - self.last_flush_applied_index != self.apply_state.get_applied_index(); - if has_unflushed_data && should_write_to_engine(&cmd) - || apply_ctx.kv_wb().should_write_to_engine() - { - apply_ctx.commit(self); - if let Some(start) = self.handle_start.as_ref() { - if start.saturating_elapsed() >= apply_ctx.yield_duration { + if !self.peer.is_witness || !can_witness_skip(entry) { + let cmd = match util::parse_raft_cmd_request(data, index, term, &self.tag) { + util::RaftCmd::V1(cmd) => cmd, + util::RaftCmd::V2(simple_write_decoder) => { + if !apply_ctx.enable_v2_compatible_learner { + panic!( + "{} can not handle v2 command when enable_v2_compatible_learner is false", + self.tag + ); + } + simple_write_decoder.to_raft_cmd_request() + } + }; + if apply_ctx.yield_high_latency_operation && has_high_latency_operation(&cmd) { + self.priority = Priority::Low; + } + let mut has_unflushed_data = + self.last_flush_applied_index != self.apply_state.get_applied_index(); + if (has_unflushed_data && should_write_to_engine(&cmd) + || apply_ctx.kv_wb().should_write_to_engine()) + && apply_ctx.host.pre_persist(&self.region, false, Some(&cmd)) + { + apply_ctx.commit(self); + if self.metrics.written_bytes >= apply_ctx.yield_msg_size + || self + .handle_start + .as_ref() + .map_or(Duration::ZERO, Instant::saturating_elapsed) + 
>= apply_ctx.yield_duration + { return ApplyResult::Yield; } + has_unflushed_data = false; + } + if self.priority != apply_ctx.priority { + if has_unflushed_data { + apply_ctx.commit(self); + } + return ApplyResult::Yield; } - has_unflushed_data = false; + + return self.process_raft_cmd(apply_ctx, index, term, cmd); } - if self.priority != apply_ctx.priority { - if has_unflushed_data { - apply_ctx.commit(self); + } else { + // we should observe empty cmd, aka leader change, + // read index during confchange, or other situations. + apply_ctx.host.on_empty_cmd(&self.region, index, term); + + // 1. When a peer become leader, it will send an empty entry. + // 2. When a leader tries to read index during transferring leader, + // it will also propose an empty entry. But that entry will not contain + // any associated callback. So no need to clear callback. + while let Some(mut cmd) = self.pending_cmds.pop_normal(u64::MAX, term - 1) { + if let Some(cb) = cmd.cb.take() { + apply_ctx + .applied_batch + .push_cb(cb, cmd_resp::err_resp(Error::StaleCommand, term)); } - return ApplyResult::Yield; } - - return self.process_raft_cmd(apply_ctx, index, term, cmd); } - // TOOD(cdc): should we observe empty cmd, aka leader change? self.apply_state.set_applied_index(index); - self.applied_index_term = term; + self.applied_term = term; assert!(term > 0); - // 1. When a peer become leader, it will send an empty entry. - // 2. When a leader tries to read index during transferring leader, - // it will also propose an empty entry. But that entry will not contain - // any associated callback. So no need to clear callback. 
- while let Some(mut cmd) = self.pending_cmds.pop_normal(u64::MAX, term - 1) { - if let Some(cb) = cmd.cb.take() { - apply_ctx - .applied_batch - .push_cb(cb, cmd_resp::err_resp(Error::StaleCommand, term)); - } - } ApplyResult::None } @@ -1178,7 +1362,7 @@ where apply_ctx: &mut ApplyContext, index: u64, term: u64, - cmd: RaftCmdRequest, + req: RaftCmdRequest, ) -> ApplyResult { if index == 0 { panic!( @@ -1188,10 +1372,10 @@ where } // Set sync log hint if the cmd requires so. - apply_ctx.sync_log_hint |= should_sync_log(&cmd); + apply_ctx.sync_log_hint |= should_sync_log(&req); - apply_ctx.host.pre_apply(&self.region, &cmd); - let (mut resp, exec_result) = self.apply_raft_cmd(apply_ctx, index, term, &cmd); + apply_ctx.host.pre_apply(&self.region, &req); + let (mut cmd, exec_result, should_write) = self.apply_raft_cmd(apply_ctx, index, term, req); if let ApplyResult::WaitMergeSource(_) = exec_result { return exec_result; } @@ -1205,84 +1389,165 @@ where // TODO: if we have exec_result, maybe we should return this callback too. Outer // store will call it after handing exec result. - cmd_resp::bind_term(&mut resp, self.term); - let cmd_cb = self.find_pending(index, term, is_conf_change_cmd(&cmd)); - let cmd = Cmd::new(index, cmd, resp); + cmd_resp::bind_term(&mut cmd.response, self.term); + let cmd_cb = self.find_pending(index, term, is_conf_change_cmd(&cmd.request)); apply_ctx .applied_batch .push(cmd_cb, cmd, &self.observe_info, self.region_id()); + if should_write { + // An observer shall prevent a write_apply_state here by not return true + // when `post_exec`. + self.write_apply_state(apply_ctx.kv_wb_mut()); + apply_ctx.commit(self); + } exec_result } /// Applies raft command. /// /// An apply operation can fail in the following situations: - /// 1. it encounters an error that will occur on all stores, it can continue - /// applying next entry safely, like epoch not match for example; - /// 2. 
it encounters an error that may not occur on all stores, in this case - /// we should try to apply the entry again or panic. Considering that this - /// usually due to disk operation fail, which is rare, so just panic is ok. + /// - it encounters an error that will occur on all stores, it can + /// continue applying next entry safely, like epoch not match for + /// example; + /// - it encounters an error that may not occur on all stores, in this + /// case we should try to apply the entry again or panic. Considering + /// that this usually due to disk operation fail, which is rare, so just + /// panic is ok. fn apply_raft_cmd( &mut self, ctx: &mut ApplyContext, index: u64, term: u64, - req: &RaftCmdRequest, - ) -> (RaftCmdResponse, ApplyResult) { + req: RaftCmdRequest, + ) -> (Cmd, ApplyResult, bool) { // if pending remove, apply should be aborted already. assert!(!self.pending_remove); - ctx.exec_log_index = index; - ctx.exec_log_term = term; - ctx.kv_wb_mut().set_save_point(); - let mut origin_epoch = None; // Remember if the raft cmd fails to be applied, it must have no side effects. // E.g. `RaftApplyState` must not be changed. - let (resp, exec_result) = match self.exec_raft_cmd(ctx, req) { - Ok(a) => { - ctx.kv_wb_mut().pop_save_point().unwrap(); - if req.has_admin_request() { - origin_epoch = Some(self.region.get_region_epoch().clone()); - } - a + + let mut origin_epoch = None; + let (resp, exec_result) = if ctx.host.pre_exec(&self.region, &req, index, term) { + // One of the observers want to filter execution of the command. + let mut resp = RaftCmdResponse::default(); + if !req.get_header().get_uuid().is_empty() { + let uuid = req.get_header().get_uuid().to_vec(); + resp.mut_header().set_uuid(uuid); } - Err(e) => { - // clear dirty values. - ctx.kv_wb_mut().rollback_to_save_point().unwrap(); - match e { - Error::EpochNotMatch(..) 
=> debug!( - "epoch not match"; - "region_id" => self.region_id(), - "peer_id" => self.id(), - "err" => ?e - ), - _ => error!(?e; - "execute raft command"; - "region_id" => self.region_id(), - "peer_id" => self.id(), - ), + (resp, ApplyResult::None) + } else { + ctx.exec_log_index = index; + ctx.exec_log_term = term; + ctx.kv_wb_mut().set_save_point(); + let (resp, exec_result) = match self.exec_raft_cmd(ctx, &req) { + Ok(a) => { + ctx.kv_wb_mut().pop_save_point().unwrap(); + if req.has_admin_request() { + origin_epoch = Some(self.region.get_region_epoch().clone()); + } + a } - (cmd_resp::new_error(e), ApplyResult::None) - } + Err(e) => { + // clear dirty values. + ctx.kv_wb_mut().rollback_to_save_point().unwrap(); + match e { + Error::EpochNotMatch(..) => debug!( + "epoch not match"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "err" => ?e + ), + Error::FlashbackInProgress(..) => debug!( + "flashback is in process"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "err" => ?e + ), + Error::FlashbackNotPrepared(..) => debug!( + "flashback is not prepared"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "err" => ?e + ), + _ => error!(?e; + "execute raft command"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + ), + } + (cmd_resp::new_error(e), ApplyResult::None) + } + }; + (resp, exec_result) }; + + let cmd = Cmd::new(index, term, req, resp); if let ApplyResult::WaitMergeSource(_) = exec_result { - return (resp, exec_result); + return (cmd, exec_result, false); } self.apply_state.set_applied_index(index); - self.applied_index_term = term; + self.applied_term = term; + + let (modified_region, mut pending_handle_ssts) = match exec_result { + ApplyResult::Res(ref e) => match e { + ExecResult::SplitRegion { ref derived, .. } => (Some(derived.clone()), None), + ExecResult::PrepareMerge { ref region, .. } => (Some(region.clone()), None), + ExecResult::CommitMerge { ref region, .. 
} => (Some(region.clone()), None), + ExecResult::RollbackMerge { ref region, .. } => (Some(region.clone()), None), + ExecResult::IngestSst { ref ssts } => (None, Some(ssts.clone())), + ExecResult::SetFlashbackState { ref region } => (Some(region.clone()), None), + _ => (None, None), + }, + _ => (None, None), + }; + let mut apply_ctx_info = ApplyCtxInfo { + pending_handle_ssts: &mut pending_handle_ssts, + delete_ssts: &mut ctx.delete_ssts, + pending_delete_ssts: &mut ctx.pending_delete_ssts, + }; + let should_write = ctx.host.post_exec( + &self.region, + &cmd, + &self.apply_state, + &RegionState { + peer_id: self.id(), + pending_remove: self.pending_remove, + modified_region, + }, + &mut apply_ctx_info, + ); + match pending_handle_ssts { + None => (), + Some(mut v) => { + if !v.is_empty() { + // All elements in `pending_handle_ssts` should be moved into either + // `delete_ssts` or `pending_delete_ssts`, once handled by by any of the + // `post_exec` observers. So a non-empty + // `pending_handle_ssts` here indicates no `post_exec` handled. + ctx.delete_ssts.append(&mut v); + } + RAFT_APPLYING_SST_GAUGE + .with_label_values(&["pending_delete"]) + .set(ctx.pending_delete_ssts.len() as i64); + } + } if let ApplyResult::Res(ref exec_result) = exec_result { match *exec_result { ExecResult::ChangePeer(ref cp) => { self.region = cp.region.clone(); + if let Some(p) = find_peer_by_id(&self.region, self.id()) { + self.peer = p.clone(); + } } ExecResult::ComputeHash { .. } | ExecResult::VerifyHash { .. } | ExecResult::CompactLog { .. } | ExecResult::DeleteRange { .. } | ExecResult::IngestSst { .. } - | ExecResult::TransferLeader { .. } => {} + | ExecResult::TransferLeader { .. } + | ExecResult::HasPendingCompactCmd(..) => {} ExecResult::SplitRegion { ref derived, .. 
} => { self.region = derived.clone(); self.metrics.size_diff_hint = 0; @@ -1300,12 +1565,22 @@ where self.region = region.clone(); self.is_merging = false; } + ExecResult::SetFlashbackState { ref region } => { + self.region = region.clone(); + } + ExecResult::BatchSwitchWitness(ref switches) => { + self.region = switches.region.clone(); + if let Some(p) = find_peer_by_id(&self.region, self.id()) { + self.peer = p.clone(); + } + } } } if let Some(epoch) = origin_epoch { - let cmd_type = req.get_admin_request().get_cmd_type(); + let cmd_type = cmd.request.get_admin_request().get_cmd_type(); let epoch_state = admin_cmd_epoch_lookup(cmd_type); - // The change-epoch behavior **MUST BE** equal to the settings in `admin_cmd_epoch_lookup` + // The change-epoch behavior **MUST BE** equal to the settings in + // `admin_cmd_epoch_lookup` if (epoch_state.change_ver && epoch.get_version() == self.region.get_region_epoch().get_version()) || (epoch_state.change_conf_ver @@ -1314,7 +1589,7 @@ where panic!( "{} apply admin cmd {:?} but epoch change is not expected, epoch state {:?}, before {:?}, after {:?}", self.tag, - req, + cmd.request, epoch_state, epoch, self.region.get_region_epoch() @@ -1322,17 +1597,21 @@ where } } - (resp, exec_result) + (cmd, exec_result, should_write) } fn destroy(&mut self, apply_ctx: &mut ApplyContext) { self.stopped = true; apply_ctx.router.close(self.region_id()); + let id = self.id(); for cmd in self.pending_cmds.normals.drain(..) { - notify_region_removed(self.region.get_id(), self.id, cmd); + notify_region_removed(self.region.get_id(), id, cmd); } if let Some(cmd) = self.pending_cmds.conf_change.take() { - notify_region_removed(self.region.get_id(), self.id, cmd); + notify_region_removed(self.region.get_id(), id, cmd); + } + for cmd in self.pending_cmds.compacts.drain(..) 
{ + notify_region_removed(self.region.get_id(), id, cmd); } self.yield_state = None; @@ -1351,6 +1630,9 @@ where if let Some(cmd) = self.pending_cmds.conf_change.take() { notify_stale_command(region_id, peer_id, self.term, cmd); } + for cmd in self.pending_cmds.compacts.drain(..) { + notify_region_removed(self.region.get_id(), peer_id, cmd); + } } fn clear_all_commands_silently(&mut self) { @@ -1360,6 +1642,9 @@ where if let Some(mut cmd) = self.pending_cmds.conf_change.take() { cmd.cb.take(); } + for mut cmd in self.pending_cmds.compacts.drain(..) { + cmd.cb.take(); + } } } @@ -1376,7 +1661,14 @@ where // Include region for epoch not match after merge may cause key not in range. let include_region = req.get_header().get_region_epoch().get_version() >= self.last_merge_version; - check_region_epoch(req, &self.region, include_region)?; + check_req_region_epoch(req, &self.region, include_region)?; + check_flashback_state( + self.region.is_in_flashback, + self.region.flashback_start_ts, + req, + self.region_id(), + false, + )?; if req.has_admin_request() { self.exec_admin_cmd(ctx, req) } else { @@ -1411,10 +1703,14 @@ where AdminCmdType::TransferLeader => self.exec_transfer_leader(request, ctx.exec_log_term), AdminCmdType::ComputeHash => self.exec_compute_hash(ctx, request), AdminCmdType::VerifyHash => self.exec_verify_hash(ctx, request), - // TODO: is it backward compatible to add new cmd_type? 
AdminCmdType::PrepareMerge => self.exec_prepare_merge(ctx, request), AdminCmdType::CommitMerge => self.exec_commit_merge(ctx, request), AdminCmdType::RollbackMerge => self.exec_rollback_merge(ctx, request), + AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { + self.exec_flashback(ctx, request) + } + AdminCmdType::UpdateGcPeer => Err(box_err!("v2 only command and it's safe to skip")), + AdminCmdType::BatchSwitchWitness => self.exec_batch_switch_witness(ctx, request), AdminCmdType::InvalidAdmin => Err(box_err!("unsupported admin command type")), }?; response.set_cmd_type(cmd_type); @@ -1492,7 +1788,6 @@ where }; dont_delete_ingested_sst_fp(); } - ctx.delete_ssts.append(&mut ssts.clone()); ApplyResult::Res(ExecResult::IngestSst { ssts }) } else { ApplyResult::None @@ -1565,7 +1860,8 @@ where keys::data_key_with_buffer(key, &mut ctx.key_buffer); let key = ctx.key_buffer.as_slice(); - // since size_diff_hint is not accurate, so we just skip calculate the value size. + // since size_diff_hint is not accurate, so we just skip calculate the value + // size. 
self.metrics.size_diff_hint -= key.len() as i64; if !req.get_delete().get_cf().is_empty() { let cf = req.get_delete().get_cf(); @@ -1714,24 +2010,56 @@ where mod confchange_cmd_metric { use super::*; - fn write_metric(cct: ConfChangeType, kind: &str) { - let metric = match cct { - ConfChangeType::AddNode => "add_peer", - ConfChangeType::RemoveNode => "remove_peer", - ConfChangeType::AddLearnerNode => "add_learner", + pub fn inc_all(cct: ConfChangeType) { + let metrics = match cct { + ConfChangeType::AddNode => &PEER_ADMIN_CMD_COUNTER.add_peer, + ConfChangeType::RemoveNode => &PEER_ADMIN_CMD_COUNTER.remove_peer, + ConfChangeType::AddLearnerNode => &PEER_ADMIN_CMD_COUNTER.add_learner, }; - PEER_ADMIN_CMD_COUNTER_VEC - .with_label_values(&[metric, kind]) - .inc(); + metrics.all.inc(); } - pub fn inc_all(cct: ConfChangeType) { - write_metric(cct, "all") + pub fn inc_success(cct: ConfChangeType) { + let metrics = match cct { + ConfChangeType::AddNode => &PEER_ADMIN_CMD_COUNTER.add_peer, + ConfChangeType::RemoveNode => &PEER_ADMIN_CMD_COUNTER.remove_peer, + ConfChangeType::AddLearnerNode => &PEER_ADMIN_CMD_COUNTER.add_learner, + }; + metrics.success.inc(); } +} - pub fn inc_success(cct: ConfChangeType) { - write_metric(cct, "success") +pub fn validate_batch_split(req: &AdminRequest, region: &Region) -> Result<()> { + if req.get_splits().get_requests().is_empty() { + return Err(box_err!("missing split requests")); } + + let split_reqs: &[SplitRequest] = req.get_splits().get_requests(); + let mut last_key = region.get_start_key(); + for req in split_reqs { + let split_key = req.get_split_key(); + if split_key.is_empty() { + return Err(box_err!("missing split key")); + } + + if split_key <= last_key { + return Err(box_err!("invalid split request: {:?}", split_reqs)); + } + + if req.get_new_peer_ids().len() != region.get_peers().len() { + return Err(box_err!( + "invalid new peer id count, need {:?}, but got {:?}", + region.get_peers(), + req.get_new_peer_ids() + )); + } + + 
last_key = req.get_split_key(); + } + + util::check_key_in_region_exclusive(last_key, region)?; + + Ok(()) } // Admin commands related. @@ -1739,6 +2067,8 @@ impl ApplyDelegate where EK: KvEngine, { + // Legacy code for compatibility. All new conf changes are dispatched by + // ChangePeerV2 now. fn exec_change_peer( &mut self, ctx: &mut ApplyContext, @@ -1753,12 +2083,12 @@ where fail_point!( "apply_on_conf_change_1_3_1", - (self.id == 1 || self.id == 3) && self.region_id() == 1, + (self.id() == 1 || self.id() == 3) && self.region_id() == 1, |_| panic!("should not use return") ); fail_point!( "apply_on_conf_change_3_1", - self.id == 3 && self.region_id() == 1, + self.id() == 3 && self.region_id() == 1, |_| panic!("should not use return") ); fail_point!( @@ -1783,7 +2113,7 @@ where let add_ndoe_fp = || { fail_point!( "apply_on_add_node_1_2", - self.id == 2 && self.region_id() == 1, + self.id() == 2 && self.region_id() == 1, |_| {} ) }; @@ -1794,7 +2124,7 @@ where .inc(); let mut exists = false; - if let Some(p) = util::find_peer_mut(&mut region, store_id) { + if let Some(p) = find_peer_mut(&mut region, store_id) { exists = true; if !is_learner(p) || p.get_id() != peer.get_id() { error!( @@ -1834,7 +2164,7 @@ where .with_label_values(&["remove_peer", "all"]) .inc(); - if let Some(p) = util::remove_peer(&mut region, store_id) { + if let Some(p) = remove_peer(&mut region, store_id) { // Considering `is_learner` flag in `Peer` here is by design. if &p != peer { error!( @@ -1850,7 +2180,7 @@ where p )); } - if self.id == peer.get_id() { + if self.id() == peer.get_id() { // Remove ourself, we will destroy all region data later. // So we need not to apply following logs. 
self.stopped = true; @@ -1887,7 +2217,7 @@ where .with_label_values(&["add_learner", "all"]) .inc(); - if util::find_peer(®ion, store_id).is_some() { + if find_peer(®ion, store_id).is_some() { error!( "can't add duplicated learner"; "region_id" => self.region_id(), @@ -1995,7 +2325,7 @@ where confchange_cmd_metric::inc_all(change_type); - if let Some(exist_peer) = util::find_peer(®ion, store_id) { + if let Some(exist_peer) = find_peer(®ion, store_id) { let r = exist_peer.get_role(); if r == PeerRole::IncomingVoter || r == PeerRole::DemotingVoter { panic!( @@ -2004,7 +2334,7 @@ where ); } } - match (util::find_peer_mut(&mut region, store_id), change_type) { + match (find_peer_mut(&mut region, store_id), change_type) { (None, ConfChangeType::AddNode) => { let mut peer = peer.clone(); match kind { @@ -2043,6 +2373,7 @@ where // The peer is already the requested role || (role, change_type) == (PeerRole::Voter, ConfChangeType::AddNode) || (role, change_type) == (PeerRole::Learner, ConfChangeType::AddLearnerNode) + || exist_peer.get_is_witness() != peer.get_is_witness() { error!( "can't add duplicated peer"; @@ -2050,7 +2381,7 @@ where "peer_id" => self.id(), "peer" => ?peer, "exist peer" => ?exist_peer, - "confchnage type" => ?change_type, + "confchange type" => ?change_type, "region" => ?&self.region ); return Err(box_err!( @@ -2096,7 +2427,7 @@ where self.region )); } - match util::remove_peer(&mut region, store_id) { + match remove_peer(&mut region, store_id) { Some(p) => { if &p != peer { error!( @@ -2104,7 +2435,7 @@ where "region_id" => self.region_id(), "peer_id" => self.id(), "expect_peer" => ?peer, - "get_peeer" => ?p + "get_peer" => ?p ); return Err(box_err!( "remove unmatched peer: expect: {:?}, get {:?}, ignore", @@ -2112,7 +2443,7 @@ where p )); } - if self.id == peer.get_id() { + if self.id() == peer.get_id() { // Remove ourself, we will destroy all region data later. // So we need not to apply following logs. 
self.stopped = true; @@ -2182,9 +2513,9 @@ where .mut_splits() .set_right_derive(split.get_right_derive()); admin_req.mut_splits().mut_requests().push(split); - // This method is executed only when there are unapplied entries after being restarted. - // So there will be no callback, it's OK to return a response that does not matched - // with its request. + // This method is executed only when there are unapplied entries after being + // restarted. So there will be no callback, it's OK to return a response + // that does not matched with its request. self.exec_batch_split(ctx, &admin_req) } @@ -2196,44 +2527,21 @@ where fail_point!("apply_before_split"); fail_point!( "apply_before_split_1_3", - self.id == 3 && self.region_id() == 1, + self.id() == 3 && self.region_id() == 1, |_| { unreachable!() } ); PEER_ADMIN_CMD_COUNTER.batch_split.all.inc(); - let split_reqs = req.get_splits(); - let right_derive = split_reqs.get_right_derive(); - if split_reqs.get_requests().is_empty() { - return Err(box_err!("missing split requests")); - } let mut derived = self.region.clone(); - let new_region_cnt = split_reqs.get_requests().len(); - let mut regions = Vec::with_capacity(new_region_cnt + 1); - let mut keys: VecDeque> = VecDeque::with_capacity(new_region_cnt + 1); - for req in split_reqs.get_requests() { - let split_key = req.get_split_key(); - if split_key.is_empty() { - return Err(box_err!("missing split key")); - } - if split_key - <= keys - .back() - .map_or_else(|| derived.get_start_key(), Vec::as_slice) - { - return Err(box_err!("invalid split request: {:?}", split_reqs)); - } - if req.get_new_peer_ids().len() != derived.get_peers().len() { - return Err(box_err!( - "invalid new peer id count, need {:?}, but got {:?}", - derived.get_peers(), - req.get_new_peer_ids() - )); - } - keys.push_back(split_key.to_vec()); - } + validate_batch_split(req, &derived)?; - util::check_key_in_region(keys.back().unwrap(), &self.region)?; + let split_reqs = req.get_splits(); + let mut 
keys: VecDeque<_> = split_reqs + .get_requests() + .iter() + .map(|req| req.get_split_key().to_vec()) + .collect(); info!( "split region"; @@ -2242,20 +2550,28 @@ where "region" => ?derived, "keys" => %KeysInfoFormatter(keys.iter()), ); + + let new_region_cnt = split_reqs.get_requests().len(); let new_version = derived.get_region_epoch().get_version() + new_region_cnt as u64; derived.mut_region_epoch().set_version(new_version); + + let right_derive = split_reqs.get_right_derive(); + let mut regions = Vec::with_capacity(new_region_cnt + 1); // Note that the split requests only contain ids for new regions, so we need // to handle new regions and old region separately. if right_derive { - // So the range of new regions is [old_start_key, split_key1, ..., last_split_key]. + // So the range of new regions is [old_start_key, split_key1, ..., + // last_split_key]. keys.push_front(derived.get_start_key().to_vec()); } else { - // So the range of new regions is [split_key1, ..., last_split_key, old_end_key]. + // So the range of new regions is [split_key1, ..., last_split_key, + // old_end_key]. keys.push_back(derived.get_end_key().to_vec()); derived.set_end_key(keys.front().unwrap().to_vec()); regions.push(derived.clone()); } + // Init split regions' meta info let mut new_split_regions: HashMap = HashMap::default(); for req in split_reqs.get_requests() { let mut new_region = Region::default(); @@ -2274,7 +2590,7 @@ where new_split_regions.insert( new_region.get_id(), NewSplitPeer { - peer_id: util::find_peer(&new_region, ctx.store_id).unwrap().get_id(), + peer_id: find_peer(&new_region, ctx.store_id).unwrap().get_id(), result: None, }, ); @@ -2286,6 +2602,11 @@ where regions.push(derived.clone()); } + // Generally, a peer is created in pending_create_peers when it is + // created by raft_message (or by split here) and removed from + // pending_create_peers when it has applied the snapshot. 
So, if the + // peer of the split region is already created by raft_message in + // pending_create_peers ,we decide to replace it. let mut replace_regions = HashSet::default(); { let mut pending_create_peers = ctx.pending_create_peers.lock().unwrap(); @@ -2331,6 +2652,9 @@ where self.tag, region_id, new_split_peer.peer_id, state ) } + // If the peer's state is already persisted, add some info in + // new_split_peer.result so that we will skip this region in later + // executions. already_exist_regions.push((*region_id, new_split_peer.peer_id)); new_split_peer.result = Some(format!("state {:?} exist in kv engine", state)); } @@ -2386,7 +2710,7 @@ where fail_point!( "apply_after_split_1_3", - self.id == 3 && self.region_id() == 1, + self.id() == 3 && self.region_id() == 1, |_| { unreachable!() } ); @@ -2416,7 +2740,7 @@ where let prepare_merge = req.get_prepare_merge(); let index = prepare_merge.get_min_index(); - let first_index = peer_storage::first_index(&self.apply_state); + let first_index = entry_storage::first_index(&self.apply_state); if index < first_index { // We filter `CompactLog` command before. panic!( @@ -2466,15 +2790,20 @@ where // The target peer should send missing log entries to the source peer. // // So, the merge process order would be: - // 1. `exec_commit_merge` in target apply fsm and send `CatchUpLogs` to source peer fsm - // 2. `on_catch_up_logs_for_merge` in source peer fsm - // 3. if the source peer has already executed the corresponding `on_ready_prepare_merge`, set pending_remove and jump to step 6 - // 4. ... (raft append and apply logs) - // 5. `on_ready_prepare_merge` in source peer fsm and set pending_remove (means source region has finished applying all logs) - // 6. `logs_up_to_date_for_merge` in source apply fsm (destroy its apply fsm and send Noop to trigger the target apply fsm) - // 7. resume `exec_commit_merge` in target apply fsm - // 8. 
`on_ready_commit_merge` in target peer fsm and send `MergeResult` to source peer fsm - // 9. `on_merge_result` in source peer fsm (destroy itself) + // - `exec_commit_merge` in target apply fsm and send `CatchUpLogs` to source + // peer fsm + // - `on_catch_up_logs_for_merge` in source peer fsm + // - if the source peer has already executed the corresponding + // `on_ready_prepare_merge`, set pending_remove and jump to step 6 + // - ... (raft append and apply logs) + // - `on_ready_prepare_merge` in source peer fsm and set pending_remove (means + // source region has finished applying all logs) + // - `logs_up_to_date_for_merge` in source apply fsm (destroy its apply fsm and + // send Noop to trigger the target apply fsm) + // - resume `exec_commit_merge` in target apply fsm + // - `on_ready_commit_merge` in target peer fsm and send `MergeResult` to source + // peer fsm + // - `on_merge_result` in source peer fsm (destroy itself) fn exec_commit_merge( &mut self, ctx: &mut ApplyContext, @@ -2485,7 +2814,7 @@ where let apply_before_commit_merge = || { fail_point!( "apply_before_commit_merge_except_1_4", - self.region_id() == 1 && self.id != 4, + self.region_id() == 1 && self.id() != 4, |_| {} ); }; @@ -2652,15 +2981,118 @@ where )) } + fn exec_flashback( + &self, + ctx: &mut ApplyContext, + req: &AdminRequest, + ) -> Result<(AdminResponse, ApplyResult)> { + let is_in_flashback = req.get_cmd_type() == AdminCmdType::PrepareFlashback; + // Modify the region meta in memory. + let mut region = self.region.clone(); + region.set_is_in_flashback(is_in_flashback); + region.set_flashback_start_ts(req.get_prepare_flashback().get_start_ts()); + // Modify the `RegionLocalState` persisted in disk. 
+ write_peer_state(ctx.kv_wb_mut(), ®ion, PeerState::Normal, None).unwrap_or_else(|e| { + panic!( + "{} failed to change the flashback state to {} for region {:?}: {:?}", + self.tag, is_in_flashback, region, e + ) + }); + + match req.get_cmd_type() { + AdminCmdType::PrepareFlashback => { + PEER_ADMIN_CMD_COUNTER.prepare_flashback.success.inc(); + } + AdminCmdType::FinishFlashback => { + PEER_ADMIN_CMD_COUNTER.finish_flashback.success.inc(); + } + _ => unreachable!(), + } + Ok(( + AdminResponse::default(), + ApplyResult::Res(ExecResult::SetFlashbackState { region }), + )) + } + + // When the first return value is true, it means that we have updated + // `RaftApplyState`, and the caller needs to do persistence. + fn try_compact_log( + &mut self, + voter_replicated_index: u64, + voter_replicated_term: u64, + ) -> Result<(bool, Option>)> { + PEER_ADMIN_CMD_COUNTER.compact.all.inc(); + let first_index = entry_storage::first_index(&self.apply_state); + + if self.is_merging { + info!( + "in merging mode, skip compact"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "voter_replicated_index" => voter_replicated_index, + ); + return Ok((false, None)); + } + + // When the witness restarted, the pending compact cmd has been lost, so use + // `voter_replicated_index` for gc to avoid log accumulation. + if !self.pending_cmds.has_compact() { + if voter_replicated_index <= first_index { + debug!( + "voter_replicated_index <= first index, no need to compact"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "compact_index" => voter_replicated_index, + "first_index" => first_index, + ); + return Ok((false, Some(ExecResult::HasPendingCompactCmd(false)))); + } + // compact failure is safe to be omitted, no need to assert. 
+ compact_raft_log( + &self.tag, + &mut self.apply_state, + voter_replicated_index, + voter_replicated_term, + )?; + PEER_ADMIN_CMD_COUNTER.compact.success.inc(); + return Ok((true, Some(ExecResult::HasPendingCompactCmd(false)))); + } + + match self.pending_cmds.pop_compact(voter_replicated_index) { + Some(cmd) => { + // compact failure is safe to be omitted, no need to assert. + compact_raft_log(&self.tag, &mut self.apply_state, cmd.index, cmd.term)?; + PEER_ADMIN_CMD_COUNTER.compact.success.inc(); + Ok(( + true, + Some(ExecResult::CompactLog { + state: self.apply_state.get_truncated_state().clone(), + first_index, + has_pending: self.pending_cmds.has_compact(), + }), + )) + } + None => { + info!( + "latest voter_replicated_index < compact_index, skip"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "voter_replicated_index" => voter_replicated_index, + ); + Ok((false, None)) + } + } + } + fn exec_compact_log( &mut self, req: &AdminRequest, ) -> Result<(AdminResponse, ApplyResult)> { PEER_ADMIN_CMD_COUNTER.compact.all.inc(); - let compact_index = req.get_compact_log().get_compact_index(); + let mut compact_index = req.get_compact_log().get_compact_index(); let resp = AdminResponse::default(); - let first_index = peer_storage::first_index(&self.apply_state); + let first_index = entry_storage::first_index(&self.apply_state); if compact_index <= first_index { debug!( "compact index <= first index, no need to compact"; @@ -2681,7 +3113,7 @@ where return Ok((resp, ApplyResult::None)); } - let compact_term = req.get_compact_log().get_compact_term(); + let mut compact_term = req.get_compact_log().get_compact_term(); // TODO: add unit tests to cover all the message integrity checks. 
if compact_term == 0 { info!( @@ -2696,6 +3128,44 @@ where )); } + let voter_replicated_index = req.get_compact_log().get_voter_replicated_index(); + // If there is any voter lagging behind, the log truncation of the witness + // shouldn't be triggered even if it's force mode(raft log size/count exceeds + // the threshold or raft engine purge), otherwise the witness can't help the + // lagging voter catch up logs when leader is down. In this situation Compact + // index should be queued. If witness receives a voter_replicated_index + // that is larger than the pending compact index, logs can be deleted. + if self.peer.is_witness { + if voter_replicated_index < compact_index { + self.pending_cmds.push_compact(PendingCmd::new( + compact_index, + compact_term, + Callback::None, + )); + match self.pending_cmds.pop_compact(voter_replicated_index) { + Some(cmd) => { + compact_index = cmd.index; + compact_term = cmd.term; + } + None => { + info!( + "voter_replicated_index < compact_index, skip"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "command" => ?req.get_compact_log() + ); + return Ok(( + resp, + ApplyResult::Res(ExecResult::HasPendingCompactCmd(true)), + )); + } + } + } else { + for mut cmd in self.pending_cmds.compacts.drain(..) { + cmd.cb.take().unwrap(); + } + } + } // compact failure is safe to be omitted, no need to assert. compact_raft_log( &self.tag, @@ -2711,6 +3181,7 @@ where ApplyResult::Res(ExecResult::CompactLog { state: self.apply_state.get_truncated_state().clone(), first_index, + has_pending: self.pending_cmds.has_compact(), }), )) } @@ -2725,7 +3196,7 @@ where let peer = req.get_transfer_leader().get_peer(); // Only execute TransferLeader if the expected new leader is self. 
- if peer.get_id() == self.id { + if peer.get_id() == self.id() { Ok((resp, ApplyResult::Res(ExecResult::TransferLeader { term }))) } else { Ok((resp, ApplyResult::None)) @@ -2740,16 +3211,20 @@ where let resp = AdminResponse::default(); Ok(( resp, - ApplyResult::Res(ExecResult::ComputeHash { - region: self.region.clone(), - index: ctx.exec_log_index, - context: req.get_compute_hash().get_context().to_vec(), - // This snapshot may be held for a long time, which may cause too many - // open files in rocksdb. - // TODO: figure out another way to do consistency check without snapshot - // or short life snapshot. - snap: ctx.engine.snapshot(), - }), + if self.peer.is_witness { + ApplyResult::None + } else { + ApplyResult::Res(ExecResult::ComputeHash { + region: self.region.clone(), + index: ctx.exec_log_index, + context: req.get_compute_hash().get_context().to_vec(), + // This snapshot may be held for a long time, which may cause too many + // open files in rocksdb. + // TODO: figure out another way to do consistency check without snapshot + // or short life snapshot. 
+ snap: ctx.engine.snapshot(), + }) + }, )) } @@ -2758,11 +3233,14 @@ where _: &ApplyContext, req: &AdminRequest, ) -> Result<(AdminResponse, ApplyResult)> { + let resp = AdminResponse::default(); + if self.peer.is_witness { + return Ok((resp, ApplyResult::None)); + } let verify_req = req.get_verify_hash(); let index = verify_req.get_index(); let context = verify_req.get_context().to_vec(); let hash = verify_req.get_hash().to_vec(); - let resp = AdminResponse::default(); Ok(( resp, ApplyResult::Res(ExecResult::VerifyHash { @@ -2773,20 +3251,109 @@ where )) } - fn update_memory_trace(&mut self, event: &mut TraceEvent) { - let pending_cmds = self.pending_cmds.heap_size(); - let merge_yield = if let Some(ref mut state) = self.yield_state { - if state.heap_size.is_none() { - state.heap_size = Some(state.heap_size()); - } - state.heap_size.unwrap() - } else { - 0 - }; - - let task = ApplyMemoryTrace { - pending_cmds, - merge_yield, + fn exec_batch_switch_witness( + &mut self, + ctx: &mut ApplyContext, + request: &AdminRequest, + ) -> Result<(AdminResponse, ApplyResult)> { + fail_point!( + "before_exec_batch_switch_witness", + self.id() == 2, + |_| unimplemented!() + ); + assert!(request.has_switch_witnesses()); + let switches = request + .get_switch_witnesses() + .get_switch_witnesses() + .to_vec(); + + info!( + "exec BatchSwitchWitness"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "epoch" => ?self.region.get_region_epoch(), + ); + + let mut region = self.region.clone(); + for s in switches.as_slice() { + PEER_ADMIN_CMD_COUNTER.batch_switch_witness.all.inc(); + let (peer_id, is_witness) = (s.get_peer_id(), s.get_is_witness()); + let mut peer_is_exist = false; + for p in region.mut_peers().iter_mut() { + if p.id == peer_id { + if p.is_witness == is_witness { + return Err(box_err!( + "switch peer {:?} on region {:?} is no-op", + p, + self.region + )); + } + p.is_witness = is_witness; + peer_is_exist = true; + break; + } + } + if !peer_is_exist { + 
return Err(box_err!( + "switch peer {} on region {:?} failed: peer does not exist", + peer_id, + self.region + )); + } + PEER_ADMIN_CMD_COUNTER.batch_switch_witness.success.inc(); + if self.id() == peer_id && !is_witness { + self.wait_data = true; + self.peer.is_witness = false; + } + } + let conf_ver = region.get_region_epoch().get_conf_ver() + switches.len() as u64; + region.mut_region_epoch().set_conf_ver(conf_ver); + info!( + "switch witness successfully"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "switches" => ?switches, + "original region" => ?&self.region, + "current region" => ?®ion, + ); + + let state = if self.pending_remove { + PeerState::Tombstone + } else if self.wait_data { + PeerState::Unavailable + } else { + PeerState::Normal + }; + + if let Err(e) = write_peer_state(ctx.kv_wb_mut(), ®ion, state, None) { + panic!("{} failed to update region state: {:?}", self.tag, e); + } + + let resp = AdminResponse::default(); + Ok(( + resp, + ApplyResult::Res(ExecResult::BatchSwitchWitness(SwitchWitness { + index: ctx.exec_log_index, + switches, + region, + })), + )) + } + + fn update_memory_trace(&mut self, event: &mut TraceEvent) { + let pending_cmds = self.pending_cmds.heap_size(); + let merge_yield = if let Some(ref mut state) = self.yield_state { + if state.heap_size.is_none() { + state.heap_size = Some(state.heap_size()); + } + state.heap_size.unwrap() + } else { + 0 + }; + + let task = ApplyMemoryTrace { + pending_cmds, + merge_yield, }; if let Some(e) = self.trace.reset(task) { *event = *event + e; @@ -2802,7 +3369,10 @@ pub fn is_conf_change_cmd(msg: &RaftCmdRequest) -> bool { req.has_change_peer() || req.has_change_peer_v2() } -fn check_sst_for_ingestion(sst: &SstMeta, region: &Region) -> Result<()> { +/// This function is used to check whether an sst is valid for ingestion. +/// +/// The `sst` must have epoch and range matched with `region`. 
+pub fn check_sst_for_ingestion(sst: &SstMeta, region: &Region) -> Result<()> { let uuid = sst.get_uuid(); if let Err(e) = UuidBuilder::from_slice(uuid) { return Err(box_err!("invalid uuid {:?}: {:?}", uuid, e)); @@ -2863,10 +3433,7 @@ pub fn compact_raft_log( Ok(()) } -pub struct Apply -where - S: Snapshot, -{ +pub struct Apply { pub peer_id: u64, pub region_id: u64, pub term: u64, @@ -2874,11 +3441,11 @@ where pub commit_term: u64, pub entries: SmallVec<[CachedEntries; 1]>, pub entries_size: usize, - pub cbs: Vec>, + pub cbs: Vec>, pub bucket_meta: Option>, } -impl Apply { +impl Apply { pub(crate) fn new( peer_id: u64, region_id: u64, @@ -2886,9 +3453,9 @@ impl Apply { commit_index: u64, commit_term: u64, entries: Vec, - cbs: Vec>, + cbs: Vec>, buckets: Option>, - ) -> Apply { + ) -> Apply { let mut entries_size = 0; for e in &entries { entries_size += bytes_capacity(&e.data) + bytes_capacity(&e.context); @@ -2908,23 +3475,19 @@ impl Apply { } pub fn on_schedule(&mut self, metrics: &RaftMetrics) { - let mut now = None; + let now = std::time::Instant::now(); for cb in &mut self.cbs { - if let Callback::Write { request_times, .. 
} = &mut cb.cb { - if now.is_none() { - now = Some(Instant::now()); - } - for t in request_times { - metrics - .store_time - .observe(duration_to_sec(now.unwrap().saturating_duration_since(*t))); - *t = now.unwrap(); - } + for tracker in cb.cb.write_trackers_mut() { + tracker.observe(now, &metrics.store_time, |t| { + t.metrics.write_instant = Some(now); + &mut t.metrics.store_time_nanos + }); + tracker.reset(now); } } } - fn try_batch(&mut self, other: &mut Apply) -> bool { + fn try_batch(&mut self, other: &mut Apply) -> bool { assert_eq!(self.region_id, other.region_id); assert_eq!(self.peer_id, other.peer_id); if self.entries_size + other.entries_size <= MAX_APPLY_BATCH_SIZE { @@ -2955,7 +3518,7 @@ pub struct Registration { pub id: u64, pub term: u64, pub apply_state: RaftApplyState, - pub applied_index_term: u64, + pub applied_term: u64, pub region: Region, pub pending_request_snapshot_count: Arc, pub is_merging: bool, @@ -2968,7 +3531,7 @@ impl Registration { id: peer.peer_id(), term: peer.term(), apply_state: peer.get_store().apply_state().clone(), - applied_index_term: peer.get_store().applied_index_term(), + applied_term: peer.get_store().applied_term(), region: peer.region().clone(), pending_request_snapshot_count: peer.pending_request_snapshot_count.clone(), is_merging: peer.pending_merge_state.is_some(), @@ -2978,28 +3541,41 @@ impl Registration { } #[derive(Debug)] -pub struct Proposal -where - S: Snapshot, -{ +pub struct Proposal { pub is_conf_change: bool, pub index: u64, pub term: u64, - pub cb: Callback, - /// `propose_time` is set to the last time when a peer starts to renew lease. + pub cb: C, + /// `propose_time` is set to the last time when a peer starts to renew + /// lease. 
pub propose_time: Option, pub must_pass_epoch_check: bool, + pub sent: bool, } -impl HeapSize for Proposal {} +impl Proposal { + pub fn new(index: u64, term: u64, cb: C) -> Self { + Self { + index, + term, + cb, + propose_time: None, + must_pass_epoch_check: false, + is_conf_change: false, + sent: false, + } + } +} + +impl HeapSize for Proposal {} pub struct Destroy { region_id: u64, merge_from_snapshot: bool, } -/// A message that asks the delegate to apply to the given logs and then reply to -/// target mailbox. +/// A message that asks the delegate to apply to the given logs and then reply +/// to target mailbox. #[derive(Default, Debug)] pub struct CatchUpLogs { /// The target region to be notified when given logs are applied. @@ -3054,7 +3630,7 @@ impl GenSnapTask { pub fn generate_and_schedule_snapshot( self, kv_snap: EK::Snapshot, - last_applied_index_term: u64, + last_applied_term: u64, last_applied_state: RaftApplyState, region_sched: &Scheduler>, ) -> Result<()> @@ -3067,7 +3643,7 @@ impl GenSnapTask { region_id: self.region_id, notifier: self.snap_notifier, for_balance: self.for_balance, - last_applied_index_term, + last_applied_term, last_applied_state, canceled: self.canceled, // This snapshot may be held for a long time, which may cause too many @@ -3089,14 +3665,14 @@ impl Debug for GenSnapTask { } #[derive(Debug)] -enum ObserverType { +pub enum ObserverType { Cdc(ObserveHandle), Rts(ObserveHandle), Pitr(ObserveHandle), } impl ObserverType { - fn handle(&self) -> &ObserveHandle { + pub fn handle(&self) -> &ObserveHandle { match self { ObserverType::Cdc(h) => h, ObserverType::Rts(h) => h, @@ -3107,8 +3683,8 @@ impl ObserverType { #[derive(Debug)] pub struct ChangeObserver { - ty: ObserverType, - region_id: u64, + pub ty: ObserverType, + pub region_id: u64, } impl ChangeObserver { @@ -3140,7 +3716,7 @@ where { Apply { start: Instant, - apply: Apply, + apply: Apply>, }, Registration(Registration), LogsUpToDate(CatchUpLogs), @@ -3155,13 +3731,47 @@ where 
#[cfg(any(test, feature = "testexport"))] #[allow(clippy::type_complexity)] Validate(u64, Box), + Recover(u64), + CheckCompact { + region_id: u64, + voter_replicated_index: u64, + voter_replicated_term: u64, + }, +} + +impl ResourceMetered for Msg { + fn consume_resource(&self, resource_ctl: &Arc) -> Option { + match self { + Msg::Apply { apply, .. } => { + let mut dominant_group = "".to_owned(); + let mut max_write_bytes = 0; + for cached_entries in &apply.entries { + cached_entries.iter_entries(|entry| { + let header = util::get_entry_header(entry); + let group_name = header.get_resource_group_name().to_owned(); + let write_bytes = entry.compute_size() as u64; + resource_ctl.consume( + group_name.as_bytes(), + ResourceConsumeType::IoBytes(write_bytes), + ); + if write_bytes > max_write_bytes { + dominant_group = group_name; + max_write_bytes = write_bytes; + } + }); + } + Some(dominant_group) + } + _ => None, + } + } } impl Msg where EK: KvEngine, { - pub fn apply(apply: Apply) -> Msg { + pub fn apply(apply: Apply>) -> Msg { Msg::Apply { start: Instant::now(), apply, @@ -3202,6 +3812,18 @@ where } => write!(f, "[region {}] change cmd", region_id), #[cfg(any(test, feature = "testexport"))] Msg::Validate(region_id, _) => write!(f, "[region {}] validate", region_id), + Msg::Recover(region_id) => write!(f, "recover [region {}] apply", region_id), + Msg::CheckCompact { + region_id, + voter_replicated_index, + voter_replicated_term, + } => { + write!( + f, + "[region {}] check compact, voter_replicated_index: {}, voter_replicated_term: {}", + region_id, voter_replicated_index, voter_replicated_term + ) + } } } } @@ -3225,10 +3847,11 @@ where { pub region_id: u64, pub apply_state: RaftApplyState, - pub applied_index_term: u64, + pub applied_term: u64, pub exec_res: VecDeque>, pub metrics: ApplyMetrics, pub bucket_stat: Option>, + pub write_seqno: Vec, } #[derive(Debug)] @@ -3280,7 +3903,8 @@ where ) } - /// Handles peer registration. 
When a peer is created, it will register an apply delegate. + /// Handles peer registration. When a peer is created, it will register an + /// apply delegate. fn handle_registration(&mut self, reg: Registration) { info!( "re-register to apply delegates"; @@ -3288,14 +3912,19 @@ where "peer_id" => self.delegate.id(), "term" => reg.term ); - assert_eq!(self.delegate.id, reg.id); + assert_eq!(self.delegate.id(), reg.id); self.delegate.term = reg.term; self.delegate.clear_all_commands_as_stale(); self.delegate = ApplyDelegate::from_registration(reg); } - /// Handles apply tasks, and uses the apply delegate to handle the committed entries. - fn handle_apply(&mut self, apply_ctx: &mut ApplyContext, mut apply: Apply) { + /// Handles apply tasks, and uses the apply delegate to handle the committed + /// entries. + fn handle_apply( + &mut self, + apply_ctx: &mut ApplyContext, + mut apply: Apply>, + ) { if apply_ctx.timer.is_none() { apply_ctx.timer = Some(Instant::now_coarse()); } @@ -3309,6 +3938,10 @@ where return; } + if self.delegate.wait_data { + return; + } + let mut entries = Vec::new(); let mut dangle_size = 0; @@ -3333,15 +3966,14 @@ where RAFT_ENTRIES_CACHES_GAUGE.sub(dangle_size as i64); } - self.delegate.metrics = ApplyMetrics::default(); self.delegate.term = apply.term; if let Some(meta) = apply.bucket_meta.clone() { - let buckets = self - .delegate - .buckets - .get_or_insert_with(BucketStat::default); - buckets.stats = new_bucket_stats(&meta); - buckets.meta = meta; + if let Some(old) = &mut self.delegate.buckets { + old.set_meta(meta); + } else { + let new = BucketStat::from_meta(meta); + self.delegate.buckets.replace(new); + } } let prev_state = ( @@ -3369,12 +4001,12 @@ where } /// Handles proposals, and appends the commands to the apply delegate. 
- fn append_proposal(&mut self, props_drainer: Drain<'_, Proposal>) { + fn append_proposal(&mut self, props_drainer: Drain<'_, Proposal>>) { let (region_id, peer_id) = (self.delegate.region_id(), self.delegate.id()); let propose_num = props_drainer.len(); if self.delegate.stopped { for p in props_drainer { - let cmd = PendingCmd::::new(p.index, p.term, p.cb); + let cmd = PendingCmd::new(p.index, p.term, p.cb); notify_stale_command(region_id, peer_id, self.delegate.term, cmd); } return; @@ -3417,7 +4049,8 @@ where self.delegate.destroy(ctx); } - /// Handles peer destroy. When a peer is destroyed, the corresponding apply delegate should be removed too. + /// Handles peer destroy. When a peer is destroyed, the corresponding apply + /// delegate should be removed too. fn handle_destroy(&mut self, ctx: &mut ApplyContext, d: Destroy) { assert_eq!(d.region_id, self.delegate.region_id()); if d.merge_from_snapshot { @@ -3430,7 +4063,7 @@ where PeerMsg::ApplyRes { res: TaskRes::Destroy { region_id: self.delegate.region_id(), - peer_id: self.delegate.id, + peer_id: self.delegate.id(), merge_from_snapshot: d.merge_from_snapshot, }, }, @@ -3488,8 +4121,9 @@ where "region_id" => region_id, "peer_id" => self.delegate.id(), ); - // The source peer fsm will be destroyed when the target peer executes `on_ready_commit_merge` - // and sends `merge result` to the source peer fsm. + // The source peer fsm will be destroyed when the target peer executes + // `on_ready_commit_merge` and sends `merge result` to the source peer + // fsm. self.destroy(ctx); catch_up_logs .logs_up_to_date @@ -3506,26 +4140,33 @@ where } } - #[allow(unused_mut, clippy::redundant_closure_call)] fn handle_snapshot(&mut self, apply_ctx: &mut ApplyContext, snap_task: GenSnapTask) { if self.delegate.pending_remove || self.delegate.stopped { return; } + if self.delegate.peer.is_witness || self.delegate.wait_data { + // witness or non-witness hasn't finish applying snapshot shouldn't generate + // snapshot. 
+ return; + } let applied_index = self.delegate.apply_state.get_applied_index(); - let mut need_sync = apply_ctx + let need_sync = apply_ctx .apply_res .iter() .any(|res| res.region_id == self.delegate.region_id()) && self.delegate.last_flush_applied_index != applied_index; - (|| fail_point!("apply_on_handle_snapshot_sync", |_| { need_sync = true }))(); - if need_sync { + let force_sync_fp = || { + fail_point!("apply_on_handle_snapshot_sync", |_| true); + false + }; + if need_sync || force_sync_fp() { if apply_ctx.timer.is_none() { apply_ctx.timer = Some(Instant::now_coarse()); } - self.delegate.write_apply_state(apply_ctx.kv_wb_mut()); + self.delegate.maybe_write_apply_state(apply_ctx); fail_point!( "apply_on_handle_snapshot_1_1", - self.delegate.id == 1 && self.delegate.region_id() == 1, + self.delegate.id() == 1 && self.delegate.region_id() == 1, |_| unimplemented!() ); @@ -3535,7 +4176,7 @@ where if let Err(e) = snap_task.generate_and_schedule_snapshot::( apply_ctx.engine.snapshot(), - self.delegate.applied_index_term, + self.delegate.applied_term, self.delegate.apply_state.clone(), &apply_ctx.region_scheduler, ) { @@ -3551,7 +4192,7 @@ where .fetch_sub(1, Ordering::SeqCst); fail_point!( "apply_on_handle_snapshot_finish_1_1", - self.delegate.id == 1 && self.delegate.region_id() == 1, + self.delegate.id() == 1 && self.delegate.region_id() == 1, |_| unimplemented!() ); } @@ -3593,12 +4234,13 @@ where let resp = match compare_region_epoch( ®ion_epoch, &self.delegate.region, - false, /* check_conf_ver */ - true, /* check_ver */ - true, /* include_region */ + false, // check_conf_ver + true, // check_ver + true, // include_region ) { Ok(()) => { - // Commit the writebatch for ensuring the following snapshot can get all previous writes. + // Commit the writebatch for ensuring the following snapshot can get all + // previous writes. 
if apply_ctx.kv_wb().count() > 0 { apply_ctx.commit(&mut self.delegate); } @@ -3636,6 +4278,45 @@ where cb.invoke_read(resp); } + fn check_pending_compact_log( + &mut self, + ctx: &mut ApplyContext, + voter_replicated_index: u64, + voter_replicated_term: u64, + ) { + if self.delegate.pending_remove || self.delegate.stopped { + return; + } + + let res = self + .delegate + .try_compact_log(voter_replicated_index, voter_replicated_term); + match res { + Ok((should_write, res)) => { + if let Some(res) = res { + if ctx.timer.is_none() { + ctx.timer = Some(Instant::now_coarse()); + } + ctx.prepare_for(&mut self.delegate); + let mut result = VecDeque::new(); + // If modified `truncated_state` in `try_compact_log`, the apply state should be + // persisted. + if should_write { + self.delegate.write_apply_state(ctx.kv_wb_mut()); + ctx.commit_opt(&mut self.delegate, true); + } + result.push_back(res); + ctx.finish_for(&mut self.delegate, result); + } + } + Err(e) => error!(?e; + "failed to compact log"; + "region_id" => self.delegate.region.get_id(), + "peer_id" => self.delegate.id(), + ), + } + } + fn handle_tasks(&mut self, apply_ctx: &mut ApplyContext, msgs: &mut Vec>) { let mut drainer = msgs.drain(..); let mut batch_apply = None; @@ -3666,9 +4347,18 @@ where match msg { Msg::Apply { start, mut apply } => { - apply_ctx - .apply_wait - .observe(start.saturating_elapsed_secs()); + let apply_wait = start.saturating_elapsed(); + apply_ctx.apply_wait.observe(apply_wait.as_secs_f64()); + for tracker in apply + .cbs + .iter() + .flat_map(|p| p.cb.write_trackers()) + .flat_map(|ts| ts.as_tracker_token()) + { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { + t.metrics.apply_wait_nanos = apply_wait.as_nanos() as u64; + }); + } if let Some(batch) = batch_apply.as_mut() { if batch.try_batch(&mut apply) { @@ -3682,8 +4372,11 @@ where } } } - batch_apply = Some(apply); + if !self.delegate.wait_data { + batch_apply = Some(apply); + } } + Msg::Recover(..) 
=> self.delegate.wait_data = false, Msg::Registration(reg) => self.handle_registration(reg), Msg::Destroy(d) => self.handle_destroy(apply_ctx, d), Msg::LogsUpToDate(cul) => self.logs_up_to_date_for_merge(apply_ctx, cul), @@ -3696,9 +4389,20 @@ where } => self.handle_change(apply_ctx, cmd, region_epoch, cb), #[cfg(any(test, feature = "testexport"))] Msg::Validate(_, f) => { - let delegate: *const u8 = unsafe { mem::transmute(&self.delegate) }; + let delegate = &self.delegate as *const ApplyDelegate as *const u8; f(delegate) } + Msg::CheckCompact { + voter_replicated_index, + voter_replicated_term, + .. + } => { + self.check_pending_compact_log( + apply_ctx, + voter_replicated_index, + voter_replicated_term, + ); + } } } } @@ -3760,13 +4464,15 @@ pub enum ControlMsg { }, } +impl ResourceMetered for ControlMsg {} + pub struct ControlFsm { receiver: Receiver, stopped: bool, } impl ControlFsm { - fn new() -> (LooseBoundedSender, Box) { + pub fn new() -> (LooseBoundedSender, Box) { let (tx, rx) = loose_bounded(std::usize::MAX); let fsm = Box::new(ControlFsm { stopped: false, @@ -3774,6 +4480,28 @@ impl ControlFsm { }); (tx, fsm) } + + pub fn handle_messages(&mut self, pending_latency_inspect: &mut Vec) { + // Usually there will be only 1 control message. 
+ loop { + match self.receiver.try_recv() { + Ok(ControlMsg::LatencyInspect { + send_time, + mut inspector, + }) => { + inspector.record_apply_wait(send_time.saturating_elapsed()); + pending_latency_inspect.push(inspector); + } + Err(TryRecvError::Empty) => { + return; + } + Err(TryRecvError::Disconnected) => { + self.stopped = true; + return; + } + } + } + } } impl Fsm for ControlFsm { @@ -3817,33 +4545,17 @@ where } _ => {} } + self.apply_ctx.yield_msg_size = incoming.apply_yield_write_size.0; update_cfg(&incoming.apply_batch_system); } - self.apply_ctx.perf_context.start_observe(); } fn handle_control(&mut self, control: &mut ControlFsm) -> Option { - loop { - match control.receiver.try_recv() { - Ok(ControlMsg::LatencyInspect { - send_time, - mut inspector, - }) => { - if self.apply_ctx.timer.is_none() { - self.apply_ctx.timer = Some(Instant::now_coarse()); - } - inspector.record_apply_wait(send_time.saturating_elapsed()); - self.apply_ctx.pending_latency_inspect.push(inspector); - } - Err(TryRecvError::Empty) => { - return Some(0); - } - Err(TryRecvError::Disconnected) => { - control.stopped = true; - return Some(0); - } - } + control.handle_messages(&mut self.apply_ctx.pending_latency_inspect); + if !self.apply_ctx.pending_latency_inspect.is_empty() && self.apply_ctx.timer.is_none() { + self.apply_ctx.timer = Some(Instant::now_coarse()); } + Some(0) } fn handle_normal(&mut self, normal: &mut impl DerefMut>) -> HandleResult { @@ -4058,7 +4770,7 @@ where // So only shutdown needs to be checked here. if !tikv_util::thread_group::is_shutdown(!cfg!(test)) { for p in apply.cbs.drain(..) { - let cmd = PendingCmd::::new(p.index, p.term, p.cb); + let cmd = PendingCmd::new(p.index, p.term, p.cb); notify_region_removed(apply.region_id, apply.peer_id, cmd); } } @@ -4103,6 +4815,16 @@ where } #[cfg(any(test, feature = "testexport"))] Msg::Validate(..) 
=> return, + Msg::Recover(region_id) => { + info!("recover apply"; + "region_id" => region_id); + return; + } + Msg::CheckCompact { region_id, .. } => { + info!("target region is not found"; + "region_id" => region_id); + return; + } }, Either::Left(Err(TrySendError::Full(_))) => unreachable!(), }; @@ -4172,10 +4894,15 @@ impl ApplyBatchSystem { pub fn create_apply_batch_system( cfg: &Config, + resource_ctl: Option>, ) -> (ApplyRouter, ApplyBatchSystem) { let (control_tx, control_fsm) = ControlFsm::new(); - let (router, system) = - batch_system::create_system(&cfg.apply_batch_system, control_tx, control_fsm); + let (router, system) = batch_system::create_system( + &cfg.apply_batch_system, + control_tx, + control_fsm, + resource_ctl, + ); (ApplyRouter { router }, ApplyBatchSystem { system }) } @@ -4190,14 +4917,11 @@ mod memtrace { pub merge_yield: usize, } - impl HeapSize for PendingCmdQueue - where - S: Snapshot, - { + impl HeapSize for PendingCmdQueue { fn heap_size(&self) -> usize { - // Some fields of `PendingCmd` are on stack, but ignore them because they are just - // some small boxed closures. - self.normals.capacity() * mem::size_of::>() + // Some fields of `PendingCmd` are on stack, but ignore them because they are + // just some small boxed closures. + self.normals.capacity() * mem::size_of::>() } } @@ -4238,6 +4962,8 @@ mod memtrace { | Msg::Change { .. } => 0, #[cfg(any(test, feature = "testexport"))] Msg::Validate(..) => 0, + Msg::Recover(..) => 0, + Msg::CheckCompact { .. 
} => 0, } } } @@ -4263,19 +4989,26 @@ mod tests { time::*, }; + use bytes::Bytes; use engine_panic::PanicEngine; use engine_test::kv::{new_engine, KvTestEngine, KvTestSnapshot}; - use engine_traits::{Peekable as PeekableTrait, WriteBatchExt}; + use engine_traits::{Peekable as PeekableTrait, SyncMutable, WriteBatchExt}; use kvproto::{ kvrpcpb::ApiVersion, metapb::{self, RegionEpoch}, raft_cmdpb::*, }; use protobuf::Message; + use raft::eraftpb::{ConfChange, ConfChangeV2}; use sst_importer::Config as ImportConfig; use tempfile::{Builder, TempDir}; use test_sst_importer::*; - use tikv_util::{config::VersionTrack, worker::dummy_scheduler}; + use tikv_util::{ + config::{ReadableSize, VersionTrack}, + store::{new_learner_peer, new_peer}, + worker::dummy_scheduler, + }; + use txn_types::WriteBatchFlags; use uuid::Uuid; use super::*; @@ -4284,7 +5017,7 @@ mod tests { store::{ msg::WriteResponse, peer_storage::RAFT_INIT_LOG_INDEX, - util::{new_learner_peer, new_peer}, + simple_write::{SimpleWriteEncoder, SimpleWriteReqEncoder}, Config, RegionTask, }, }; @@ -4299,13 +5032,7 @@ mod tests { pub fn create_tmp_engine(path: &str) -> (TempDir, KvTestEngine) { let path = Builder::new().prefix(path).tempdir().unwrap(); - let engine = new_engine( - path.path().join("db").to_str().unwrap(), - None, - ALL_CFS, - None, - ) - .unwrap(); + let engine = new_engine(path.path().join("db").to_str().unwrap(), ALL_CFS).unwrap(); (path, engine) } @@ -4327,6 +5054,7 @@ mod tests { cmd.mut_put().set_key(b"key".to_vec()); cmd.mut_put().set_value(b"value".to_vec()); let mut req = RaftCmdRequest::default(); + req.set_header(RaftRequestHeader::default()); req.mut_requests().push(cmd); e.set_data(req.write_to_bytes().unwrap().into()) } @@ -4359,7 +5087,7 @@ mod tests { id: Default::default(), term: Default::default(), apply_state: Default::default(), - applied_index_term: Default::default(), + applied_term: Default::default(), region: Default::default(), pending_request_snapshot_count: 
Default::default(), is_merging: Default::default(), @@ -4374,7 +5102,7 @@ mod tests { id: self.id, term: self.term, apply_state: self.apply_state.clone(), - applied_index_term: self.applied_index_term, + applied_term: self.applied_term, region: self.region.clone(), pending_request_snapshot_count: self.pending_request_snapshot_count.clone(), is_merging: self.is_merging, @@ -4383,6 +5111,42 @@ mod tests { } } + #[test] + fn test_can_witness_skip() { + let mut entry = Entry::new(); + let mut req = RaftCmdRequest::default(); + entry.set_entry_type(EntryType::EntryNormal); + let data = req.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(can_witness_skip(&entry)); + + req.mut_admin_request() + .set_cmd_type(AdminCmdType::CompactLog); + let data = req.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(!can_witness_skip(&entry)); + + let mut req = RaftCmdRequest::default(); + let mut request = Request::default(); + request.set_cmd_type(CmdType::Put); + req.set_requests(vec![request].into()); + let data = req.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(can_witness_skip(&entry)); + + entry.set_entry_type(EntryType::EntryConfChange); + let conf_change = ConfChange::new(); + let data = conf_change.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(!can_witness_skip(&entry)); + + entry.set_entry_type(EntryType::EntryConfChangeV2); + let conf_change_v2 = ConfChangeV2::new(); + let data = conf_change_v2.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(!can_witness_skip(&entry)); + } + #[test] fn test_should_sync_log() { // Admin command @@ -4515,7 +5279,7 @@ mod tests { index: u64, term: u64, cb: Callback, - ) -> Proposal { + ) -> Proposal> { Proposal { is_conf_change, index, @@ -4523,16 +5287,17 @@ mod tests { cb, propose_time: None, must_pass_epoch_check: false, + sent: true, } } - fn apply( + 
fn apply( peer_id: u64, region_id: u64, term: u64, entries: Vec, - cbs: Vec>, - ) -> Apply { + cbs: Vec>, + ) -> Apply { let (commit_index, commit_term) = entries .last() .map(|e| (e.get_index(), e.get_term())) @@ -4557,7 +5322,7 @@ mod tests { let (_dir, importer) = create_tmp_importer("apply-basic"); let (region_scheduler, mut snapshot_rx) = dummy_scheduler(); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -4576,20 +5341,24 @@ mod tests { let mut reg = Registration { id: 1, term: 4, - applied_index_term: 5, + applied_term: 5, ..Default::default() }; reg.region.set_id(2); + let mut peer = metapb::Peer::default(); + peer.set_id(1); + reg.region.mut_peers().push(peer.clone()); reg.apply_state.set_applied_index(3); router.schedule_task(2, Msg::Registration(reg.dup())); validate(&router, 2, move |delegate| { - assert_eq!(delegate.id, 1); + assert_eq!(delegate.id(), 1); + assert_eq!(delegate.peer, peer); assert_eq!(delegate.tag, "[region 2] 1"); assert_eq!(delegate.region, reg.region); assert!(!delegate.pending_remove); assert_eq!(delegate.apply_state, reg.apply_state); assert_eq!(delegate.term, reg.term); - assert_eq!(delegate.applied_index_term, reg.applied_index_term); + assert_eq!(delegate.applied_term, reg.applied_term); }); let (resp_tx, resp_rx) = mpsc::channel(); @@ -4608,7 +5377,7 @@ mod tests { // unregistered region should be ignored and notify failed. 
let resp = resp_rx.recv_timeout(Duration::from_secs(3)).unwrap(); assert!(resp.get_header().get_error().has_region_not_found()); - assert!(rx.try_recv().is_err()); + rx.try_recv().unwrap_err(); let (cc_tx, cc_rx) = mpsc::channel(); let pops = vec![ @@ -4632,7 +5401,7 @@ mod tests { }); let cc_resp = cc_rx.try_recv().unwrap(); assert!(cc_resp.get_header().get_error().has_stale_command()); - assert!(rx.recv_timeout(Duration::from_secs(3)).is_ok()); + rx.recv_timeout(Duration::from_secs(3)).unwrap(); // Make sure Apply and Snapshot are in the same batch. let (snap_tx, _) = mpsc::sync_channel(0); @@ -4663,12 +5432,13 @@ mod tests { assert_eq!(apply_res.apply_state, apply_state); assert_eq!(apply_res.apply_state.get_applied_index(), 5); assert!(apply_res.exec_res.is_empty()); - // empty entry will make applied_index step forward and should write apply state to engine. + // empty entry will make applied_index step forward and should write apply state + // to engine. assert_eq!(apply_res.metrics.written_keys, 1); - assert_eq!(apply_res.applied_index_term, 5); + assert_eq!(apply_res.applied_term, 5); validate(&router, 2, |delegate| { assert_eq!(delegate.term, 11); - assert_eq!(delegate.applied_index_term, 5); + assert_eq!(delegate.applied_term, 5); assert_eq!(delegate.apply_state.get_applied_index(), 5); assert_eq!( delegate.apply_state.get_applied_index(), @@ -4710,12 +5480,12 @@ mod tests { "{:?}", resp ); - assert!(rx.try_recv().is_err()); + rx.try_recv().unwrap_err(); system.shutdown(); } - fn cb(idx: u64, term: u64, tx: Sender) -> Proposal { + fn cb(idx: u64, term: u64, tx: Sender) -> Proposal> { proposal( false, idx, @@ -4828,6 +5598,31 @@ mod tests { self } + fn prepare_merge(mut self, target: metapb::Region) -> EntryBuilder { + let mut request = AdminRequest::default(); + request.set_cmd_type(AdminCmdType::PrepareMerge); + request.mut_prepare_merge().set_target(target); + self.req.set_admin_request(request); + self + } + + fn compact_log(mut self, index: u64, term: 
u64) -> EntryBuilder { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::CompactLog); + req.mut_compact_log().set_compact_index(index); + req.mut_compact_log().set_compact_term(term); + self.req.set_admin_request(req); + self + } + + fn compute_hash(mut self, context: Vec) -> EntryBuilder { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::ComputeHash); + req.mut_compute_hash().set_context(context); + self.req.set_admin_request(req); + self + } + fn build(mut self) -> Entry { self.entry .set_data(self.req.write_to_bytes().unwrap().into()); @@ -4835,11 +5630,105 @@ mod tests { } } + struct EntryBuilderUsingSimpleWrite { + entry: Entry, + header: Box, + encoder: SimpleWriteEncoder, + } + + impl EntryBuilderUsingSimpleWrite { + fn new(index: u64, term: u64) -> EntryBuilderUsingSimpleWrite { + let encoder = SimpleWriteEncoder::with_capacity(64); + let header = Box::::default(); + let mut entry = Entry::default(); + entry.set_index(index); + entry.set_term(term); + EntryBuilderUsingSimpleWrite { + entry, + header, + encoder, + } + } + + fn epoch(mut self, conf_ver: u64, version: u64) -> EntryBuilderUsingSimpleWrite { + let mut epoch = RegionEpoch::default(); + epoch.set_version(version); + epoch.set_conf_ver(conf_ver); + self.header.set_region_epoch(epoch); + self + } + + fn put(mut self, key: &[u8], value: &[u8]) -> EntryBuilderUsingSimpleWrite { + self.encoder.put(CF_DEFAULT, key, value); + self + } + + fn put_cf(mut self, cf: &str, key: &[u8], value: &[u8]) -> EntryBuilderUsingSimpleWrite { + self.encoder.put(cf, key, value); + self + } + + fn delete(mut self, key: &[u8]) -> EntryBuilderUsingSimpleWrite { + self.encoder.delete(CF_DEFAULT, key); + self + } + + fn delete_cf(mut self, cf: &str, key: &[u8]) -> EntryBuilderUsingSimpleWrite { + self.encoder.delete(cf, key); + self + } + + fn delete_range( + mut self, + start_key: &[u8], + end_key: &[u8], + ) -> EntryBuilderUsingSimpleWrite { + self.encoder + 
.delete_range(CF_DEFAULT, start_key, end_key, false); + self + } + + fn delete_range_cf( + mut self, + cf: &str, + start_key: &[u8], + end_key: &[u8], + ) -> EntryBuilderUsingSimpleWrite { + self.encoder.delete_range(cf, start_key, end_key, false); + self + } + + fn ingest_sst(mut self, meta: &SstMeta) -> EntryBuilderUsingSimpleWrite { + self.encoder.ingest(vec![meta.clone()]); + self + } + + fn build(mut self) -> Entry { + let bin = self.encoder.encode(); + let req_encoder = SimpleWriteReqEncoder::>::new( + self.header.clone(), + bin, + 1000, + false, + ); + let (bytes, _) = req_encoder.encode(); + self.entry.set_data(bytes.into()); + self.entry + } + } + #[derive(Clone, Default)] struct ApplyObserver { pre_query_count: Arc, post_query_count: Arc, cmd_sink: Option>>>, + filter_compact_log: Arc, + filter_consistency_check: Arc, + skip_persist_when_pre_commit: Arc, + delay_remove_ssts: Arc, + last_delete_sst_count: Arc, + last_pending_delete_sst_count: Arc, + last_pending_handle_sst_count: Arc, } impl Coprocessor for ApplyObserver {} @@ -4852,6 +5741,89 @@ mod tests { fn post_apply_query(&self, _: &mut ObserverContext<'_>, _: &Cmd) { self.post_query_count.fetch_add(1, Ordering::SeqCst); } + + fn post_exec_query( + &self, + _: &mut ObserverContext<'_>, + _: &Cmd, + _: &RaftApplyState, + _: &RegionState, + apply_info: &mut ApplyCtxInfo<'_>, + ) -> bool { + match apply_info.pending_handle_ssts { + Some(v) => { + // If it is a ingest sst + let mut ssts = std::mem::take(v); + assert_ne!(ssts.len(), 0); + if self.delay_remove_ssts.load(Ordering::SeqCst) { + apply_info.pending_delete_ssts.append(&mut ssts); + } else { + apply_info.delete_ssts.append(&mut ssts); + } + } + None => (), + } + self.last_delete_sst_count + .store(apply_info.delete_ssts.len() as u64, Ordering::SeqCst); + self.last_pending_delete_sst_count.store( + apply_info.pending_delete_ssts.len() as u64, + Ordering::SeqCst, + ); + self.last_pending_handle_sst_count.store( + match 
apply_info.pending_handle_ssts { + Some(ref v) => v.len() as u64, + None => 0, + }, + Ordering::SeqCst, + ); + false + } + } + + impl AdminObserver for ApplyObserver { + fn post_exec_admin( + &self, + _: &mut ObserverContext<'_>, + cmd: &Cmd, + _: &RaftApplyState, + region_state: &RegionState, + _: &mut ApplyCtxInfo<'_>, + ) -> bool { + let request = cmd.request.get_admin_request(); + match request.get_cmd_type() { + AdminCmdType::CompactLog => true, + AdminCmdType::CommitMerge + | AdminCmdType::PrepareMerge + | AdminCmdType::RollbackMerge => { + assert!(region_state.modified_region.is_some()); + true + } + AdminCmdType::BatchSplit => true, + AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => true, + _ => false, + } + } + + fn pre_exec_admin( + &self, + _: &mut ObserverContext<'_>, + req: &AdminRequest, + _: u64, + _: u64, + ) -> bool { + let cmd_type = req.get_cmd_type(); + if cmd_type == AdminCmdType::CompactLog + && self.filter_compact_log.deref().load(Ordering::SeqCst) + { + return true; + }; + if (cmd_type == AdminCmdType::ComputeHash || cmd_type == AdminCmdType::VerifyHash) + && self.filter_consistency_check.deref().load(Ordering::SeqCst) + { + return true; + }; + false + } } impl CmdObserver for ApplyObserver @@ -4877,6 +5849,17 @@ mod tests { fn on_applied_current_term(&self, _: raft::StateRole, _: &Region) {} } + impl RegionChangeObserver for ApplyObserver { + fn pre_persist( + &self, + _: &mut ObserverContext<'_>, + _is_finished: bool, + _cmd: Option<&RaftCmdRequest>, + ) -> bool { + !self.skip_persist_when_pre_commit.load(Ordering::SeqCst) + } + } + #[test] fn test_handle_raft_committed_entries() { let (_path, engine) = create_tmp_engine("test-delegate"); @@ -4890,7 +5873,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = Box::new(TestNotifier { tx }); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = 
create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -4944,7 +5927,7 @@ mod tests { assert_eq!(engine.get_value(&dk_k2).unwrap().unwrap(), b"v1"); assert_eq!(engine.get_value(&dk_k3).unwrap().unwrap(), b"v1"); validate(&router, 1, |delegate| { - assert_eq!(delegate.applied_index_term, 1); + assert_eq!(delegate.applied_term, 1); assert_eq!(delegate.apply_state.get_applied_index(), 1); }); fetch_apply_res(&rx); @@ -4957,7 +5940,7 @@ mod tests { let apply_res = fetch_apply_res(&rx); assert_eq!(apply_res.region_id, 1); assert_eq!(apply_res.apply_state.get_applied_index(), 2); - assert_eq!(apply_res.applied_index_term, 2); + assert_eq!(apply_res.applied_term, 2); assert!(apply_res.exec_res.is_empty()); assert!(apply_res.metrics.written_bytes >= 5); assert_eq!(apply_res.metrics.written_keys, 2); @@ -4985,7 +5968,7 @@ mod tests { let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); assert!(resp.get_header().get_error().has_epoch_not_match()); let apply_res = fetch_apply_res(&rx); - assert_eq!(apply_res.applied_index_term, 2); + assert_eq!(apply_res.applied_term, 2); assert_eq!(apply_res.apply_state.get_applied_index(), 3); let put_entry = EntryBuilder::new(4, 2) @@ -5006,7 +5989,7 @@ mod tests { let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); assert!(resp.get_header().get_error().has_key_not_in_region()); let apply_res = fetch_apply_res(&rx); - assert_eq!(apply_res.applied_index_term, 2); + assert_eq!(apply_res.applied_term, 2); assert_eq!(apply_res.apply_state.get_applied_index(), 4); // a writebatch should be atomic. assert_eq!(engine.get_value(&dk_k3).unwrap().unwrap(), b"v1"); @@ -5100,7 +6083,7 @@ mod tests { assert!(apply_res.exec_res.is_empty()); // The entry should be applied now. 
let apply_res = fetch_apply_res(&rx); - assert_eq!(apply_res.applied_index_term, 3); + assert_eq!(apply_res.applied_term, 3); assert_eq!(apply_res.apply_state.get_applied_index(), 8); // UploadSST @@ -5177,15 +6160,360 @@ mod tests { // The region was rescheduled low-priority becasuee of ingest command, // only put entry has been applied; let apply_res = fetch_apply_res(&rx); - assert_eq!(apply_res.applied_index_term, 3); + assert_eq!(apply_res.applied_term, 3); + assert_eq!(apply_res.apply_state.get_applied_index(), 9); + // The region will yield after timeout. + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.applied_term, 3); + assert_eq!(apply_res.apply_state.get_applied_index(), 10); + // The third entry should be applied now. + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.applied_term, 3); + assert_eq!(apply_res.apply_state.get_applied_index(), 11); + + let write_batch_max_keys = ::WRITE_BATCH_MAX_KEYS; + + let mut props = vec![]; + let mut entries = vec![]; + for i in 0..write_batch_max_keys { + let put_entry = EntryBuilder::new(i as u64 + 12, 3) + .put(b"k", b"v") + .epoch(1, 3) + .build(); + entries.push(put_entry); + props.push(cb(i as u64 + 12, 3, capture_tx.clone())); + } + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 3, entries, props))); + for _ in 0..write_batch_max_keys { + capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + } + let index = write_batch_max_keys + 11; + // The region was rescheduled to normal-priority handler. Discard the first + // apply_res. 
+ fetch_apply_res(&rx); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.apply_state.get_applied_index(), index as u64); + assert_eq!(obs.pre_query_count.load(Ordering::SeqCst), index); + assert_eq!(obs.post_query_count.load(Ordering::SeqCst), index); + + system.shutdown(); + } + + #[test] + fn test_handle_raft_committed_entries_from_v2() { + let (_path, engine) = create_tmp_engine("test-delegate"); + let (import_dir, importer) = create_tmp_importer("test-delegate"); + let obs = ApplyObserver::default(); + let mut host = CoprocessorHost::::default(); + host.registry + .register_query_observer(1, BoxQueryObserver::new(obs.clone())); + + let (tx, rx) = mpsc::channel(); + let (region_scheduler, _) = dummy_scheduler(); + let sender = Box::new(TestNotifier { tx }); + let mut config = Config::default(); + config.enable_v2_compatible_learner = true; + let cfg = Arc::new(VersionTrack::new(config)); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); + let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); + let builder = super::Builder:: { + tag: "test-store".to_owned(), + cfg, + sender, + region_scheduler, + coprocessor_host: host, + importer: importer.clone(), + engine: engine.clone(), + router: router.clone(), + store_id: 1, + pending_create_peers, + }; + system.spawn("test-handle-raft".to_owned(), builder); + + let peer_id = 3; + let mut reg = Registration { + id: peer_id, + ..Default::default() + }; + reg.region.set_id(1); + reg.region.mut_peers().push(new_peer(2, 3)); + reg.region.set_end_key(b"k5".to_vec()); + reg.region.mut_region_epoch().set_conf_ver(1); + reg.region.mut_region_epoch().set_version(3); + router.schedule_task(1, Msg::Registration(reg)); + + let (capture_tx, capture_rx) = mpsc::channel(); + let put_entry = EntryBuilderUsingSimpleWrite::new(1, 1) + .put(b"k1", b"v1") + .put(b"k2", b"v1") + .put(b"k3", b"v1") + .epoch(1, 3) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 
1, + vec![put_entry], + vec![cb(1, 1, capture_tx.clone())], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let dk_k1 = keys::data_key(b"k1"); + let dk_k2 = keys::data_key(b"k2"); + let dk_k3 = keys::data_key(b"k3"); + assert_eq!(engine.get_value(&dk_k1).unwrap().unwrap(), b"v1"); + assert_eq!(engine.get_value(&dk_k2).unwrap().unwrap(), b"v1"); + assert_eq!(engine.get_value(&dk_k3).unwrap().unwrap(), b"v1"); + validate(&router, 1, |delegate| { + assert_eq!(delegate.applied_term, 1); + assert_eq!(delegate.apply_state.get_applied_index(), 1); + }); + fetch_apply_res(&rx); + + let put_entry = EntryBuilderUsingSimpleWrite::new(2, 2) + .put_cf(CF_LOCK, b"k1", b"v1") + .epoch(1, 3) + .build(); + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 2, vec![put_entry], vec![]))); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.region_id, 1); + assert_eq!(apply_res.apply_state.get_applied_index(), 2); + assert_eq!(apply_res.applied_term, 2); + assert!(apply_res.exec_res.is_empty()); + assert!(apply_res.metrics.written_bytes >= 5); + assert_eq!(apply_res.metrics.written_keys, 2); + assert_eq!(apply_res.metrics.size_diff_hint, 5); + assert_eq!(apply_res.metrics.lock_cf_written_bytes, 5); + assert_eq!( + engine.get_value_cf(CF_LOCK, &dk_k1).unwrap().unwrap(), + b"v1" + ); + + let put_entry = EntryBuilderUsingSimpleWrite::new(3, 2) + .put(b"k2", b"v2") + .epoch(1, 1) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 2, + vec![put_entry], + vec![cb(3, 2, capture_tx.clone())], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(resp.get_header().get_error().has_epoch_not_match()); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.applied_term, 2); + assert_eq!(apply_res.apply_state.get_applied_index(), 3); + + let put_entry = EntryBuilderUsingSimpleWrite::new(4, 2) + .put(b"k3", b"v3") + 
.put(b"k5", b"v5") + .epoch(1, 3) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 2, + vec![put_entry], + vec![cb(4, 2, capture_tx.clone())], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(resp.get_header().get_error().has_key_not_in_region()); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.applied_term, 2); + assert_eq!(apply_res.apply_state.get_applied_index(), 4); + // a writebatch should be atomic. + assert_eq!(engine.get_value(&dk_k3).unwrap().unwrap(), b"v1"); + + let put_entry = EntryBuilderUsingSimpleWrite::new(5, 3) + .delete(b"k1") + .delete_cf(CF_LOCK, b"k1") + .delete_cf(CF_WRITE, b"k1") + .epoch(1, 3) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 3, + vec![put_entry], + vec![cb(5, 2, capture_tx.clone()), cb(5, 3, capture_tx.clone())], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + // stale command should be cleared. 
+ assert!(resp.get_header().get_error().has_stale_command()); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + assert!(engine.get_value(&dk_k1).unwrap().is_none()); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.metrics.lock_cf_written_bytes, 3); + assert_eq!(apply_res.metrics.delete_keys_hint, 2); + assert_eq!(apply_res.metrics.size_diff_hint, -9); + + let delete_entry = EntryBuilderUsingSimpleWrite::new(6, 3) + .delete(b"k5") + .epoch(1, 3) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 3, + vec![delete_entry], + vec![cb(6, 3, capture_tx.clone())], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(resp.get_header().get_error().has_key_not_in_region()); + fetch_apply_res(&rx); + + let delete_range_entry = EntryBuilderUsingSimpleWrite::new(7, 3) + .delete_range(b"", b"") + .epoch(1, 3) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 3, + vec![delete_range_entry], + vec![cb(7, 3, capture_tx.clone())], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(resp.get_header().get_error().has_key_not_in_region()); + assert_eq!(engine.get_value(&dk_k3).unwrap().unwrap(), b"v1"); + fetch_apply_res(&rx); + + let delete_range_entry = EntryBuilderUsingSimpleWrite::new(8, 3) + .delete_range_cf(CF_DEFAULT, b"", b"k5") + .delete_range_cf(CF_LOCK, b"", b"k5") + .delete_range_cf(CF_WRITE, b"", b"k5") + .epoch(1, 3) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 3, + vec![delete_range_entry], + vec![cb(8, 3, capture_tx.clone())], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + assert!(engine.get_value(&dk_k1).unwrap().is_none()); + assert!(engine.get_value(&dk_k2).unwrap().is_none()); + 
assert!(engine.get_value(&dk_k3).unwrap().is_none()); + + // The region was rescheduled from normal-priority handler to + // low-priority handler, so the first apply_res.exec_res should be empty. + let apply_res = fetch_apply_res(&rx); + assert!(apply_res.exec_res.is_empty()); + // The entry should be applied now. + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.applied_term, 3); + assert_eq!(apply_res.apply_state.get_applied_index(), 8); + + // UploadSST + let sst_path = import_dir.path().join("test.sst"); + let mut sst_epoch = RegionEpoch::default(); + sst_epoch.set_conf_ver(1); + sst_epoch.set_version(3); + let sst_range = (0, 100); + let (mut meta1, data1) = gen_sst_file(&sst_path, sst_range); + meta1.set_region_id(1); + meta1.set_region_epoch(sst_epoch); + let mut file1 = importer.create(&meta1).unwrap(); + file1.append(&data1).unwrap(); + file1.finish().unwrap(); + let (mut meta2, data2) = gen_sst_file(&sst_path, sst_range); + meta2.set_region_id(1); + meta2.mut_region_epoch().set_conf_ver(1); + meta2.mut_region_epoch().set_version(1234); + let mut file2 = importer.create(&meta2).unwrap(); + file2.append(&data2).unwrap(); + file2.finish().unwrap(); + + // IngestSst + let put_ok = EntryBuilderUsingSimpleWrite::new(9, 3) + .put(&[sst_range.0], &[sst_range.1]) + .epoch(0, 3) + .build(); + // Add a put above to test flush before ingestion. + let capture_tx_clone = capture_tx.clone(); + let ingest_ok = EntryBuilderUsingSimpleWrite::new(10, 3) + .ingest_sst(&meta1) + .epoch(0, 3) + .build(); + let ingest_epoch_not_match = EntryBuilderUsingSimpleWrite::new(11, 3) + .ingest_sst(&meta2) + .epoch(0, 3) + .build(); + let entries = vec![put_ok, ingest_ok, ingest_epoch_not_match]; + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 3, + entries, + vec![ + cb(9, 3, capture_tx.clone()), + proposal( + false, + 10, + 3, + Callback::write(Box::new(move |resp: WriteResponse| { + // Sleep until yield timeout. 
+ thread::sleep(Duration::from_millis(500)); + capture_tx_clone.send(resp.response).unwrap(); + })), + ), + cb(11, 3, capture_tx.clone()), + ], + )), + ); + + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + check_db_range(&engine, sst_range); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(resp.get_header().has_error()); + + // The region was rescheduled to normal-priority handler because of + // normal put command, so the first apply_res.exec_res should be empty. + let apply_res = fetch_apply_res(&rx); + assert!(apply_res.exec_res.is_empty()); + // The region was rescheduled to low-priority because of ingest command, + // only put entry has been applied; + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.applied_term, 3); assert_eq!(apply_res.apply_state.get_applied_index(), 9); // The region will yield after timeout. let apply_res = fetch_apply_res(&rx); - assert_eq!(apply_res.applied_index_term, 3); + assert_eq!(apply_res.applied_term, 3); assert_eq!(apply_res.apply_state.get_applied_index(), 10); // The third entry should be applied now. let apply_res = fetch_apply_res(&rx); - assert_eq!(apply_res.applied_index_term, 3); + assert_eq!(apply_res.applied_term, 3); assert_eq!(apply_res.apply_state.get_applied_index(), 11); let write_batch_max_keys = ::WRITE_BATCH_MAX_KEYS; @@ -5204,15 +6532,102 @@ mod tests { for _ in 0..write_batch_max_keys { capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); } - let index = write_batch_max_keys + 11; - // The region was rescheduled to normal-priority handler. Discard the first apply_res. - fetch_apply_res(&rx); + let index = write_batch_max_keys + 11; + // The region was rescheduled to normal-priority handler. Discard the first + // apply_res. 
+ fetch_apply_res(&rx); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.apply_state.get_applied_index(), index as u64); + assert_eq!(obs.pre_query_count.load(Ordering::SeqCst), index); + assert_eq!(obs.post_query_count.load(Ordering::SeqCst), index); + + system.shutdown(); + } + + #[test] + fn test_apply_yield_with_msg_size() { + let (_path, engine) = create_tmp_engine("test-apply-yield"); + let (_import_dir, importer) = create_tmp_importer("test-apply-yield"); + let obs = ApplyObserver::default(); + let mut host = CoprocessorHost::::default(); + host.registry + .register_query_observer(1, BoxQueryObserver::new(obs)); + + let (tx, rx) = mpsc::channel(); + let (region_scheduler, _) = dummy_scheduler(); + let sender = Box::new(TestNotifier { tx }); + let cfg = Arc::new(VersionTrack::new(Config::default())); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); + let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); + let builder = super::Builder:: { + tag: "test-store".to_owned(), + cfg: cfg.clone(), + sender, + region_scheduler, + coprocessor_host: host, + importer, + engine, + router: router.clone(), + store_id: 1, + pending_create_peers, + }; + system.spawn("test-handle-raft".to_owned(), builder); + + let peer_id = 3; + let mut reg = Registration { + id: peer_id, + ..Default::default() + }; + reg.region.set_id(1); + reg.region.mut_peers().push(new_peer(2, 3)); + reg.region.set_end_key(b"k5".to_vec()); + reg.region.mut_region_epoch().set_conf_ver(1); + reg.region.mut_region_epoch().set_version(3); + router.schedule_task(1, Msg::Registration(reg)); + + let schedule_apply = |idx: u64, count: usize, size: usize| { + let mut entries = Vec::with_capacity(count); + for i in 0..count { + let put_entry = EntryBuilder::new(idx + i as u64, 3) + .put(format!("k{:03}", i).as_ref(), &vec![0; size - 4]) + .epoch(1, 3) + .build(); + entries.push(put_entry); + } + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 3, entries, 
vec![]))); + }; + + fn approximate_eq(a: u64, b: u64, delta: u64) { + assert!( + a >= b - delta && a <= b + delta, + "left: {}, right: {}, delta: {}", + a, + b, + delta + ); + } + + // scheduling a batch with 512 keys and 64k total size will trigger 2 flushes and + // yields. + schedule_apply(1, 512, 128); let apply_res = fetch_apply_res(&rx); - assert_eq!(apply_res.apply_state.get_applied_index(), index as u64); - assert_eq!(obs.pre_query_count.load(Ordering::SeqCst), index); - assert_eq!(obs.post_query_count.load(Ordering::SeqCst), index); + approximate_eq(apply_res.metrics.written_bytes, 32768, 2048); + approximate_eq(apply_res.metrics.written_keys, 256, 15); + // the second part, note that resume apply does not clean up the metrics + let apply_res = fetch_apply_res(&rx); + approximate_eq(apply_res.metrics.written_bytes, 32768, 2048); + approximate_eq(apply_res.metrics.written_keys, 256, 15); - system.shutdown(); + // update apply yield size to 64kb + _ = cfg.update(|c| { + c.apply_yield_write_size = ReadableSize::kb(64); + Ok::<(), ()>(()) + }); + // only trigger one flush and yield this time + schedule_apply(513, 512, 128); + let apply_res = fetch_apply_res(&rx); + approximate_eq(apply_res.metrics.written_bytes, 65536, 4096); + approximate_eq(apply_res.metrics.written_keys, 512, 20); } #[test] @@ -5233,7 +6648,7 @@ mod tests { cfg.apply_batch_system.low_priority_pool_size = 0; Arc::new(VersionTrack::new(cfg)) }; - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -5261,9 +6676,10 @@ mod tests { reg.region.mut_region_epoch().set_conf_ver(1); reg.region.mut_region_epoch().set_version(3); router.schedule_task(1, Msg::Registration(reg)); - // Test whether put commands and ingest commands are applied to engine in a correct order. - // We will generate 5 entries which are put, ingest, put, ingest, put respectively. 
For a same key, - // it can exist in multiple entries or in a single entries. We will test all all the possible + // Test whether put commands and ingest commands are applied to engine in a + // correct order. We will generate 5 entries which are put, ingest, put, + // ingest, put respectively. For a same key, it can exist in multiple + // entries or in a single entries. We will test all all the possible // keys exsiting combinations. let mut keys = Vec::new(); let keys_count = 1 << 5; @@ -5292,7 +6708,7 @@ mod tests { } } let sst_path = import_dir.path().join("test.sst"); - let (mut meta, data) = gen_sst_file_with_kvs(&sst_path, &kvs); + let (mut meta, data) = gen_sst_file_with_kvs(sst_path, &kvs); meta.set_region_id(1); meta.mut_region_epoch().set_conf_ver(1); meta.mut_region_epoch().set_version(3); @@ -5323,7 +6739,7 @@ mod tests { } } let sst_path = import_dir.path().join("test2.sst"); - let (mut meta, data) = gen_sst_file_with_kvs(&sst_path, &kvs); + let (mut meta, data) = gen_sst_file_with_kvs(sst_path, &kvs); meta.set_region_id(1); meta.mut_region_epoch().set_conf_ver(1); meta.mut_region_epoch().set_version(3); @@ -5380,8 +6796,8 @@ mod tests { assert!(!resp.get_header().has_error(), "{:?}", resp); } let mut res = fetch_apply_res(&rx); - // There may be one or two ApplyRes which depends on whether these two apply msgs - // are batched together. + // There may be one or two ApplyRes which depends on whether these two apply + // msgs are batched together. 
if res.apply_state.get_applied_index() == 3 { res = fetch_apply_res(&rx); } @@ -5394,6 +6810,326 @@ mod tests { } } + #[test] + fn test_bucket_version_change_in_try_batch() { + let (_path, engine) = create_tmp_engine("test-bucket"); + let (_, importer) = create_tmp_importer("test-bucket"); + let obs = ApplyObserver::default(); + let mut host = CoprocessorHost::::default(); + host.registry + .register_query_observer(1, BoxQueryObserver::new(obs)); + + let (tx, rx) = mpsc::channel(); + let (region_scheduler, _) = dummy_scheduler(); + let sender = Box::new(TestNotifier { tx }); + let cfg = { + let mut cfg = Config::default(); + cfg.apply_batch_system.pool_size = 1; + cfg.apply_batch_system.low_priority_pool_size = 0; + Arc::new(VersionTrack::new(cfg)) + }; + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); + let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); + let builder = super::Builder:: { + tag: "test-store".to_owned(), + cfg, + sender, + region_scheduler, + coprocessor_host: host, + importer, + engine, + router: router.clone(), + store_id: 1, + pending_create_peers, + }; + system.spawn("test-bucket".to_owned(), builder); + + let mut reg = Registration { + id: 1, + ..Default::default() + }; + reg.region.set_id(1); + reg.region.mut_peers().push(new_peer(1, 1)); + reg.region.set_start_key(b"k1".to_vec()); + reg.region.set_end_key(b"k2".to_vec()); + reg.region.mut_region_epoch().set_conf_ver(1); + reg.region.mut_region_epoch().set_version(3); + router.schedule_task(1, Msg::Registration(reg)); + + let entry1 = { + let mut entry = EntryBuilder::new(1, 1); + entry = entry.put(b"key1", b"value1"); + entry.epoch(1, 3).build() + }; + + let entry2 = { + let mut entry = EntryBuilder::new(2, 1); + entry = entry.put(b"key2", b"value2"); + entry.epoch(1, 3).build() + }; + + let (capture_tx, _capture_rx) = mpsc::channel(); + let mut apply1 = apply(1, 1, 1, vec![entry1], vec![cb(1, 1, capture_tx.clone())]); + let bucket_meta = BucketMeta 
{ + region_id: 1, + region_epoch: RegionEpoch::default(), + version: 1, + keys: vec![b"".to_vec(), b"".to_vec()], + sizes: vec![0, 0], + }; + apply1.bucket_meta = Some(Arc::new(bucket_meta)); + + let mut apply2 = apply(1, 1, 1, vec![entry2], vec![cb(2, 1, capture_tx)]); + let mut bucket_meta2 = BucketMeta { + region_id: 1, + region_epoch: RegionEpoch::default(), + version: 2, + keys: vec![b"".to_vec(), b"".to_vec()], + sizes: vec![0, 0], + }; + bucket_meta2.version = 2; + apply2.bucket_meta = Some(Arc::new(bucket_meta2)); + + router.schedule_task(1, Msg::apply(apply1)); + router.schedule_task(1, Msg::apply(apply2)); + + let res = fetch_apply_res(&rx); + let bucket_version = res.bucket_stat.unwrap().as_ref().meta.version; + + assert_eq!(bucket_version, 2); + + validate(&router, 1, |delegate| { + let bucket_version = delegate.buckets.as_ref().unwrap().meta.version; + assert_eq!(bucket_version, 2); + }); + } + + #[test] + fn test_exec_observer() { + let (_path, engine) = create_tmp_engine("test-exec-observer"); + let (import_dir, importer) = create_tmp_importer("test-exec-observer"); + let mut host = CoprocessorHost::::default(); + let obs = ApplyObserver::default(); + host.registry + .register_admin_observer(1, BoxAdminObserver::new(obs.clone())); + host.registry + .register_region_change_observer(1, BoxRegionChangeObserver::new(obs.clone())); + host.registry + .register_query_observer(1, BoxQueryObserver::new(obs.clone())); + + let (tx, rx) = mpsc::channel(); + let (region_scheduler, _) = dummy_scheduler(); + let sender = Box::new(TestNotifier { tx }); + let cfg = Config::default(); + let (router, mut system) = create_apply_batch_system(&cfg, None); + let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); + let builder = super::Builder:: { + tag: "test-exec-observer".to_owned(), + cfg: Arc::new(VersionTrack::new(cfg)), + sender, + region_scheduler, + coprocessor_host: host, + importer: importer.clone(), + engine: engine.clone(), + router: 
router.clone(), + store_id: 1, + pending_create_peers, + }; + system.spawn("test-exec-observer".to_owned(), builder); + + let peer_id = 3; + let mut reg = Registration { + id: peer_id, + ..Default::default() + }; + reg.region.set_id(1); + reg.region.mut_peers().push(new_peer(1, peer_id)); + reg.region.set_end_key(b"k5".to_vec()); + reg.region.mut_region_epoch().set_conf_ver(1); + reg.region.mut_region_epoch().set_version(3); + router.schedule_task(1, Msg::Registration(reg)); + + obs.skip_persist_when_pre_commit + .store(true, Ordering::SeqCst); + let mut index_id = 1; + let put_entry = EntryBuilder::new(index_id, 1) + .put(b"k1", b"v1") + .put(b"k2", b"v2") + .put(b"k3", b"v3") + .epoch(1, 3) + .build(); + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 1, vec![put_entry], vec![]))); + let apply_res = fetch_apply_res(&rx); + + // We don't persist at `finish_for`, since we disabled `pre_persist`. + let state: RaftApplyState = engine + .get_msg_cf(CF_RAFT, &keys::apply_state_key(1)) + .unwrap() + .unwrap_or_default(); + assert_eq!( + apply_res.apply_state.get_applied_index(), + state.get_applied_index() + 1 + ); + obs.skip_persist_when_pre_commit + .store(false, Ordering::SeqCst); + + // Phase 1: we test if pre_exec will filter execution of commands correctly. + index_id += 1; + let compact_entry = EntryBuilder::new(index_id, 1) + .compact_log(index_id - 1, 2) + .epoch(1, 3) + .build(); + // Filter CompactLog + obs.filter_compact_log.store(true, Ordering::SeqCst); + router.schedule_task( + 1, + Msg::apply(apply(peer_id, 1, 1, vec![compact_entry], vec![])), + ); + let apply_res = fetch_apply_res(&rx); + // applied_index can still be advanced. + assert_eq!(apply_res.apply_state.get_applied_index(), index_id); + assert_eq!(apply_res.applied_term, 1); + // Executing CompactLog is filtered and takes no effect. 
+ assert_eq!(apply_res.exec_res.len(), 0); + assert_eq!(apply_res.apply_state.get_truncated_state().get_index(), 0); + + // We persist at `finish_for`, since we enabled `pre_persist`. + let state: RaftApplyState = engine + .get_msg_cf(CF_RAFT, &keys::apply_state_key(1)) + .unwrap() + .unwrap_or_default(); + assert_eq!( + apply_res.apply_state.get_applied_index(), + state.get_applied_index() + ); + + index_id += 1; + // Don't filter CompactLog + obs.filter_compact_log.store(false, Ordering::SeqCst); + let compact_entry = EntryBuilder::new(index_id, 1) + .compact_log(index_id - 1, 2) + .epoch(1, 3) + .build(); + router.schedule_task( + 1, + Msg::apply(apply(peer_id, 1, 1, vec![compact_entry], vec![])), + ); + let apply_res = fetch_apply_res(&rx); + // applied_index can still be advanced. + assert_eq!(apply_res.apply_state.get_applied_index(), index_id); + assert_eq!(apply_res.applied_term, 1); + // We can get exec result of CompactLog. + assert_eq!(apply_res.exec_res.len(), 1); + assert_eq!( + apply_res.apply_state.get_truncated_state().get_index(), + index_id - 1 + ); + + index_id += 1; + obs.filter_consistency_check.store(true, Ordering::SeqCst); + let compute_hash_entry = EntryBuilder::new(index_id, 1).compute_hash(vec![]).build(); + router.schedule_task( + 1, + Msg::apply(apply(peer_id, 1, 1, vec![compute_hash_entry], vec![])), + ); + let apply_res = fetch_apply_res(&rx); + // applied_index can still be advanced. + assert_eq!(apply_res.apply_state.get_applied_index(), index_id); + assert_eq!(apply_res.applied_term, 1); + // We can't get exec result of ComputeHash. + assert_eq!(apply_res.exec_res.len(), 0); + obs.filter_consistency_check.store(false, Ordering::SeqCst); + + // Phase 2: we test if post_exec will persist when need. + // We choose BatchSplit in order to make sure `modified_region` is filled. 
+ index_id += 1; + let mut splits = BatchSplitRequest::default(); + splits.set_right_derive(true); + splits.mut_requests().push(new_split_req(b"k2", 8, vec![7])); + let split = EntryBuilder::new(index_id, 1) + .split(splits) + .epoch(1, 3) + .build(); + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 1, vec![split], vec![]))); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.apply_state.get_applied_index(), index_id); + assert_eq!(apply_res.applied_term, 1); + let (r1, r8) = if let ExecResult::SplitRegion { + regions, + derived: _, + new_split_regions: _, + } = apply_res.exec_res.front().unwrap() + { + let r8 = regions.get(0).unwrap(); + let r1 = regions.get(1).unwrap(); + assert_eq!(r8.get_id(), 8); + assert_eq!(r1.get_id(), 1); + (r1, r8) + } else { + panic!("error split exec_res"); + }; + + index_id += 1; + let merge = EntryBuilder::new(index_id, 1) + .prepare_merge(r8.clone()) + .epoch(1, 3) + .build(); + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 1, vec![merge], vec![]))); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.apply_state.get_applied_index(), index_id); + assert_eq!(apply_res.applied_term, 1); + // PrepareMerge will trigger commit. + let state: RaftApplyState = engine + .get_msg_cf(CF_RAFT, &keys::apply_state_key(1)) + .unwrap() + .unwrap_or_default(); + assert_eq!(apply_res.apply_state, state); + + // Phase 3: we test if we can delay deletion of some sst files. 
+ let r1_epoch = r1.get_region_epoch(); + index_id += 1; + let kvs: Vec<(&[u8], &[u8])> = vec![(b"k3", b"2")]; + let sst_path = import_dir.path().join("test.sst"); + let (mut meta, data) = gen_sst_file_with_kvs(&sst_path, &kvs); + meta.set_region_id(1); + meta.set_region_epoch(r1_epoch.clone()); + let mut file = importer.create(&meta).unwrap(); + file.append(&data).unwrap(); + file.finish().unwrap(); + let src = sst_path.clone(); + let dst = file.get_import_path().save.to_str().unwrap(); + std::fs::copy(src, dst).unwrap(); + assert!(sst_path.as_path().exists()); + let ingestsst = EntryBuilder::new(index_id, 1) + .ingest_sst(&meta) + .epoch(r1_epoch.get_conf_ver(), r1_epoch.get_version()) + .build(); + + obs.delay_remove_ssts.store(true, Ordering::SeqCst); + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 1, vec![ingestsst], vec![]))); + fetch_apply_res(&rx); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.exec_res.len(), 1); + assert_eq!(obs.last_pending_handle_sst_count.load(Ordering::SeqCst), 0); + assert_eq!(obs.last_delete_sst_count.load(Ordering::SeqCst), 0); + assert_eq!(obs.last_pending_delete_sst_count.load(Ordering::SeqCst), 1); + + index_id += 1; + let ingestsst = EntryBuilder::new(index_id, 1) + .ingest_sst(&meta) + .epoch(r1_epoch.get_conf_ver(), r1_epoch.get_version()) + .build(); + obs.delay_remove_ssts.store(false, Ordering::SeqCst); + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 1, vec![ingestsst], vec![]))); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.exec_res.len(), 1); + assert_eq!(obs.last_pending_handle_sst_count.load(Ordering::SeqCst), 0); + assert_eq!(obs.last_delete_sst_count.load(Ordering::SeqCst), 1); + assert_eq!(obs.last_pending_delete_sst_count.load(Ordering::SeqCst), 1); + + system.shutdown(); + } + #[test] fn test_cmd_observer() { let (_path, engine) = create_tmp_engine("test-delegate"); @@ -5409,7 +7145,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = 
Box::new(TestNotifier { tx }); let cfg = Config::default(); - let (router, mut system) = create_apply_batch_system(&cfg); + let (router, mut system) = create_apply_batch_system(&cfg, None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -5471,7 +7207,7 @@ mod tests { Msg::Change { region_epoch: region_epoch.clone(), cmd: ChangeObserver::from_cdc(1, observe_handle.clone()), - cb: Callback::Read(Box::new(|resp: ReadResponse| { + cb: Callback::read(Box::new(|resp: ReadResponse| { assert!(!resp.response.get_header().has_error()); assert!(resp.snapshot.is_some()); let snap = resp.snapshot.unwrap(); @@ -5540,7 +7276,7 @@ mod tests { Msg::Change { region_epoch, cmd: ChangeObserver::from_cdc(2, observe_handle), - cb: Callback::Read(Box::new(|resp: ReadResponse<_>| { + cb: Callback::read(Box::new(|resp: ReadResponse<_>| { assert!( resp.response .get_header() @@ -5561,29 +7297,29 @@ mod tests { let mut region = Region::default(); // Check uuid and cf name - assert!(check_sst_for_ingestion(&sst, &region).is_err()); + check_sst_for_ingestion(&sst, &region).unwrap_err(); sst.set_uuid(Uuid::new_v4().as_bytes().to_vec()); sst.set_cf_name(CF_DEFAULT.to_owned()); check_sst_for_ingestion(&sst, &region).unwrap(); sst.set_cf_name("test".to_owned()); - assert!(check_sst_for_ingestion(&sst, &region).is_err()); + check_sst_for_ingestion(&sst, &region).unwrap_err(); sst.set_cf_name(CF_WRITE.to_owned()); check_sst_for_ingestion(&sst, &region).unwrap(); // Check region id region.set_id(1); sst.set_region_id(2); - assert!(check_sst_for_ingestion(&sst, &region).is_err()); + check_sst_for_ingestion(&sst, &region).unwrap_err(); sst.set_region_id(1); check_sst_for_ingestion(&sst, &region).unwrap(); // Check region epoch region.mut_region_epoch().set_conf_ver(1); - assert!(check_sst_for_ingestion(&sst, &region).is_err()); + check_sst_for_ingestion(&sst, &region).unwrap_err(); sst.mut_region_epoch().set_conf_ver(1); 
check_sst_for_ingestion(&sst, &region).unwrap(); region.mut_region_epoch().set_version(1); - assert!(check_sst_for_ingestion(&sst, &region).is_err()); + check_sst_for_ingestion(&sst, &region).unwrap_err(); sst.mut_region_epoch().set_version(1); check_sst_for_ingestion(&sst, &region).unwrap(); @@ -5592,9 +7328,9 @@ mod tests { region.set_end_key(vec![8]); sst.mut_range().set_start(vec![1]); sst.mut_range().set_end(vec![8]); - assert!(check_sst_for_ingestion(&sst, &region).is_err()); + check_sst_for_ingestion(&sst, &region).unwrap_err(); sst.mut_range().set_start(vec![2]); - assert!(check_sst_for_ingestion(&sst, &region).is_err()); + check_sst_for_ingestion(&sst, &region).unwrap_err(); sst.mut_range().set_end(vec![7]); check_sst_for_ingestion(&sst, &region).unwrap(); } @@ -5689,7 +7425,7 @@ mod tests { .register_cmd_observer(1, BoxCmdObserver::new(obs)); let (region_scheduler, _) = dummy_scheduler(); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -5712,7 +7448,7 @@ mod tests { Msg::Change { region_epoch: region_epoch.clone(), cmd: ChangeObserver::from_cdc(1, observe_handle.clone()), - cb: Callback::Read(Box::new(|resp: ReadResponse<_>| { + cb: Callback::read(Box::new(|resp: ReadResponse<_>| { assert!(!resp.response.get_header().has_error(), "{:?}", resp); assert!(resp.snapshot.is_some()); })), @@ -5767,12 +7503,13 @@ mod tests { resp ); + splits.mut_requests().clear(); splits .mut_requests() .push(new_split_req(b"", 8, vec![9, 10, 11])); let resp = exec_split(&router, splits.clone()); - // Empty key should be rejected. - assert!(error_msg(&resp).contains("missing"), "{:?}", resp); + // Empty key will not be in any region exclusively. 
+ assert!(error_msg(&resp).contains("missing split key"), "{:?}", resp); splits.mut_requests().clear(); splits @@ -5867,7 +7604,7 @@ mod tests { Msg::Change { region_epoch, cmd: ChangeObserver::from_cdc(1, observe_handle), - cb: Callback::Read(Box::new(move |resp: ReadResponse<_>| { + cb: Callback::read(Box::new(move |resp: ReadResponse<_>| { assert!( resp.response.get_header().get_error().has_epoch_not_match(), "{:?}", @@ -5886,7 +7623,7 @@ mod tests { #[test] fn pending_cmd_leak() { let res = panic_hook::recover_safe(|| { - let _cmd = PendingCmd::::new(1, 1, Callback::None); + let _cmd = PendingCmd::new(1, 1, Callback::::None); }); res.unwrap_err(); } @@ -5894,10 +7631,197 @@ mod tests { #[test] fn pending_cmd_leak_dtor_not_abort() { let res = panic_hook::recover_safe(|| { - let _cmd = PendingCmd::::new(1, 1, Callback::None); + let _cmd = PendingCmd::new(1, 1, Callback::::None); panic!("Don't abort"); - // It would abort and fail if there was a double-panic in PendingCmd dtor. + // It would abort and fail if there was a double-panic in PendingCmd + // dtor. 
}); res.unwrap_err(); } + + #[test] + fn flashback_need_to_be_applied() { + let (_path, engine) = create_tmp_engine("flashback_need_to_be_applied"); + let (_, importer) = create_tmp_importer("flashback_need_to_be_applied"); + let mut host = CoprocessorHost::::default(); + host.registry + .register_query_observer(1, BoxQueryObserver::new(ApplyObserver::default())); + + let (tx, rx) = mpsc::channel(); + let (region_scheduler, _) = dummy_scheduler(); + let sender = Box::new(TestNotifier { tx }); + let cfg = Arc::new(VersionTrack::new(Config::default())); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); + let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); + let builder = super::Builder:: { + tag: "flashback_need_to_be_applied".to_owned(), + cfg, + sender, + region_scheduler, + coprocessor_host: host, + importer, + engine: engine.clone(), + router: router.clone(), + store_id: 1, + pending_create_peers, + }; + system.spawn("flashback_need_to_be_applied".to_owned(), builder); + + let peer_id = 3; + let mut reg = Registration { + id: peer_id, + ..Default::default() + }; + reg.region.set_id(1); + reg.region.mut_peers().push(new_peer(2, 3)); + reg.region.mut_region_epoch().set_conf_ver(1); + reg.region.mut_region_epoch().set_version(3); + reg.region.set_is_in_flashback(true); + router.schedule_task(1, Msg::Registration(reg)); + + let (capture_tx, capture_rx) = mpsc::channel(); + let mut region_state = RegionLocalState::default(); + region_state.mut_region().set_is_in_flashback(false); + let region_state_key = keys::region_state_key(1); + engine + .put_msg_cf(CF_RAFT, &region_state_key, &region_state) + .unwrap(); + // Check for not flashback request. 
+ let mut cmd = AdminRequest::default(); + cmd.set_cmd_type(AdminCmdType::TransferLeader); + let mut flashback_req = EntryBuilder::new(1, 1).epoch(1, 3); + flashback_req.req.set_admin_request(cmd.clone()); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 1, + vec![flashback_req.build()], + vec![cb(1, 1, capture_tx.clone())], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(resp.get_header().get_error().has_flashback_in_progress()); + // Check for flashback request. + cmd.set_cmd_type(AdminCmdType::PrepareFlashback); + region_state.mut_region().set_is_in_flashback(false); + let mut flashback_req = EntryBuilder::new(2, 2).epoch(1, 3); + flashback_req.req.set_admin_request(cmd.clone()); + flashback_req + .req + .mut_header() + .set_flags(WriteBatchFlags::FLASHBACK.bits()); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 2, + vec![flashback_req.build()], + vec![cb(2, 2, capture_tx)], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + rx.recv_timeout(Duration::from_millis(500)).unwrap(); + system.shutdown(); + } + + fn new_batch_split_request(keys: Vec>) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSplit); + for key in keys { + let mut split_req = SplitRequest::default(); + split_req.set_split_key(key); + split_req.set_new_peer_ids(vec![1]); + req.mut_splits().mut_requests().push(split_req); + } + req + } + + #[test] + fn test_validate_batch_split() { + let mut region = Region::default(); + region.set_start_key(b"k05".to_vec()); + region.set_end_key(b"k10".to_vec()); + region.set_peers(vec![new_peer(1, 2)].into()); + + let missing_error = "missing split requests"; + let invalid_error = "invalid split request"; + let not_in_region_error = "not in region"; + let empty_error = "missing split key"; + let peer_id_error = "invalid new peer id count"; + + 
// case: split is deprecated + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::Split); + let mut split_req = SplitRequest::default(); + split_req.set_split_key(b"k06".to_vec()); + req.set_split(split_req); + assert!( + validate_batch_split(&req, &region) + .unwrap_err() + .to_string() + .contains(missing_error) + ); + + // case: missing peer ids + let mut req = new_batch_split_request(vec![b"k07".to_vec()]); + req.mut_splits() + .mut_requests() + .get_mut(0) + .unwrap() + .new_peer_ids + .clear(); + assert!( + validate_batch_split(&req, &region) + .unwrap_err() + .to_string() + .contains(peer_id_error) + ); + + let fail_cases = vec![ + // case: default admin request should be rejected + (vec![], missing_error), + // case: empty split key + (vec![vec![]], empty_error), + // case: out of order split keys + ( + vec![b"k07".to_vec(), b"k08".to_vec(), b"k06".to_vec()], + invalid_error, + ), + // case: split keys are not in region range + ( + vec![b"k04".to_vec(), b"k07".to_vec(), b"k08".to_vec()], + invalid_error, + ), + // case: split keys are not in region range + ( + vec![b"k06".to_vec(), b"k07".to_vec(), b"k11".to_vec()], + not_in_region_error, + ), + // case: duplicated split keys + (vec![b"k06".to_vec(), b"k06".to_vec()], invalid_error), + ]; + + for (split_keys, fail_str) in fail_cases { + let req = if split_keys.is_empty() { + AdminRequest::default() + } else { + new_batch_split_request(split_keys) + }; + assert!( + validate_batch_split(&req, &region) + .unwrap_err() + .to_string() + .contains(fail_str) + ); + } + + // case: pass the validation + let req = new_batch_split_request(vec![b"k06".to_vec(), b"k07".to_vec(), b"k08".to_vec()]); + validate_batch_split(&req, &region).unwrap(); + } } diff --git a/components/raftstore/src/store/fsm/life.rs b/components/raftstore/src/store/fsm/life.rs new file mode 100644 index 00000000000..59aa8b316f0 --- /dev/null +++ b/components/raftstore/src/store/fsm/life.rs @@ -0,0 +1,92 @@ +// Copyright 2023 TiKV Project
Authors. Licensed under Apache-2.0. + +//! This module contains functions that relate to peer lifetime management and +//! are shared with raftstore and raftstore v2. + +use engine_traits::{KvEngine, CF_RAFT}; +use kvproto::raft_serverpb::{ExtraMessageType, PeerState, RaftMessage, RegionLocalState}; + +use crate::store::util::is_epoch_stale; + +/// Tell leader that `to_peer` from `tombstone_msg` is destroyed. +pub fn build_peer_destroyed_report(tombstone_msg: &mut RaftMessage) -> Option { + let to_region_id = if tombstone_msg.has_extra_msg() { + assert_eq!( + tombstone_msg.get_extra_msg().get_type(), + ExtraMessageType::MsgGcPeerRequest + ); + tombstone_msg + .get_extra_msg() + .get_check_gc_peer() + .get_from_region_id() + } else { + tombstone_msg.get_region_id() + }; + if to_region_id == 0 || tombstone_msg.get_from_peer().get_id() == 0 { + return None; + } + let mut msg = RaftMessage::default(); + msg.set_region_id(to_region_id); + msg.set_from_peer(tombstone_msg.take_to_peer()); + msg.set_to_peer(tombstone_msg.take_from_peer()); + msg.mut_extra_msg() + .set_type(ExtraMessageType::MsgGcPeerResponse); + Some(msg) +} + +/// Forward the destroy request from target peer to merged source peer. +pub fn forward_destroy_to_source_peer(msg: &RaftMessage, forward: T) { + let extra_msg = msg.get_extra_msg(); + // Instead of responding to the leader directly, send a message to the target + // region to double check it's really destroyed. + let check_gc_peer = extra_msg.get_check_gc_peer(); + let mut tombstone_msg = RaftMessage::default(); + tombstone_msg.set_region_id(check_gc_peer.get_check_region_id()); + tombstone_msg.set_from_peer(msg.get_from_peer().clone()); + tombstone_msg.set_to_peer(check_gc_peer.get_check_peer().clone()); + tombstone_msg.set_region_epoch(check_gc_peer.get_check_region_epoch().clone()); + tombstone_msg.set_is_tombstone(true); + // No need to set epoch as we don't know what it is.
+ // This message will not be handled by `on_gc_peer_request` due to + // `is_tombstone` being true. + tombstone_msg + .mut_extra_msg() + .set_type(ExtraMessageType::MsgGcPeerRequest); + tombstone_msg + .mut_extra_msg() + .mut_check_gc_peer() + .set_from_region_id(check_gc_peer.get_from_region_id()); + forward(tombstone_msg); +} + +pub fn handle_tombstone_message_on_learner( + engine: &EK, + store_id: u64, + mut msg: RaftMessage, +) -> Option { + let region_id = msg.get_region_id(); + let region_state_key = keys::region_state_key(region_id); + let local_state: RegionLocalState = match engine.get_msg_cf(CF_RAFT, &region_state_key) { + Ok(Some(s)) => s, + e => panic!( + "[store {}] failed to get regions state of {:?}: {:?}", + store_id, + msg.get_region_id(), + e + ), + }; + + if local_state.get_state() != PeerState::Tombstone { + return None; + } + + // In v2, we rely on leader to confirm destroy actively. + let local_epoch = local_state.get_region().get_region_epoch(); + // The region in this peer is already destroyed + if msg.get_region_epoch() == local_epoch || is_epoch_stale(msg.get_region_epoch(), local_epoch) + { + return build_peer_destroyed_report(&mut msg); + } + + None +} diff --git a/components/raftstore/src/store/fsm/metrics.rs b/components/raftstore/src/store/fsm/metrics.rs index a866a70175c..6ee346bfd75 100644 --- a/components/raftstore/src/store/fsm/metrics.rs +++ b/components/raftstore/src/store/fsm/metrics.rs @@ -8,8 +8,7 @@ use std::sync::{ use lazy_static::lazy_static; use prometheus::{exponential_buckets, register_histogram, Histogram}; - -use crate::store::QueryStats; +use tikv_util::store::QueryStats; lazy_static! { pub static ref APPLY_PROPOSAL: Histogram = register_histogram!( diff --git a/components/raftstore/src/store/fsm/mod.rs b/components/raftstore/src/store/fsm/mod.rs index 731ad5209b4..f342c1ec733 100644 --- a/components/raftstore/src/store/fsm/mod.rs +++ b/components/raftstore/src/store/fsm/mod.rs @@ -5,18 +5,22 @@ //! stores.
They are mixed for now, will be separated in the future. pub mod apply; +pub mod life; mod metrics; mod peer; pub mod store; pub use self::{ apply::{ - create_apply_batch_system, Apply, ApplyBatchSystem, ApplyMetrics, ApplyRes, ApplyRouter, - Builder as ApplyPollerBuilder, CatchUpLogs, ChangeObserver, ChangePeer, ExecResult, - GenSnapTask, Msg as ApplyTask, Notifier as ApplyNotifier, Proposal, Registration, - TaskRes as ApplyTaskRes, + check_sst_for_ingestion, create_apply_batch_system, Apply, ApplyBatchSystem, ApplyMetrics, + ApplyRes, ApplyRouter, Builder as ApplyPollerBuilder, CatchUpLogs, ChangeObserver, + ChangePeer, ExecResult, GenSnapTask, Msg as ApplyTask, Notifier as ApplyNotifier, Proposal, + Registration, SwitchWitness, TaskRes as ApplyTaskRes, + }, + metrics::{GlobalStoreStat, LocalStoreStat}, + peer::{ + new_admin_request, new_read_index_request, DestroyPeerJob, PeerFsm, MAX_PROPOSAL_SIZE_RATIO, }, - peer::{DestroyPeerJob, PeerFsm}, store::{ create_raft_batch_system, RaftBatchSystem, RaftPollerBuilder, RaftRouter, StoreInfo, StoreMeta, diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index c61e3c3ba55..72eb3c59753 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -12,7 +12,7 @@ use std::{ iter::{FromIterator, Iterator}, mem, sync::{Arc, Mutex}, - time::Instant, + time::{Duration, Instant}, u64, }; @@ -21,8 +21,10 @@ use collections::{HashMap, HashSet}; use engine_traits::{Engines, KvEngine, RaftEngine, SstMetaInfo, WriteBatchExt, CF_LOCK, CF_RAFT}; use error_code::ErrorCodeExt; use fail::fail_point; +use futures::channel::mpsc::UnboundedSender; use keys::{self, enc_end_key, enc_start_key}; use kvproto::{ + brpb::CheckAdminResponse, errorpb, import_sstpb::SwitchMode, kvrpcpb::DiskFullOpt, @@ -33,13 +35,13 @@ use kvproto::{ StatusCmdType, StatusResponse, }, raft_serverpb::{ - ExtraMessage, ExtraMessageType, MergeState, PeerState, RaftApplyState, 
RaftMessage, - RaftSnapshotData, RaftTruncatedState, RegionLocalState, + ExtraMessage, ExtraMessageType, MergeState, PeerState, RaftMessage, RaftSnapshotData, + RaftTruncatedState, RegionLocalState, }, replication_modepb::{DrAutoSyncState, ReplicationMode}, }; use parking_lot::RwLockWriteGuard; -use pd_client::{merge_bucket_stats, new_bucket_stats, BucketMeta, BucketStat}; +use pd_client::{new_bucket_stats, BucketMeta, BucketStat}; use protobuf::Message; use raft::{ self, @@ -51,49 +53,53 @@ use tikv_alloc::trace::TraceEvent; use tikv_util::{ box_err, debug, defer, error, escape, info, is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, - sys::{disk::DiskUsage, memory_usage_reaches_high_water}, - time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant}, + store::{find_peer, find_peer_by_id, is_learner, region_on_same_stores}, + sys::disk::DiskUsage, + time::{monotonic_raw_now, Instant as TiInstant}, trace, warn, worker::{ScheduleError, Scheduler}, Either, }; +use tracker::GLOBAL_TRACKERS; use txn_types::WriteBatchFlags; use self::memtrace::*; +use super::life::forward_destroy_to_source_peer; #[cfg(any(test, feature = "testexport"))] use crate::store::PeerInternalStat; use crate::{ coprocessor::{RegionChangeEvent, RegionChangeReason}, store::{ cmd_resp::{bind_term, new_error}, + entry_storage::MAX_WARMED_UP_CACHE_KEEP_TIME, fsm::{ apply, store::{PollContext, StoreMeta}, ApplyMetrics, ApplyTask, ApplyTaskRes, CatchUpLogs, ChangeObserver, ChangePeer, - ExecResult, + ExecResult, SwitchWitness, }, hibernate_state::{GroupState, HibernateState}, - local_metrics::RaftMetrics, + local_metrics::{RaftMetrics, TimeTracker}, memory::*, metrics::*, msg::{Callback, ExtCallback, InspectedRaftMessage}, peer::{ - ConsistencyState, ForceLeaderState, Peer, PersistSnapshotResult, StaleState, - UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, - UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryState, UnsafeRecoveryWaitApplySyncer, - 
TRANSFER_LEADER_COMMAND_REPLY_CTX, + ConsistencyState, ForceLeaderState, Peer, PersistSnapshotResult, SnapshotRecoveryState, + SnapshotRecoveryWaitApplySyncer, StaleState, UnsafeRecoveryExecutePlanSyncer, + UnsafeRecoveryFillOutReportSyncer, UnsafeRecoveryForceLeaderSyncer, + UnsafeRecoveryState, UnsafeRecoveryWaitApplySyncer, TRANSFER_LEADER_COMMAND_REPLY_CTX, }, + region_meta::RegionMeta, transport::Transport, util, - util::{is_learner, KeysInfoFormatter, LeaseState}, + util::{KeysInfoFormatter, LeaseState}, worker::{ new_change_peer_v2_request, Bucket, BucketRange, CleanupTask, ConsistencyCheckTask, - GcSnapshotTask, RaftlogFetchTask, RaftlogGcTask, ReadDelegate, ReadProgress, - RegionTask, SplitCheckTask, + GcSnapshotTask, RaftlogGcTask, ReadDelegate, ReadProgress, RegionTask, SplitCheckTask, }, - AbstractPeer, CasualMessage, Config, LocksStatus, MergeResultKind, PdTask, PeerMsg, - PeerTick, ProposalContext, RaftCmdExtraOpts, RaftCommand, RaftlogFetchResult, - SignificantMsg, SnapKey, StoreMsg, + CasualMessage, Config, LocksStatus, MergeResultKind, PdTask, PeerMsg, PeerTick, + ProposalContext, RaftCmdExtraOpts, RaftCommand, RaftlogFetchResult, ReadCallback, ReadTask, + SignificantMsg, SnapKey, StoreMsg, WriteCallback, }, Error, Result, }; @@ -113,10 +119,13 @@ enum DelayReason { /// Limits the maximum number of regions returned by error. /// -/// Another choice is using coprocessor batch limit, but 10 should be a good fit in most case. +/// Another choice is using coprocessor batch limit, but 10 should be a good fit +/// in most case. const MAX_REGIONS_IN_ERROR: usize = 10; const REGION_SPLIT_SKIP_MAX_COUNT: usize = 3; +pub const MAX_PROPOSAL_SIZE_RATIO: f64 = 0.4; + pub struct DestroyPeerJob { pub initialized: bool, pub region_id: u64, @@ -129,13 +138,15 @@ where ER: RaftEngine, { pub peer: Peer, - /// A registry for all scheduled ticks. This can avoid scheduling ticks twice accidentally. + /// A registry for all scheduled ticks. 
This can avoid scheduling ticks + /// twice accidentally. tick_registry: [bool; PeerTick::VARIANT_COUNT], /// Ticks for speed up campaign in chaos state. /// - /// Followers will keep ticking in Idle mode to measure how many ticks have been skipped. - /// Once it becomes chaos, those skipped ticks will be ticked so that it can campaign - /// quickly instead of waiting an election timeout. + /// Followers will keep ticking in Idle mode to measure how many ticks have + /// been skipped. Once it becomes chaos, those skipped ticks will be + /// ticked so that it can campaign quickly instead of waiting an + /// election timeout. /// /// This will be reset to 0 once it receives any messages from leader. missing_ticks: usize, @@ -144,11 +155,12 @@ where has_ready: bool, mailbox: Option>>, pub receiver: Receiver>, - /// when snapshot is generating or sending, skip split check at most REGION_SPLIT_SKIT_MAX_COUNT times. + /// when snapshot is generating or sending, skip split check at most + /// REGION_SPLIT_SKIT_MAX_COUNT times. skip_split_count: usize, - /// Sometimes applied raft logs won't be compacted in time, because less compact means less - /// sync-log in apply threads. Stale logs will be deleted if the skip time reaches this - /// `skip_gc_raft_log_ticks`. + /// Sometimes applied raft logs won't be compacted in time, because less + /// compact means less sync-log in apply threads. Stale logs will be + /// deleted if the skip time reaches this `skip_gc_raft_log_ticks`. skip_gc_raft_log_ticks: usize, reactivate_memory_lock_ticks: usize, @@ -160,8 +172,8 @@ where /// Destroy is delayed because of some unpersisted readies in Peer. /// Should call `destroy_peer` again after persisting all readies. delayed_destroy: Option, - /// Before actually destroying a peer, ensure all log gc tasks are finished, so we - /// can start destroying without seeking. 
+ /// Before actually destroying a peer, ensure all log gc tasks are finished, + /// so we can start destroying without seeking. logs_gc_flushed: bool, } @@ -233,11 +245,12 @@ where store_id: u64, cfg: &Config, region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, engines: Engines, region: &metapb::Region, + wait_data: bool, ) -> Result> { - let meta_peer = match util::find_peer(region, store_id) { + let meta_peer = match find_peer(region, store_id) { None => { return Err(box_err!( "find no peer for store {} in region {:?}", @@ -266,6 +279,7 @@ where engines, region, meta_peer, + wait_data, )?, tick_registry: [false; PeerTick::VARIANT_COUNT], missing_ticks: 0, @@ -285,14 +299,14 @@ where )) } - // The peer can be created from another node with raft membership changes, and we only - // know the region_id and peer_id when creating this replicated peer, the region info - // will be retrieved later after applying snapshot. + // The peer can be created from another node with raft membership changes, and + // we only know the region_id and peer_id when creating this replicated peer, + // the region info will be retrieved later after applying snapshot. pub fn replicate( store_id: u64, cfg: &Config, region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, engines: Engines, region_id: u64, peer: metapb::Peer, @@ -320,6 +334,7 @@ where engines, &region, peer, + false, )?, tick_registry: [false; PeerTick::VARIANT_COUNT], missing_ticks: 0, @@ -458,9 +473,11 @@ where fn should_finish(&self, cfg: &Config) -> bool { if let Some(batch_req) = self.request.as_ref() { - // Limit the size of batch request so that it will not exceed raft_entry_max_size after - // adding header. - if self.batch_req_size > (cfg.raft_entry_max_size.0 as f64 * 0.4) as u64 { + // Limit the size of batch request so that it will not exceed + // raft_entry_max_size after adding header.
+ if self.batch_req_size + > (cfg.raft_entry_max_size.0 as f64 * MAX_PROPOSAL_SIZE_RATIO) as u64 + { return true; } if batch_req.get_requests().len() > ::WRITE_BATCH_MAX_KEYS { @@ -482,17 +499,11 @@ where let cb = self.callbacks.pop().unwrap(); return Some((req, cb)); } - metric.propose.batch += self.callbacks.len() - 1; + metric.propose.batch.inc_by(self.callbacks.len() as u64 - 1); let mut cbs = std::mem::take(&mut self.callbacks); let proposed_cbs: Vec = cbs .iter_mut() - .filter_map(|cb| { - if let Callback::Write { proposed_cb, .. } = cb { - proposed_cb.take() - } else { - None - } - }) + .filter_map(|cb| cb.take_proposed_cb()) .collect(); let proposed_cb: Option = if proposed_cbs.is_empty() { None @@ -505,13 +516,7 @@ where }; let committed_cbs: Vec<_> = cbs .iter_mut() - .filter_map(|cb| { - if let Callback::Write { committed_cb, .. } = cb { - committed_cb.take() - } else { - None - } - }) + .filter_map(|cb| cb.take_committed_cb()) .collect(); let committed_cb: Option = if committed_cbs.is_empty() { None @@ -523,19 +528,14 @@ where })) }; - let times: SmallVec<[TiInstant; 4]> = cbs + let trackers: SmallVec<[TimeTracker; 4]> = cbs .iter_mut() - .filter_map(|cb| { - if let Callback::Write { request_times, .. } = cb { - Some(request_times[0]) - } else { - None - } - }) + .flat_map(|cb| cb.write_trackers()) + .cloned() .collect(); - let mut cb = Callback::write_ext( - Box::new(move |resp| { + let cb = Callback::Write { + cb: Box::new(move |resp| { for cb in cbs { let mut cmd_resp = RaftCmdResponse::default(); cmd_resp.set_header(resp.response.get_header().clone()); @@ -544,12 +544,8 @@ where }), proposed_cb, committed_cb, - ); - - if let Callback::Write { request_times, .. } = &mut cb { - *request_times = times; - } - + trackers, + }; return Some((req, cb)); } None @@ -568,7 +564,7 @@ where self.stopped } - /// Set a mailbox to Fsm, which should be used to send message to itself. + /// Set a mailbox to FSM, which should be used to send message to itself. 
#[inline] fn set_mailbox(&mut self, mailbox: Cow<'_, BasicMailbox>) where @@ -577,7 +573,7 @@ where self.mailbox = Some(mailbox.into_owned()); } - /// Take the mailbox from Fsm. Implementation should ensure there will be + /// Take the mailbox from FSM. Implementation should ensure there will be /// no reference to mailbox after calling this method. #[inline] fn take_mailbox(&mut self) -> Option> @@ -610,9 +606,14 @@ where } pub fn handle_msgs(&mut self, msgs: &mut Vec>) { + let timer = TiInstant::now_coarse(); + let count = msgs.len(); for m in msgs.drain(..) { match m { PeerMsg::RaftMessage(msg) => { + if !self.ctx.coprocessor_host.on_raft_message(&msg.msg) { + continue; + } if let Err(e) = self.on_raft_message(msg) { error!(%e; "handle raft message err"; @@ -622,11 +623,18 @@ where } } PeerMsg::RaftCommand(cmd) => { + let propose_time = cmd.send_time.saturating_elapsed(); self.ctx .raft_metrics - .propose - .request_wait_time - .observe(duration_to_sec(cmd.send_time.saturating_elapsed()) as f64); + .propose_wait_time + .observe(propose_time.as_secs_f64()); + cmd.callback.read_tracker().map(|tracker| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { + t.metrics.read_index_propose_wait_nanos = + propose_time.as_nanos() as u64; + }) + }); + if let Some(Err(e)) = cmd.extra_opts.deadline.map(|deadline| deadline.check()) { cmd.callback.invoke_with_response(new_error(e.into())); continue; @@ -639,19 +647,19 @@ where // so that normal writes can be rejected when proposing if the // store's disk is full. 
&& ((self.ctx.self_disk_usage == DiskUsage::Normal - && !self.fsm.peer.disk_full_peers.majority()) - || cmd.extra_opts.disk_full_opt == DiskFullOpt::NotAllowedOnFull) + && !self.fsm.peer.disk_full_peers.majority()) + || cmd.extra_opts.disk_full_opt == DiskFullOpt::NotAllowedOnFull) { self.fsm.batch_req_builder.add(cmd, req_size); if self.fsm.batch_req_builder.should_finish(&self.ctx.cfg) { - self.propose_batch_raft_command(true); + self.propose_pending_batch_raft_command(); } } else { self.propose_raft_command( cmd.request, cmd.callback, cmd.extra_opts.disk_full_opt, - ) + ); } } PeerMsg::Tick(tick) => self.on_tick(tick), @@ -675,7 +683,7 @@ where PeerMsg::Destroy(peer_id) => { if self.fsm.peer.peer_id() == peer_id { match self.fsm.peer.maybe_destroy(self.ctx) { - None => self.ctx.raft_metrics.message_dropped.applying_snap += 1, + None => self.ctx.raft_metrics.message_dropped.applying_snap.inc(), Some(job) => { self.handle_destroy_peer(job); } @@ -684,53 +692,63 @@ where } } } + self.on_loop_finished(); + self.ctx.raft_metrics.peer_msg_len.observe(count as f64); + self.ctx + .raft_metrics + .event_time + .peer_msg + .observe(timer.saturating_elapsed_secs()); + } + + #[inline] + fn on_loop_finished(&mut self) { + let ready_concurrency = self.ctx.cfg.cmd_batch_concurrent_ready_max_count; + let should_propose = self.ctx.sync_write_worker.is_some() + || ready_concurrency == 0 + || self.fsm.peer.unpersisted_ready_len() < ready_concurrency; + let force_delay_fp = || { + fail_point!( + "force_delay_propose_batch_raft_command", + self.ctx.sync_write_worker.is_none(), + |_| true + ); + false + }; // Propose batch request which may be still waiting for more raft-command - if self.ctx.sync_write_worker.is_some() { - self.propose_batch_raft_command(true); - } else { - self.propose_batch_raft_command(false); - self.check_batch_cmd_and_proposed_cb(); + if should_propose && !force_delay_fp() { + self.propose_pending_batch_raft_command(); + } else if 
self.fsm.batch_req_builder.has_proposed_cb + && self.fsm.batch_req_builder.propose_checked.is_none() + && let Some(cmd) = self.fsm.batch_req_builder.request.take() + { + // We are delaying these requests to next loop. Try to fulfill their + // proposed callback early. + self.fsm.batch_req_builder.propose_checked = Some(false); + if let Ok(None) = self.pre_propose_raft_command(&cmd) { + if self.fsm.peer.will_likely_propose(&cmd) { + self.fsm.batch_req_builder.propose_checked = Some(true); + for cb in &mut self.fsm.batch_req_builder.callbacks { + cb.invoke_proposed(); + } + } + } + self.fsm.batch_req_builder.request = Some(cmd); } } - fn propose_batch_raft_command(&mut self, force: bool) { + /// Flushes all pending raft commands for immediate execution. + #[inline] + fn propose_pending_batch_raft_command(&mut self) { if self.fsm.batch_req_builder.request.is_none() { return; } - if !force - && self.ctx.cfg.cmd_batch_concurrent_ready_max_count != 0 - && self.fsm.peer.unpersisted_ready_len() - >= self.ctx.cfg.cmd_batch_concurrent_ready_max_count - { - return; - } - fail_point!("propose_batch_raft_command", !force, |_| {}); let (request, callback) = self .fsm .batch_req_builder .build(&mut self.ctx.raft_metrics) .unwrap(); - self.propose_raft_command_internal(request, callback, DiskFullOpt::NotAllowedOnFull) - } - - fn check_batch_cmd_and_proposed_cb(&mut self) { - if self.fsm.batch_req_builder.request.is_none() - || !self.fsm.batch_req_builder.has_proposed_cb - || self.fsm.batch_req_builder.propose_checked.is_some() - { - return; - } - let cmd = self.fsm.batch_req_builder.request.take().unwrap(); - self.fsm.batch_req_builder.propose_checked = Some(false); - if let Ok(None) = self.pre_propose_raft_command(&cmd) { - if self.fsm.peer.will_likely_propose(&cmd) { - self.fsm.batch_req_builder.propose_checked = Some(true); - for cb in &mut self.fsm.batch_req_builder.callbacks { - cb.invoke_proposed(); - } - } - } - self.fsm.batch_req_builder.request = Some(cmd); + 
self.propose_raft_command_internal(request, callback, DiskFullOpt::NotAllowedOnFull); } fn on_update_replication_mode(&mut self) { @@ -877,9 +895,9 @@ where return; } let target_index = if self.fsm.peer.force_leader.is_some() { - // For regions that lose quorum (or regions have force leader), whatever has been - // proposed will be committed. Based on that fact, we simply use "last index" here to - // avoid implementing another "wait commit" process. + // For regions that lose quorum (or regions have force leader), whatever has + // been proposed will be committed. Based on that fact, we simply use "last + // index" here to avoid implementing another "wait commit" process. self.fsm.peer.raft_group.raft.raft_log.last_index() } else { self.fsm.peer.raft_group.raft.raft_log.committed @@ -891,7 +909,63 @@ where }); self.fsm .peer - .unsafe_recovery_maybe_finish_wait_apply(/*force=*/ self.fsm.stopped); + .unsafe_recovery_maybe_finish_wait_apply(/* force= */ self.fsm.stopped); + } + + // func be invoked firstly after assigned leader by BR, wait all leader apply to + // last log index func be invoked secondly wait follower apply to last + // index, however the second call is broadcast, it may improve in future + fn on_snapshot_recovery_wait_apply(&mut self, syncer: SnapshotRecoveryWaitApplySyncer) { + if self.fsm.peer.snapshot_recovery_state.is_some() { + warn!( + "can't wait apply, another recovery in progress"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + syncer.abort(); + return; + } + + let target_index = self.fsm.peer.raft_group.raft.raft_log.last_index(); + + // during the snapshot recovery, broadcast waitapply, some peer may stale + if !self.fsm.peer.is_leader() { + info!( + "snapshot follower recovery started"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id(), + "target_index" => target_index, + "applied_index" => self.fsm.peer.raft_group.raft.raft_log.applied, + "pending_remove" => self.fsm.peer.pending_remove, 
+ "voter" => self.fsm.peer.raft_group.raft.vote, + ); + + // do some sanity check, for follower, leader already apply to last log, + // case#1 if it is learner during backup and never vote before, vote is 0 + // case#2 if peer is suppose to remove + if self.fsm.peer.raft_group.raft.vote == 0 || self.fsm.peer.pending_remove { + info!( + "this peer is never vote before or pending remove, it should be skip to wait apply" + ); + return; + } + } else { + info!( + "snapshot leader wait apply started"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id(), + "target_index" => target_index, + "applied_index" => self.fsm.peer.raft_group.raft.raft_log.applied, + ); + } + + self.fsm.peer.snapshot_recovery_state = Some(SnapshotRecoveryState::WaitLogApplyToLast { + target_index, + syncer, + }); + self.fsm + .peer + .snapshot_recovery_maybe_finish_wait_apply(self.fsm.stopped); } fn on_unsafe_recovery_fill_out_report(&mut self, syncer: UnsafeRecoveryFillOutReportSyncer) { @@ -924,6 +998,39 @@ where syncer.report_for_self(self_report); } + fn on_check_pending_admin(&mut self, ch: UnboundedSender) { + if !self.fsm.peer.is_leader() { + // no need to check non-leader pending conf change. + // in snapshot recovery after we stopped all conf changes from PD. + // if the follower slow than leader and has the pending conf change. + // that's means + // 1. if the follower didn't finished the conf change + // => it cannot be chosen to be leader during recovery. + // 2. if the follower has been chosen to be leader + // => it already apply the pending conf change already. 
+ return; + } + debug!( + "check pending conf for leader"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer_id(), + ); + let region = self.fsm.peer.region(); + let mut resp = CheckAdminResponse::default(); + resp.set_region(region.clone()); + let pending_admin = self.fsm.peer.raft_group.raft.has_pending_conf() + || self.fsm.peer.is_merging() + || self.fsm.peer.is_splitting(); + resp.set_has_pending_admin(pending_admin); + if let Err(err) = ch.unbounded_send(resp) { + warn!("failed to send check admin response"; + "err" => ?err, + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer_id(), + ); + } + } + fn on_casual_msg(&mut self, msg: CasualMessage) { match msg { CasualMessage::SplitRegion { @@ -960,11 +1067,20 @@ where } CasualMessage::HalfSplitRegion { region_epoch, + start_key, + end_key, policy, source, cb, } => { - self.on_schedule_half_split_region(&region_epoch, policy, source, cb); + self.on_schedule_half_split_region( + &region_epoch, + start_key, + end_key, + policy, + source, + cb, + ); } CasualMessage::GcSnap { snaps } => { self.on_gc_snap(snaps); @@ -980,8 +1096,9 @@ where if is_learner(&self.fsm.peer.peer) { // FIXME: should use `bcast_check_stale_peer_message` instead.
- // Sending a new enum type msg to a old tikv may cause panic during rolling update - // we should change the protobuf behavior and check if properly handled in all place + // Sending a new enum type msg to a old tikv may cause panic during rolling + // update we should change the protobuf behavior and check if properly handled + // in all place self.fsm.peer.bcast_wake_up_message(self.ctx); } } @@ -993,7 +1110,26 @@ where CasualMessage::ForceCompactRaftLogs => { self.on_raft_gc_log_tick(true); } - CasualMessage::AccessPeer(cb) => cb(self.fsm as &mut dyn AbstractPeer), + CasualMessage::AccessPeer(cb) => { + let peer = &self.fsm.peer; + let store = peer.get_store(); + let mut local_state = RegionLocalState::default(); + local_state.set_region(store.region().clone()); + if let Some(s) = &peer.pending_merge_state { + local_state.set_merge_state(s.clone()); + } + if store.is_applying_snapshot() { + local_state.set_state(PeerState::Applying); + } + cb(RegionMeta::new( + &local_state, + store.apply_state(), + self.fsm.hibernate_state.group_state(), + peer.raft_group.status(), + peer.raft_group.raft.raft_log.last_index(), + peer.raft_group.raft.raft_log.persisted, + )) + } CasualMessage::QueryRegionLeaderResp { region, leader } => { // the leader already updated if self.fsm.peer.raft_group.raft.leader_id != raft::INVALID_ID @@ -1001,8 +1137,7 @@ where || util::is_epoch_stale( region.get_region_epoch(), self.fsm.peer.region().get_region_epoch(), - ) - { + ) { // Stale message return; } @@ -1062,6 +1197,10 @@ where PeerTick::CheckLeaderLease => self.on_check_leader_lease_tick(), PeerTick::ReactivateMemoryLock => self.on_reactivate_memory_lock_tick(), PeerTick::ReportBuckets => self.on_report_region_buckets_tick(), + PeerTick::CheckLongUncommitted => self.on_check_long_uncommitted_tick(), + PeerTick::CheckPeersAvailability => self.on_check_peers_availability(), + PeerTick::RequestSnapshot => self.on_request_snapshot_tick(), + PeerTick::RequestVoterReplicatedIndex => 
self.on_request_voter_replicated_index(), } } @@ -1072,6 +1211,9 @@ where self.register_split_region_check_tick(); self.register_check_peer_stale_state_tick(); self.on_check_merge(); + if self.fsm.peer.wait_data { + self.on_request_snapshot_tick(); + } // Apply committed entries more quickly. // Or if it's a leader. This implicitly means it's a singleton // because it becomes leader in `Peer::new` when it's a @@ -1084,6 +1226,9 @@ where self.fsm.has_ready = true; } self.fsm.peer.maybe_gen_approximate_buckets(self.ctx); + if self.fsm.peer.is_witness() { + self.register_pull_voter_replicated_index_tick(); + } } fn on_gc_snap(&mut self, snaps: Vec<(SnapKey, bool)>) { @@ -1199,12 +1344,19 @@ where ) { fail_point!("raft_on_capture_change"); let region_id = self.region_id(); - let msg = + let mut msg = new_read_index_request(region_id, region_epoch.clone(), self.fsm.peer.peer.clone()); + // Allow to capture change even is in flashback state. + // TODO: add a test case for this kind of situation. 
+ if self.region().is_in_flashback { + let mut flags = WriteBatchFlags::from_bits_check(msg.get_header().get_flags()); + flags.insert(WriteBatchFlags::FLASHBACK); + msg.mut_header().set_flags(flags.bits()); + } let apply_router = self.ctx.apply_router.clone(); self.propose_raft_command_internal( msg, - Callback::Read(Box::new(move |resp| { + Callback::read(Box::new(move |resp| { // Return the error if resp.response.get_header().has_error() { cb.invoke_read(resp); @@ -1240,8 +1392,7 @@ where } } SignificantMsg::StoreUnreachable { store_id } => { - if let Some(peer_id) = util::find_peer(self.region(), store_id).map(|p| p.get_id()) - { + if let Some(peer_id) = find_peer(self.region(), store_id).map(|p| p.get_id()) { if self.fsm.peer.is_leader() { self.fsm.peer.raft_group.report_unreachable(peer_id); } else if peer_id == self.fsm.peer.leader_id() { @@ -1260,7 +1411,7 @@ where SignificantMsg::CatchUpLogs(catch_up_logs) => { self.on_catch_up_logs_for_merge(catch_up_logs); } - SignificantMsg::StoreResolved { group_id, .. 
} => { + SignificantMsg::StoreResolved { group_id, store_id } => { let state = self.ctx.global_replication_state.lock().unwrap(); if state.status().get_mode() != ReplicationMode::DrAutoSync { return; @@ -1269,11 +1420,13 @@ where return; } drop(state); - self.fsm - .peer - .raft_group - .raft - .assign_commit_groups(&[(self.fsm.peer_id(), group_id)]); + if let Some(peer_id) = find_peer(self.region(), store_id).map(|p| p.get_id()) { + self.fsm + .peer + .raft_group + .raft + .assign_commit_groups(&[(peer_id, group_id)]); + } } SignificantMsg::CaptureChange { cmd, @@ -1286,8 +1439,8 @@ where SignificantMsg::RaftLogGcFlushed => { self.on_raft_log_gc_flushed(); } - SignificantMsg::RaftlogFetched { context, res } => { - self.on_raft_log_fetched(context, res); + SignificantMsg::RaftlogFetched(fetched_logs) => { + self.on_raft_log_fetched(fetched_logs.context, fetched_logs.logs); } SignificantMsg::EnterForceLeaderState { syncer, @@ -1309,6 +1462,11 @@ where SignificantMsg::UnsafeRecoveryFillOutReport(syncer) => { self.on_unsafe_recovery_fill_out_report(syncer) } + // for snapshot recovery (safe recovery) + SignificantMsg::SnapshotRecoveryWaitApply(syncer) => { + self.on_snapshot_recovery_wait_apply(syncer) + } + SignificantMsg::CheckPendingAdmin(ch) => self.on_check_pending_admin(ch), } } @@ -1349,8 +1507,9 @@ where ); return; } - // wait two rounds of election timeout to trigger check quorum to step down the leader - // note: check quorum is triggered every `election_timeout` instead of `randomized_election_timeout` + // wait two rounds of election timeout to trigger check quorum to step down the + // leader note: check quorum is triggered every `election_timeout` instead of + // `randomized_election_timeout` Some( self.fsm.peer.raft_group.raft.election_timeout() * 2 - self.fsm.peer.raft_group.raft.election_elapsed, @@ -1430,7 +1589,8 @@ where // When PD issues force leader on two different peer, it may cause // two force leader in same term. 
self.fsm.peer.raft_group.raft.pre_vote = false; - // trigger vote request to all voters, will check the vote result in `check_force_leader` + // trigger vote request to all voters, will check the vote result in + // `check_force_leader` if let Err(e) = self.fsm.peer.raft_group.campaign() { warn!( "Unsafe recovery, campaign failed"; @@ -1549,7 +1709,8 @@ where self.fsm.peer.raft_group.raft.set_check_quorum(true); self.fsm.peer.raft_group.raft.pre_vote = true; if self.fsm.peer.raft_group.raft.promotable() { - // Do not campaign directly here, otherwise on_role_changed() won't called for follower state + // Do not campaign directly here, otherwise on_role_changed() won't called for + // follower state let _ = self.ctx.router.send( self.region_id(), PeerMsg::CasualMessage(CasualMessage::Campaign), @@ -1671,8 +1832,17 @@ where fn on_raft_log_fetched(&mut self, context: GetEntriesContext, res: Box) { let low = res.low; - // if the peer is not the leader anymore or being destroyed, ignore the result. - if !self.fsm.peer.is_leader() || self.fsm.peer.pending_remove { + // If the peer is not the leader anymore and it's not in entry cache warmup + // state, or it is being destroyed, ignore the result. + if !self.fsm.peer.is_leader() + && self + .fsm + .peer + .get_store() + .entry_cache_warmup_state() + .is_none() + || self.fsm.peer.pending_remove + { self.fsm.peer.mut_store().clean_async_fetch_res(low); return; } @@ -1680,6 +1850,19 @@ where if self.fsm.peer.term() != res.term { // term has changed, the result may be not correct. 
self.fsm.peer.mut_store().clean_async_fetch_res(low); + } else if self + .fsm + .peer + .get_store() + .entry_cache_warmup_state() + .is_some() + { + if self.fsm.peer.mut_store().maybe_warm_up_entry_cache(*res) { + self.fsm.peer.ack_transfer_leader_msg(false); + self.fsm.has_ready = true; + } + self.fsm.peer.mut_store().clean_async_fetch_res(low); + return; } else { self.fsm .peer @@ -1779,6 +1962,7 @@ where self.register_raft_gc_log_tick(); self.register_check_leader_lease_tick(); self.register_report_region_buckets_tick(); + self.register_check_peers_availability_tick(); } if let Some(ForceLeaderState::ForceLeader { .. }) = self.fsm.peer.force_leader { @@ -1815,7 +1999,7 @@ where self.register_entry_cache_evict_tick(); } self.ctx.ready_count += 1; - self.ctx.raft_metrics.ready.has_ready_region += 1; + self.ctx.raft_metrics.ready.has_ready_region.inc(); if self.fsm.peer.leader_unreachable { self.fsm.reset_hibernate_state(GroupState::Chaos); @@ -1907,7 +2091,7 @@ where ); if self.fsm.peer.pending_remove { - self.fsm.peer.mut_store().flush_cache_metrics(); + self.fsm.peer.mut_store().flush_entry_cache_metrics(); return; } // When having pending snapshot, if election timeout is met, it can't pass @@ -1930,17 +2114,18 @@ where if self.fsm.hibernate_state.group_state() == GroupState::Idle { // missing_ticks should be less than election timeout ticks otherwise // follower may tick more than an election timeout in chaos state. - // Before stopping tick, `missing_tick` should be `raft_election_timeout_ticks` - 2 - // - `raft_heartbeat_ticks` (default 10 - 2 - 2 = 6) - // and the follower's `election_elapsed` in raft-rs is 1. - // After the group state becomes Chaos, the next tick will call `raft_group.tick` - // `missing_tick` + 1 times(default 7). + // Before stopping tick, `missing_tick` should be `raft_election_timeout_ticks` + // - 2 - `raft_heartbeat_ticks` (default 10 - 2 - 2 = 6) and the follower's + // `election_elapsed` in raft-rs is 1. 
+ // After the group state becomes Chaos, the next tick will call + // `raft_group.tick` `missing_tick` + 1 times(default 7). // Then the follower's `election_elapsed` will be 1 + `missing_tick` + 1 // (default 1 + 6 + 1 = 8) which is less than the min election timeout. - // The reason is that we don't want let all followers become (pre)candidate if one - // follower may receive a request, then becomes (pre)candidate and sends (pre)vote msg - // to others. As long as the leader can wake up and broadcast heartbeats in one `raft_heartbeat_ticks` - // time(default 2s), no more followers will wake up and sends vote msg again. + // The reason is that we don't want let all followers become (pre)candidate if + // one follower may receive a request, then becomes (pre)candidate and sends + // (pre)vote msg to others. As long as the leader can wake up and broadcast + // heartbeats in one `raft_heartbeat_ticks` time(default 2s), no more followers + // will wake up and sends vote msg again. if self.fsm.missing_ticks + 1 /* for the next tick after the peer isn't Idle */ + self.fsm.peer.raft_group.raft.election_elapsed + self.ctx.cfg.raft_heartbeat_ticks @@ -1974,9 +2159,10 @@ where } self.fsm.peer.post_raft_group_tick(); - self.fsm.peer.mut_store().flush_cache_metrics(); + self.fsm.peer.mut_store().flush_entry_cache_metrics(); - // Keep ticking if there are still pending read requests or this node is within hibernate timeout. + // Keep ticking if there are still pending read requests or this node is within + // hibernate timeout. if res.is_none() /* hibernate_region is false */ || !self.fsm.peer.check_after_tick(self.fsm.hibernate_state.group_state(), res.unwrap()) || (self.fsm.peer.is_leader() && !self.all_agree_to_hibernate()) @@ -1987,12 +2173,6 @@ where return; } - // Keep ticking if there are disk full peers for the Region. 
- if !self.fsm.peer.disk_full_peers.is_empty() { - self.register_raft_base_tick(); - return; - } - debug!("stop ticking"; "res" => ?res, "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id(), @@ -2012,7 +2192,7 @@ where Some(UnsafeRecoveryState::WaitApply { .. }) => self .fsm .peer - .unsafe_recovery_maybe_finish_wait_apply(/*force=*/ false), + .unsafe_recovery_maybe_finish_wait_apply(/* force= */ false), Some(UnsafeRecoveryState::DemoteFailedVoters { syncer, failed_voters, @@ -2084,6 +2264,9 @@ where "peer_id" => self.fsm.peer_id(), "res" => ?res, ); + if self.fsm.peer.wait_data { + return; + } self.on_ready_result(&mut res.exec_res, &res.metrics); if self.fsm.stopped { return; @@ -2091,17 +2274,12 @@ where let applied_index = res.apply_state.applied_index; let buckets = self.fsm.peer.region_buckets.as_mut(); if let (Some(delta), Some(buckets)) = (res.bucket_stat, buckets) { - merge_bucket_stats( - &buckets.meta.keys, - &mut buckets.stats, - &delta.meta.keys, - &delta.stats, - ); + buckets.merge(&delta); } self.fsm.has_ready |= self.fsm.peer.post_apply( self.ctx, res.apply_state, - res.applied_index_term, + res.applied_term, &res.metrics, ); // After applying, several metrics are updated, report it to pd to @@ -2141,6 +2319,12 @@ where if self.fsm.peer.unsafe_recovery_state.is_some() { self.check_unsafe_recovery_state(); } + + if self.fsm.peer.snapshot_recovery_state.is_some() { + self.fsm + .peer + .snapshot_recovery_maybe_finish_wait_apply(false); + } } fn retry_pending_prepare_merge(&mut self, applied_index: u64) { @@ -2180,7 +2364,7 @@ where "peer_id" => self.fsm.peer_id(), "err" => ?e, ); - self.ctx.raft_metrics.propose.unsafe_read_index += 1; + self.ctx.raft_metrics.propose.unsafe_read_index.inc(); return; } @@ -2194,7 +2378,7 @@ where cmd.mut_header().set_read_quorum(true); self.propose_raft_command_internal( cmd, - Callback::Read(Box::new(|_| ())), + Callback::read(Box::new(|_| ())), DiskFullOpt::AllowedOnAlmostFull, ); } @@ -2283,7 +2467,18 @@ 
where "skip {:?} because of disk full", msg_type; "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id() ); - self.ctx.raft_metrics.message_dropped.disk_full += 1; + self.ctx.raft_metrics.message_dropped.disk_full.inc(); + return Ok(()); + } + + if MessageType::MsgAppend == msg_type + && self.fsm.peer.wait_data + && self.fsm.peer.should_reject_msgappend + { + debug!("skip {:?} because of non-witness waiting data", msg_type; + "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id() + ); + self.ctx.raft_metrics.message_dropped.non_witness.inc(); return Ok(()); } @@ -2319,12 +2514,14 @@ where // TODO: spin off the I/O code (delete_snapshot) let regions_to_destroy = match self.check_snapshot(&msg)? { Either::Left(key) => { - // If the snapshot file is not used again, then it's OK to - // delete them here. If the snapshot file will be reused when - // receiving, then it will fail to pass the check again, so - // missing snapshot files should not be noticed. - let s = self.ctx.snap_mgr.get_snapshot_for_applying(&key)?; - self.ctx.snap_mgr.delete_snapshot(&key, s.as_ref(), false); + if let Some(key) = key { + // If the snapshot file is not used again, then it's OK to + // delete them here. If the snapshot file will be reused when + // receiving, then it will fail to pass the check again, so + // missing snapshot files should not be noticed. + let s = self.ctx.snap_mgr.get_snapshot_for_applying(&key)?; + self.ctx.snap_mgr.delete_snapshot(&key, s.as_ref(), false); + } return Ok(()); } Either::Right(v) => v, @@ -2353,7 +2550,7 @@ where && (msg.get_message().get_from() == raft::INVALID_ID || msg.get_message().get_from() == self.fsm.peer_id()) { - self.ctx.raft_metrics.message_dropped.stale_msg += 1; + self.ctx.raft_metrics.message_dropped.stale_msg.inc(); return Ok(()); } self.fsm.peer.step(self.ctx, msg.take_message()) @@ -2369,10 +2566,11 @@ where .retain(|r| self.fsm.region_id() != r.get_id()); } else { // This snapshot may be accepted by raft-rs. 
- // If it's rejected by raft-rs, the snapshot region in `pending_snapshot_regions` - // will be removed together with the latest snapshot region after applying that snapshot. - // But if `regions_to_destroy` is not empty, the pending snapshot must be this msg's snapshot - // because this kind of snapshot is exclusive. + // If it's rejected by raft-rs, the snapshot region in + // `pending_snapshot_regions` will be removed together with the latest snapshot + // region after applying that snapshot. + // But if `regions_to_destroy` is not empty, the pending snapshot must be this + // msg's snapshot because this kind of snapshot is exclusive. self.destroy_regions_for_snapshot(regions_to_destroy); } } @@ -2420,6 +2618,7 @@ where fn on_hibernate_request(&mut self, from: &metapb::Peer) { if !self.ctx.cfg.hibernate_regions || self.fsm.peer.has_uncommitted_log() + || self.fsm.peer.wait_data || from.get_id() != self.fsm.peer.leader_id() { // Ignore the message means rejecting implicitly. @@ -2449,11 +2648,133 @@ where self.fsm.hibernate_state.count_vote(from.get_id()); } + fn on_availability_response(&mut self, from: &metapb::Peer, msg: &ExtraMessage) { + if !self.fsm.peer.is_leader() { + return; + } + if !msg.wait_data { + let original_remains_nr = self.fsm.peer.wait_data_peers.len(); + self.fsm + .peer + .wait_data_peers + .retain(|id| *id != from.get_id()); + debug!( + "receive peer ready info"; + "peer_id" => self.fsm.peer.peer.get_id(), + ); + if original_remains_nr != self.fsm.peer.wait_data_peers.len() { + info!( + "notify pd with change peer region"; + "region_id" => self.fsm.region_id(), + "peer_id" => from.get_id(), + "region" => ?self.fsm.peer.region(), + ); + self.fsm.peer.heartbeat_pd(self.ctx); + } + return; + } + self.register_check_peers_availability_tick(); + } + + fn on_availability_request(&mut self, from: &metapb::Peer) { + if self.fsm.peer.is_leader() { + return; + } + let mut resp = ExtraMessage::default(); + 
resp.set_type(ExtraMessageType::MsgAvailabilityResponse); + resp.wait_data = self.fsm.peer.wait_data; + self.fsm + .peer + .send_extra_message(resp, &mut self.ctx.trans, from); + debug!( + "peer responses availability info to leader"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer.get_id(), + "leader_id" => from.id, + ); + } + + fn on_voter_replicated_index_request(&mut self, from: &metapb::Peer) { + if !self.fsm.peer.is_leader() { + return; + } + let mut voter_replicated_idx = self.fsm.peer.get_store().last_index(); + for (peer_id, p) in self.fsm.peer.raft_group.raft.prs().iter() { + let peer = find_peer_by_id(self.region(), *peer_id).unwrap(); + if voter_replicated_idx > p.matched && !is_learner(peer) { + voter_replicated_idx = p.matched; + } + } + let first_index = self.fsm.peer.get_store().first_index(); + if voter_replicated_idx > first_index { + voter_replicated_idx = first_index; + } + let mut resp = ExtraMessage::default(); + resp.set_type(ExtraMessageType::MsgVoterReplicatedIndexResponse); + resp.index = voter_replicated_idx; + self.fsm + .peer + .send_extra_message(resp, &mut self.ctx.trans, from); + debug!( + "leader responses voter_replicated_index to witness"; + "region_id" => self.region().get_id(), + "witness_id" => from.id, + "leader_id" => self.fsm.peer.peer.get_id(), + "voter_replicated_index" => voter_replicated_idx, + ); + } + + fn on_voter_replicated_index_response(&mut self, msg: &ExtraMessage) { + if self.fsm.peer.is_leader() || !self.fsm.peer.is_witness() { + return; + } + let voter_replicated_index = msg.index; + if let Ok(voter_replicated_term) = self.fsm.peer.get_store().term(voter_replicated_index) { + self.ctx.apply_router.schedule_task( + self.region_id(), + ApplyTask::CheckCompact { + region_id: self.region_id(), + voter_replicated_index, + voter_replicated_term, + }, + ) + } + } + + // In v1, gc_peer_request is handled to be compatible with v2. 
+ // Note: it needs to be consistent with Peer::on_gc_peer_request in v2. + fn on_gc_peer_request(&mut self, msg: RaftMessage) { + let extra_msg = msg.get_extra_msg(); + + if !extra_msg.has_check_gc_peer() || extra_msg.get_index() == 0 { + // Corrupted message. + return; + } + if self.fsm.peer.get_store().applied_index() < extra_msg.get_index() { + // Merge not finish. + return; + } + + forward_destroy_to_source_peer(&msg, |m| { + let _ = self.ctx.router.send_raft_message(m); + }); + } + fn on_extra_message(&mut self, mut msg: RaftMessage) { match msg.get_extra_msg().get_type() { ExtraMessageType::MsgRegionWakeUp | ExtraMessageType::MsgCheckStalePeer => { if self.fsm.hibernate_state.group_state() == GroupState::Idle { - self.reset_raft_tick(GroupState::Ordered); + if msg.get_extra_msg().forcely_awaken { + // Forcely awaken this region by manually setting this GroupState + // into Chaos to trigger a new voting in this RaftGroup. + self.reset_raft_tick(if !self.fsm.peer.is_leader() { + GroupState::Chaos + } else { + GroupState::Ordered + }); + } else { + self.reset_raft_tick(GroupState::Ordered); + } } if msg.get_extra_msg().get_type() == ExtraMessageType::MsgRegionWakeUp && self.fsm.peer.is_leader() @@ -2479,6 +2800,30 @@ where ExtraMessageType::MsgHibernateResponse => { self.on_hibernate_response(msg.get_from_peer()); } + ExtraMessageType::MsgRejectRaftLogCausedByMemoryUsage => { + unimplemented!() + } + ExtraMessageType::MsgAvailabilityRequest => { + self.on_availability_request(msg.get_from_peer()); + } + ExtraMessageType::MsgAvailabilityResponse => { + self.on_availability_response(msg.get_from_peer(), msg.get_extra_msg()); + } + ExtraMessageType::MsgVoterReplicatedIndexRequest => { + self.on_voter_replicated_index_request(msg.get_from_peer()); + } + ExtraMessageType::MsgVoterReplicatedIndexResponse => { + self.on_voter_replicated_index_response(msg.get_extra_msg()); + } + ExtraMessageType::MsgGcPeerRequest => { + // To make learner (e.g. 
tiflash engine) compatible with raftstore v2, + // it needs to respond with GcPeerResponse. + if self.ctx.cfg.enable_v2_compatible_learner { + self.on_gc_peer_request(msg); + } + } + // It's a v2-only message; ignoring it does no harm. + ExtraMessageType::MsgGcPeerResponse | ExtraMessageType::MsgFlushMemtable => (), } } @@ -2505,7 +2850,11 @@ where "to_store_id" => to.get_store_id(), "my_store_id" => self.store_id(), ); - self.ctx.raft_metrics.message_dropped.mismatch_store_id += 1; + self.ctx + .raft_metrics + .message_dropped + .mismatch_store_id + .inc(); return false; } @@ -2514,7 +2863,11 @@ where "missing epoch in raft message, ignore it"; "region_id" => region_id, ); - self.ctx.raft_metrics.message_dropped.mismatch_region_epoch += 1; + self.ctx + .raft_metrics + .message_dropped + .mismatch_region_epoch + .inc(); return false; } @@ -2529,26 +2882,29 @@ where let from_store_id = msg.get_from_peer().get_store_id(); // Let's consider following cases with three nodes [1, 2, 3] and 1 is leader: - // a. 1 removes 2, 2 may still send MsgAppendResponse to 1. + // - 1 removes 2, 2 may still send MsgAppendResponse to 1. // We should ignore this stale message and let 2 remove itself after // applying the ConfChange log. - // b. 2 is isolated, 1 removes 2. When 2 rejoins the cluster, 2 will - // send stale MsgRequestVote to 1 and 3, at this time, we should tell 2 to gc itself. - // c. 2 is isolated but can communicate with 3. 1 removes 3. + // - 2 is isolated, 1 removes 2. When 2 rejoins the cluster, 2 will + // send stale MsgRequestVote to 1 and 3, at this time, we should tell 2 to gc + // itself. + // - 2 is isolated but can communicate with 3. 1 removes 3. // 2 will send stale MsgRequestVote to 3, 3 should ignore this message. - // d. 2 is isolated but can communicate with 3. 1 removes 2, then adds 4, remove 3. + // - 2 is isolated but can communicate with 3. 1 removes 2, then adds 4, remove + // 3. // 2 will send stale MsgRequestVote to 3, 3 should tell 2 to gc itself.
- // e. 2 is isolated. 1 adds 4, 5, 6, removes 3, 1. Now assume 4 is leader. + // - 2 is isolated. 1 adds 4, 5, 6, removes 3, 1. Now assume 4 is leader. // After 2 rejoins the cluster, 2 may send stale MsgRequestVote to 1 and 3, // 1 and 3 will ignore this message. Later 4 will send messages to 2 and 2 will // rejoin the raft group again. - // f. 2 is isolated. 1 adds 4, 5, 6, removes 3, 1. Now assume 4 is leader, and 4 removes 2. + // - 2 is isolated. 1 adds 4, 5, 6, removes 3, 1. Now assume 4 is leader, and 4 + // removes 2. // unlike case e, 2 will be stale forever. - // TODO: for case f, if 2 is stale for a long time, 2 will communicate with pd and pd will - // tell 2 is stale, so 2 can remove itself. + // TODO: for case f, if 2 is stale for a long time, 2 will communicate with pd + // and pd will tell 2 is stale, so 2 can remove itself. let self_epoch = self.fsm.peer.region().get_region_epoch(); if util::is_epoch_stale(from_epoch, self_epoch) - && util::find_peer(self.fsm.peer.region(), from_store_id).is_none() + && find_peer(self.fsm.peer.region(), from_store_id).is_none() { self.ctx.handle_stale_msg(msg, self_epoch.clone(), None); return true; @@ -2563,7 +2919,7 @@ where "peer_id" => self.fsm.peer_id(), "target_peer" => ?target, ); - self.ctx.raft_metrics.message_dropped.stale_msg += 1; + self.ctx.raft_metrics.message_dropped.stale_msg.inc(); true } cmp::Ordering::Greater => { @@ -2591,7 +2947,7 @@ where } } } - None => self.ctx.raft_metrics.message_dropped.applying_snap += 1, + None => self.ctx.raft_metrics.message_dropped.applying_snap.inc(), } true } @@ -2613,11 +2969,11 @@ where "merge_target" => ?merge_target, ); - // When receiving message that has a merge target, it indicates that the source peer on this - // store is stale, the peers on other stores are already merged. The epoch in merge target - // is the state of target peer at the time when source peer is merged. 
So here we record the - // merge target epoch version to let the target peer on this store to decide whether to - // destroy the source peer. + // When receiving message that has a merge target, it indicates that the source + // peer on this store is stale, the peers on other stores are already merged. + // The epoch in merge target is the state of target peer at the time when source + // peer is merged. So here we record the merge target epoch version to let the + // target peer on this store to decide whether to destroy the source peer. let mut meta = self.ctx.store_meta.lock().unwrap(); meta.targets_map.insert(self.region_id(), target_region_id); let v = meta @@ -2628,8 +2984,8 @@ where no_range_merge_target.clear_start_key(); no_range_merge_target.clear_end_key(); if let Some(pre_merge_target) = v.insert(self.region_id(), no_range_merge_target) { - // Merge target epoch records the version of target region when source region is merged. - // So it must be same no matter when receiving merge target. + // Merge target epoch records the version of target region when source region is + // merged. So it must be same no matter when receiving merge target. if pre_merge_target.get_region_epoch().get_version() != merge_target.get_region_epoch().get_version() { @@ -2642,7 +2998,8 @@ where } if let Some(r) = meta.regions.get(&target_region_id) { - // In the case that the source peer's range isn't overlapped with target's anymore: + // In the case that the source peer's range isn't overlapped with target's + // anymore: // | region 2 | region 3 | region 1 | // || merge 3 into 2 // \/ @@ -2656,8 +3013,8 @@ where // so the new target peer can't find the source peer. // e.g. new region 2 is overlapped with region 1 // - // If that, source peer still need to decide whether to destroy itself. When the target - // peer has already moved on, source peer can destroy itself. + // If that, source peer still need to decide whether to destroy itself. 
When the + // target peer has already moved on, source peer can destroy itself. if util::is_epoch_stale(merge_target.get_region_epoch(), r.get_region_epoch()) { return Ok(true); } @@ -2666,8 +3023,8 @@ where drop(meta); // All of the target peers must exist before merging which is guaranteed by PD. - // Now the target peer is not in region map, so if everything is ok, the merge target - // region should be staler than the local target region + // Now the target peer is not in region map, so if everything is ok, the merge + // target region should be staler than the local target region if self.is_merge_target_region_stale(merge_target)? { Ok(true) } else { @@ -2695,7 +3052,7 @@ where "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), ); - self.ctx.raft_metrics.message_dropped.stale_msg += 1; + self.ctx.raft_metrics.message_dropped.stale_msg.inc(); return; } // TODO: ask pd to guarantee we are stale now. @@ -2707,26 +3064,66 @@ where ); // Destroy peer in next round in order to apply more committed entries if any. - // It depends on the implementation that msgs which are handled in this round have already fetched. + // It depends on the implementation that msgs which are handled in this round + // have already fetched. let _ = self .ctx .router .force_send(self.fsm.region_id(), PeerMsg::Destroy(self.fsm.peer_id())); } - // Returns `Vec<(u64, bool)>` indicated (source_region_id, merge_to_this_peer) if the `msg` - // doesn't contain a snapshot or this snapshot doesn't conflict with any other snapshots or regions. - // Otherwise a `SnapKey` is returned. - fn check_snapshot(&mut self, msg: &RaftMessage) -> Result>> { + // Returns `Vec<(u64, bool)>` indicated (source_region_id, merge_to_this_peer) + // if the `msg` doesn't contain a snapshot or this snapshot doesn't conflict + // with any other snapshots or regions. Otherwise a `SnapKey` is returned. 
+ fn check_snapshot( + &mut self, + msg: &RaftMessage, + ) -> Result<Either<Option<SnapKey>, Vec<(u64, bool)>>> { if !msg.get_message().has_snapshot() { return Ok(Either::Right(vec![])); } let region_id = msg.get_region_id(); let snap = msg.get_message().get_snapshot(); - let key = SnapKey::from_region_snap(region_id, snap); let mut snap_data = RaftSnapshotData::default(); snap_data.merge_from_bytes(snap.get_data())?; + + let key = if !snap_data.get_meta().get_for_witness() { + // Check if snapshot file exists. + // No need to get the snapshot for a witness, as a witness's empty snapshot + // bypasses the snapshot manager. + let key = SnapKey::from_region_snap(region_id, snap); + self.ctx.snap_mgr.get_snapshot_for_applying(&key)?; + Some(key) + } else { + None + }; + + // If the index of snapshot is not newer than peer's apply index, it + // is possibly because there is a witness -> non-witness switch, and the peer + // requests a snapshot from the leader but the leader hasn't applied the switch yet. + // In that case, the snapshot is a witness snapshot whereas non-witness snapshot + // is expected.
+ if snap.get_metadata().get_index() < self.fsm.peer.get_store().applied_index() + && snap_data.get_meta().get_for_witness() != self.fsm.peer.is_witness() + { + error!( + "mismatch witness snapshot"; + "region_id" => region_id, + "peer_id" => self.fsm.peer_id(), + "for_witness" => snap_data.get_meta().get_for_witness(), + "is_witness" => self.fsm.peer.is_witness(), + "index" => snap.get_metadata().get_index(), + "applied_index" => self.fsm.peer.get_store().applied_index(), + ); + self.ctx + .raft_metrics + .message_dropped + .mismatch_witness_snapshot + .inc(); + return Ok(Either::Left(key)); + } + let snap_region = snap_data.take_region(); let peer_id = msg.get_to_peer().get_id(); let snap_enc_start_key = enc_start_key(&snap_region); @@ -2764,7 +3161,7 @@ where "snap" => ?snap_region, "to_peer" => ?msg.get_to_peer(), ); - self.ctx.raft_metrics.message_dropped.region_no_peer += 1; + self.ctx.raft_metrics.message_dropped.region_no_peer.inc(); return Ok(Either::Left(key)); } @@ -2776,7 +3173,7 @@ where "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), ); - self.ctx.raft_metrics.message_dropped.stale_msg += 1; + self.ctx.raft_metrics.message_dropped.stale_msg.inc(); return Ok(Either::Left(key)); } else { panic!( @@ -2810,20 +3207,21 @@ where "region" => ?region, "snap" => ?snap_region, ); - self.ctx.raft_metrics.message_dropped.region_overlap += 1; + self.ctx.raft_metrics.message_dropped.region_overlap.inc(); return Ok(Either::Left(key)); } } let mut is_overlapped = false; let mut regions_to_destroy = vec![]; - // In some extreme cases, it may cause source peer destroyed improperly so that a later - // CommitMerge may panic because source is already destroyed, so just drop the message: - // 1. A new snapshot is received whereas a snapshot is still in applying, and the snapshot - // under applying is generated before merge and the new snapshot is generated after merge. 
- // After the applying snapshot is finished, the log may able to catch up and so a - // CommitMerge will be applied. - // 2. There is a CommitMerge pending in apply thread. + // In some extreme cases, it may cause source peer destroyed improperly so that + // a later CommitMerge may panic because source is already destroyed, so just + // drop the message: + // - A new snapshot is received whereas a snapshot is still in applying, and the + // snapshot under applying is generated before merge and the new snapshot is + // generated after merge. After the applying snapshot is finished, the log may + // able to catch up and so a CommitMerge will be applied. + // - There is a CommitMerge pending in apply thread. let ready = !self.fsm.peer.is_handling_snapshot() && !self.fsm.peer.has_pending_snapshot() // It must be ensured that all logs have been applied. @@ -2852,9 +3250,9 @@ where snap_region.get_region_epoch().to_owned(), ); if ready && can_destroy { - // The snapshot that we decide to whether destroy peer based on must can be applied. - // So here not to destroy peer immediately, or the snapshot maybe dropped in later - // check but the peer is already destroyed. + // The snapshot that we decide to whether destroy peer based on must can be + // applied. So here not to destroy peer immediately, or the snapshot maybe + // dropped in later check but the peer is already destroyed. regions_to_destroy.push((exist_region.get_id(), merge_to_this_peer)); continue; } @@ -2872,25 +3270,24 @@ where } } if is_overlapped { - self.ctx.raft_metrics.message_dropped.region_overlap += 1; + self.ctx.raft_metrics.message_dropped.region_overlap.inc(); return Ok(Either::Left(key)); } - // Check if snapshot file exists. - self.ctx.snap_mgr.get_snapshot_for_applying(&key)?; - // WARNING: The checking code must be above this line. // Now all checking passed. 
if self.fsm.peer.local_first_replicate && !self.fsm.peer.is_initialized() { - // If the peer is not initialized and passes the snapshot range check, `is_splitting` flag must - // be false. - // 1. If `is_splitting` is set to true, then the uninitialized peer is created before split is applied - // and the peer id is the same as split one. So there should be no initialized peer before. - // 2. If the peer is also created by splitting, then the snapshot range is not overlapped with - // parent peer. It means leader has applied merge and split at least one time. However, - // the prerequisite of merge includes the initialization of all target peers and source peers, - // which is conflict with 1. + // If the peer is not initialized and passes the snapshot range check, + // `is_splitting` flag must be false. + // - If `is_splitting` is set to true, then the uninitialized peer is created + // before split is applied and the peer id is the same as split one. So there + // should be no initialized peer before. + // - If the peer is also created by splitting, then the snapshot range is not + // overlapped with parent peer. It means leader has applied merge and split at + // least one time. However, the prerequisite of merge includes the + // initialization of all target peers and source peers, which conflicts with + // the first case. let pending_create_peers = self.ctx.pending_create_peers.lock().unwrap(); let status = pending_create_peers.get(&region_id).cloned(); if status != Some((self.fsm.peer_id(), false)) { @@ -2939,8 +3336,8 @@ where } else { MergeResultKind::Stale }; - // Use `unwrap` is ok because the StoreMeta lock is held and these source peers still - // exist in regions and region_ranges map. + // Use `unwrap` is ok because the StoreMeta lock is held and these source peers + // still exist in regions and region_ranges map. // It depends on the implementation of `destroy_peer`.
self.ctx .router @@ -2984,9 +3381,7 @@ where ); } None => { - if self.fsm.batch_req_builder.request.is_some() { - self.propose_batch_raft_command(true); - } + self.propose_pending_batch_raft_command(); if self.propose_locks_before_transfer_leader(msg) { // If some pessimistic locks are just proposed, we propose another // TransferLeader command instead of transferring leader immediately. @@ -3017,18 +3412,22 @@ where } } } - } else { - self.fsm - .peer - .execute_transfer_leader(self.ctx, msg.get_from(), peer_disk_usage, false); + } else if !self + .fsm + .peer + .maybe_reject_transfer_leader_msg(self.ctx, msg, peer_disk_usage) + && self.fsm.peer.pre_ack_transfer_leader_msg(self.ctx, msg) + { + self.fsm.peer.ack_transfer_leader_msg(false); } } - // Returns whether we should propose another TransferLeader command. This is for: - // 1. Considering the amount of pessimistic locks can be big, it can reduce - // unavailable time caused by waiting for the transferree catching up logs. - // 2. Make transferring leader strictly after write commands that executes - // before proposing the locks, preventing unexpected lock loss. + // Returns whether we should propose another TransferLeader command. This is + // for: + // - Considering the amount of pessimistic locks can be big, it can reduce + // unavailable time caused by waiting for the transferee catching up logs. + // - Make transferring leader strictly after write commands that executes before + // proposing the locks, preventing unexpected lock loss. fn propose_locks_before_transfer_leader(&mut self, msg: &eraftpb::Message) -> bool { // 1. Disable in-memory pessimistic locks. @@ -3041,20 +3440,22 @@ where // in the TransferringLeader status, we can safely initiate transferring leader // now. // If it's not in TransferringLeader status now, it is probably because several - // ticks have passed after proposing the locks in the last time and we reactivate - // the memory locks. Then, we should propose the locks again. 
+ // ticks have passed after proposing the locks in the last time and we + // reactivate the memory locks. Then, we should propose the locks again. if msg.get_context() == TRANSFER_LEADER_COMMAND_REPLY_CTX && pessimistic_locks.status == LocksStatus::TransferringLeader { return false; } - // If it is not writable, it's probably because it's a retried TransferLeader and the locks - // have been proposed. But we still need to return true to propose another TransferLeader - // command. Otherwise, some write requests that have marked some locks as deleted will fail - // because raft rejects more proposals. - // It is OK to return true here if it's in other states like MergingRegion or NotLeader. - // In those cases, the locks will fail to propose and nothing will happen. + // If it is not writable, it's probably because it's a retried TransferLeader + // and the locks have been proposed. But we still need to return true to + // propose another TransferLeader command. Otherwise, some write requests that + // have marked some locks as deleted will fail because raft rejects more + // proposals. + // It is OK to return true here if it's in other states like MergingRegion or + // NotLeader. In those cases, the locks will fail to propose and nothing will + // happen. if !pessimistic_locks.is_writable() { return true; } @@ -3066,11 +3467,12 @@ where if pessimistic_locks.is_empty() { return false; } - // FIXME: Raft command has size limit. Either limit the total size of pessimistic locks - // in a region, or split commands here. + // FIXME: Raft command has size limit. Either limit the total size of + // pessimistic locks in a region, or split commands here. let mut cmd = RaftCmdRequest::default(); { - // Downgrade to a read guard, do not block readers in the scheduler as far as possible. + // Downgrade to a read guard, do not block readers in the scheduler as far as + // possible. 
let pessimistic_locks = RwLockWriteGuard::downgrade(pessimistic_locks); fail_point!("invalidate_locks_before_transfer_leader"); for (key, (lock, deleted)) in &*pessimistic_locks { @@ -3088,9 +3490,10 @@ where } } if cmd.get_requests().is_empty() { - // If the map is not empty but all locks are deleted, it is possible that a write - // command has just marked locks deleted but not proposed yet. It might cause - // that command to fail if we skip proposing the extra TransferLeader command here. + // If the map is not empty but all locks are deleted, it is possible that a + // write command has just marked locks deleted but not proposed yet. + // It might cause that command to fail if we skip proposing the + // extra TransferLeader command here. return true; } cmd.mut_header().set_region_id(self.fsm.region_id()); @@ -3116,7 +3519,8 @@ where } } - /// Check if destroy can be executed immediately. If it can't, the reason is returned. + /// Check if destroy can be executed immediately. If it can't, the reason is + /// returned. fn maybe_delay_destroy(&mut self) -> Option<DelayReason> { if self.fsm.peer.has_unpersisted_ready() { assert!(self.ctx.sync_write_worker.is_none()); @@ -3126,6 +3530,14 @@ where return Some(DelayReason::UnPersistedReady); } + let is_initialized = self.fsm.peer.is_initialized(); + if !is_initialized { + // If the peer is uninitialized, then it can't receive any logs from leader. So + // no need to gc. If there was a peer with same region id on the store, and it + // had logs written, then it must be initialized, hence its log should be gc + // either before it's destroyed or during node restarts.
+ self.fsm.logs_gc_flushed = true; + } if !self.fsm.logs_gc_flushed { let start_index = self.fsm.peer.last_compacted_idx; let mut end_index = start_index; @@ -3242,7 +3654,13 @@ where if self.fsm.peer.unsafe_recovery_state.is_some() { self.fsm .peer - .unsafe_recovery_maybe_finish_wait_apply(/*force=*/ true); + .unsafe_recovery_maybe_finish_wait_apply(/* force= */ true); + } + + if self.fsm.peer.snapshot_recovery_state.is_some() { + self.fsm + .peer + .snapshot_recovery_maybe_finish_wait_apply(/* force= */ true); } let mut meta = self.ctx.store_meta.lock().unwrap(); @@ -3286,7 +3704,7 @@ where let is_initialized = self.fsm.peer.is_initialized(); if let Err(e) = self.fsm.peer.destroy( &self.ctx.engines, - &mut self.ctx.perf_context, + &mut self.ctx.raft_perf_context, merged_by_target, &self.ctx.pending_create_peers, ) { @@ -3298,7 +3716,8 @@ where } // Some places use `force_send().unwrap()` if the StoreMeta lock is held. - // So in here, it's necessary to held the StoreMeta lock when closing the router. + // So in here, it's necessary to held the StoreMeta lock when closing the + // router. self.ctx.router.close(region_id); self.fsm.stop(); @@ -3341,8 +3760,10 @@ where .get_mut(&target) .unwrap() .remove(&region_id); - // When the target doesn't exist(add peer but the store is isolated), source peer decide to destroy by itself. - // Without target, the `pending_merge_targets` for target won't be removed, so here source peer help target to clear. + // When the target doesn't exist(add peer but the store is isolated), source + // peer decide to destroy by itself. Without target, the + // `pending_merge_targets` for target won't be removed, so here source peer help + // target to clear. if meta.regions.get(&target).is_none() && meta.pending_merge_targets.get(&target).unwrap().is_empty() { @@ -3391,12 +3812,18 @@ where _ => unreachable!(), } } else { - // Please take a look at test case test_redundant_conf_change_by_snapshot.
+ // Please take a look at test case + // test_redundant_conf_change_by_snapshot. } self.update_region(cp.region); fail_point!("change_peer_after_update_region"); + fail_point!( + "change_peer_after_update_region_store_3", + self.store_id() == 3, + |_| panic!("should not use return") + ); let now = Instant::now(); let (mut remove_self, mut need_ping) = (false, false); @@ -3441,6 +3868,7 @@ where .peer .peers_start_pending_time .retain(|&(p, _)| p != peer_id); + self.fsm.peer.wait_data_peers.retain(|id| *id != peer_id); } self.fsm.peer.remove_peer_from_cache(peer_id); // We only care remove itself now. @@ -3506,9 +3934,10 @@ where // Most of these functions are only called when the peer is a leader. // (it's pretty reasonable because progress is used to track others' status) // The only exception is `Raft::restore` at the time of writing, which is ok - // because the raft msgs(including snapshot) don't be handled when `pending_remove` - // is true(it will be set in `destroy_peer`). - // TODO: totally avoid calling these raft-rs functions when `pending_remove` is true. + // because the raft msgs(including snapshot) don't be handled when + // `pending_remove` is true(it will be set in `destroy_peer`). + // TODO: totally avoid calling these raft-rs functions when `pending_remove` is + // true. self.fsm .peer .raft_group @@ -3532,6 +3961,14 @@ where } fn on_ready_compact_log(&mut self, first_index: u64, state: RaftTruncatedState) { + // Since this peer may be warming up the entry cache, log compaction should be + // temporarily skipped. Otherwise, the warmup task may fail. + if let Some(state) = self.fsm.peer.mut_store().entry_cache_warmup_state_mut() { + if !state.check_stale(MAX_WARMED_UP_CACHE_KEEP_TIME) { + return; + } + } + let total_cnt = self.fsm.peer.last_applying_idx - first_index; // the size of current CompactLog command can be ignored. 
let remain_cnt = self.fsm.peer.last_applying_idx - state.get_index() - 1; @@ -3540,7 +3977,10 @@ where let compact_to = state.get_index() + 1; self.fsm.peer.schedule_raftlog_gc(self.ctx, compact_to); self.fsm.peer.last_compacted_idx = compact_to; - self.fsm.peer.mut_store().compact_to(compact_to); + self.fsm.peer.mut_store().on_compact_raftlog(compact_to); + if self.fsm.peer.is_witness() { + self.fsm.peer.last_compacted_time = Instant::now(); + } } fn on_ready_split_region( @@ -3553,9 +3993,10 @@ where let region_id = derived.get_id(); - // Group in-memory pessimistic locks in the original region into new regions. The locks of - // new regions will be put into the corresponding new regions later. And the locks belonging - // to the old region will stay in the original map. + // Group in-memory pessimistic locks in the original region into new regions. + // The locks of new regions will be put into the corresponding new regions + // later. And the locks belonging to the old region will stay in the original + // map. let region_locks = { let mut pessimistic_locks = self.fsm.peer.txn_ext.pessimistic_locks.write(); info!("moving {} locks to new regions", pessimistic_locks.len(); "region_id" => region_id); @@ -3681,6 +4122,7 @@ where self.ctx.raftlog_fetch_scheduler.clone(), self.ctx.engines.clone(), &new_region, + false, ) { Ok((sender, new_peer)) => (sender, new_peer), Err(e) => { @@ -3690,7 +4132,7 @@ where } }; let mut replication_state = self.ctx.global_replication_state.lock().unwrap(); - new_peer.peer.init_replication_mode(&mut *replication_state); + new_peer.peer.init_replication_mode(&mut replication_state); drop(replication_state); let meta_peer = new_peer.peer.peer.clone(); @@ -3703,8 +4145,13 @@ where // New peer derive write flow from parent region, // this will be used by balance write flow. 
new_peer.peer.peer_stat = self.fsm.peer.peer_stat.clone(); - new_peer.peer.last_compacted_idx = - new_peer.apply_state().get_truncated_state().get_index() + 1; + new_peer.peer.last_compacted_idx = new_peer + .peer + .get_store() + .apply_state() + .get_truncated_state() + .get_index() + + 1; let campaigned = new_peer.peer.maybe_campaign(is_leader); new_peer.has_ready |= campaigned; @@ -3712,8 +4159,8 @@ where new_peer.peer.approximate_size = estimated_size; new_peer.peer.approximate_keys = estimated_keys; *new_peer.peer.txn_ext.pessimistic_locks.write() = locks; - // The new peer is likely to become leader, send a heartbeat immediately to reduce - // client query miss. + // The new peer is likely to become leader, send a heartbeat immediately to + // reduce client query miss. new_peer.peer.heartbeat_pd(self.ctx); } @@ -3765,11 +4212,12 @@ where /// Check if merge target region is staler than the local one in kv engine. /// It should be called when target region is not in region map in memory. - /// If everything is ok, the answer should always be true because PD should ensure all target peers exist. - /// So if not, error log will be printed and return false. + /// If everything is ok, the answer should always be true because PD should + /// ensure all target peers exist. So if not, error log will be printed + /// and return false. fn is_merge_target_region_stale(&self, target_region: &metapb::Region) -> Result<bool> { let target_region_id = target_region.get_id(); - let target_peer_id = util::find_peer(target_region, self.ctx.store_id()) + let target_peer_id = find_peer(target_region, self.ctx.store_id()) .unwrap() .get_id(); @@ -3785,10 +4233,11 @@ where return Ok(true); } // The local target region epoch is staler than target region's. - // In the case where the peer is destroyed by receiving gc msg rather than applying conf change, - // the epoch may staler but it's legal, so check peer id to assure that.
+ // In the case where the peer is destroyed by receiving gc msg rather than + // applying conf change, the epoch may staler but it's legal, so check peer id + // to assure that. if let Some(local_target_peer_id) = - util::find_peer(target_state.get_region(), self.ctx.store_id()).map(|r| r.get_id()) + find_peer(target_state.get_region(), self.ctx.store_id()).map(|r| r.get_id()) { match local_target_peer_id.cmp(&target_peer_id) { cmp::Ordering::Equal => { @@ -3810,8 +4259,8 @@ where // There is a new peer and it's destroyed without being initialised. return Ok(true); } - // The local target peer id is greater than the one in target region, but its epoch - // is staler than target_region's. That is contradictory. + // The local target peer id is greater than the one in target region, but + // its epoch is staler than target_region's. That is contradictory. panic!("{} local target peer id {} is greater than the one in target region {}, but its epoch is staler, local target region {:?}, target region {:?}", self.fsm.peer.tag, local_target_peer_id, target_peer_id, target_state.get_region(), target_region); } @@ -3827,7 +4276,8 @@ where } } } else { - // Can't get local target peer id probably because this target peer is removed by applying conf change + // Can't get local target peer id probably because this target peer is removed + // by applying conf change error!( "the local target peer does not exist in target region state"; "target_region" => ?target_region, @@ -3945,7 +4395,7 @@ where } }; - let sibling_peer = util::find_peer(sibling_region, self.store_id()).unwrap(); + let sibling_peer = find_peer(sibling_region, self.store_id()).unwrap(); let mut request = new_admin_request(sibling_region.get_id(), sibling_peer.clone()); request .mut_header() @@ -3960,9 +4410,10 @@ where request.set_admin_request(admin); (request, target_id) }; - // Please note that, here assumes that the unit of network isolation is store rather than - // peer. 
So a quorum stores of source region should also be the quorum stores of target - // region. Otherwise we need to enable proposal forwarding. + // Please note that, here assumes that the unit of network isolation is store + // rather than peer. So a quorum stores of source region should also be the + // quorum stores of target region. Otherwise we need to enable proposal + // forwarding. self.ctx .router .force_send( @@ -4184,13 +4635,14 @@ where d.mark_pending_remove(); } - // After the region commit merged, the region's key range is extended and the region's `safe_ts` - // should reset to `min(source_safe_ts, target_safe_ts)` + // After the region commit merged, the region's key range is extended and the + // region's `safe_ts` should reset to `min(source_safe_ts, target_safe_ts)` let source_read_progress = meta.region_read_progress.remove(&source.get_id()).unwrap(); - self.fsm - .peer - .read_progress - .merge_safe_ts(source_read_progress.safe_ts(), merge_index); + self.fsm.peer.read_progress.merge_safe_ts( + source_read_progress.safe_ts(), + merge_index, + &self.ctx.coprocessor_host, + ); // If a follower merges into a leader, a more recent read may happen // on the leader of the follower. So max ts should be updated after @@ -4202,8 +4654,8 @@ where drop(meta); // make approximate size and keys updated in time. - // the reason why follower need to update is that there is a issue that after merge - // and then transfer leader, the new leader may have stale size and keys. + // the reason why follower need to update is that there is a issue that after + // merge and then transfer leader, the new leader may have stale size and keys. self.fsm.peer.size_diff_hint = self.ctx.cfg.region_split_check_diff().0; self.fsm.peer.reset_region_buckets(); if self.fsm.peer.is_leader() { @@ -4235,9 +4687,9 @@ where /// Handle rollbacking Merge result. 
/// - /// If commit is 0, it means that Merge is rollbacked by a snapshot; otherwise - /// it's rollbacked by a proposal, and its value should be equal to the commit - /// index of previous PrepareMerge. + /// If commit is 0, it means that Merge is rollbacked by a snapshot; + /// otherwise it's rollbacked by a proposal, and its value should be + /// equal to the commit index of previous PrepareMerge. fn on_ready_rollback_merge(&mut self, commit: u64, region: Option) { let pending_commit = self .fsm @@ -4308,9 +4760,9 @@ where ); } // Because of the checking before proposing `PrepareMerge`, which is - // no `CompactLog` proposal between the smallest commit index and the latest index. - // If the merge succeed, all source peers are impossible in apply snapshot state - // and must be initialized. + // no `CompactLog` proposal between the smallest commit index and the latest + // index. If the merge succeed, all source peers are impossible in apply + // snapshot state and must be initialized. { let meta = self.ctx.store_meta.lock().unwrap(); if meta.atomic_snap_regions.contains_key(&self.region_id()) { @@ -4380,9 +4832,9 @@ where "merge_state" => ?self.fsm.peer.pending_merge_state, ); // Because of the checking before proposing `PrepareMerge`, which is - // no `CompactLog` proposal between the smallest commit index and the latest index. - // If the merge succeed, all source peers are impossible in apply snapshot state - // and must be initialized. + // no `CompactLog` proposal between the smallest commit index and the latest + // index. If the merge succeed, all source peers are impossible in apply + // snapshot state and must be initialized. // So `maybe_destroy` must succeed here. 
let job = self.fsm.peer.maybe_destroy(self.ctx).unwrap(); self.handle_destroy_peer(job); @@ -4422,8 +4874,9 @@ where ); // Remove this region's snapshot region from the `pending_snapshot_regions` - // The `pending_snapshot_regions` is only used to occupy the key range, so if this - // peer is added to `region_ranges`, it can be remove from `pending_snapshot_regions` + // The `pending_snapshot_regions` is only used to occupy the key range, so if + // this peer is added to `region_ranges`, it can be remove from + // `pending_snapshot_regions` meta.pending_snapshot_regions .retain(|r| self.fsm.region_id() != r.get_id()); @@ -4466,7 +4919,8 @@ where } } else if self.fsm.peer.local_first_replicate { // This peer is uninitialized previously. - // More accurately, the `RegionLocalState` has been persisted so the data can be removed from `pending_create_peers`. + // More accurately, the `RegionLocalState` has been persisted so the data can be + // removed from `pending_create_peers`. let mut pending_create_peers = self.ctx.pending_create_peers.lock().unwrap(); assert_eq!( pending_create_peers.remove(&self.fsm.region_id()), @@ -4518,8 +4972,13 @@ where while let Some(result) = exec_results.pop_front() { match result { ExecResult::ChangePeer(cp) => self.on_ready_change_peer(cp), - ExecResult::CompactLog { first_index, state } => { - self.on_ready_compact_log(first_index, state) + ExecResult::CompactLog { + state, + first_index, + has_pending, + } => { + self.fsm.peer.has_pending_compact_cmd = has_pending; + self.on_ready_compact_log(first_index, state); } ExecResult::SplitRegion { derived, @@ -4553,17 +5012,28 @@ where } ExecResult::IngestSst { ssts } => self.on_ingest_sst_result(ssts), ExecResult::TransferLeader { term } => self.on_transfer_leader(term), + ExecResult::SetFlashbackState { region } => self.on_set_flashback_state(region), + ExecResult::BatchSwitchWitness(switches) => { + self.on_ready_batch_switch_witness(switches) + } + 
ExecResult::HasPendingCompactCmd(has_pending) => { + self.fsm.peer.has_pending_compact_cmd = has_pending; + if has_pending { + self.register_pull_voter_replicated_index_tick(); + } + } } } - // Update metrics only when all exec_results are finished in case the metrics is counted multiple times - // when waiting for commit merge + // Update metrics only when all exec_results are finished in case the metrics is + // counted multiple times when waiting for commit merge self.ctx.store_stat.lock_cf_bytes_written += metrics.lock_cf_written_bytes; self.ctx.store_stat.engine_total_bytes_written += metrics.written_bytes; self.ctx.store_stat.engine_total_keys_written += metrics.written_keys; } - /// Check if a request is valid if it has valid prepare_merge/commit_merge proposal. + /// Check if a request is valid if it has valid prepare_merge/commit_merge + /// proposal. fn check_merge_proposal(&self, msg: &mut RaftCmdRequest) -> Result<()> { if !msg.get_admin_request().has_prepare_merge() && !msg.get_admin_request().has_commit_merge() @@ -4609,7 +5079,7 @@ where region )); } - if !util::region_on_same_stores(target_region, region) { + if !region_on_same_stores(target_region, region) { return Err(box_err!( "peers doesn't match {:?} != {:?}, reject merge", region.get_peers(), @@ -4625,7 +5095,7 @@ where region )); } - if !util::region_on_same_stores(source_region, region) { + if !region_on_same_stores(source_region, region) { return Err(box_err!( "peers not matched: {:?} {:?}", source_region, @@ -4642,8 +5112,12 @@ where msg: &RaftCmdRequest, ) -> Result> { // Check store_id, make sure that the msg is dispatched to the right place. 
- if let Err(e) = util::check_store_id(msg, self.store_id()) { - self.ctx.raft_metrics.invalid_proposal.mismatch_store_id += 1; + if let Err(e) = util::check_store_id(msg.get_header(), self.store_id()) { + self.ctx + .raft_metrics + .invalid_proposal + .mismatch_store_id + .inc(); return Err(e); } if msg.has_status_request() { @@ -4653,12 +5127,23 @@ where } // Check whether the store has the right peer to handle the request. - let region_id = self.region_id(); let leader_id = self.fsm.peer.leader_id(); let request = msg.get_requests(); + // peer_id must be the same as peer's. + if let Err(e) = util::check_peer_id(msg.get_header(), self.fsm.peer.peer_id()) { + self.ctx + .raft_metrics + .invalid_proposal + .mismatch_peer_id + .inc(); + return Err(e); + } + if self.fsm.peer.force_leader.is_some() { - // in force leader state, forbid requests to make the recovery progress less error-prone + self.ctx.raft_metrics.invalid_proposal.force_leader.inc(); + // in force leader state, forbid requests to make the recovery progress less + // error-prone if !(msg.has_admin_request() && (msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeer || msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeerV2)) @@ -4670,13 +5155,13 @@ where // ReadIndex can be processed on the replicas. 
let is_read_index_request = request.len() == 1 && request[0].get_cmd_type() == CmdType::ReadIndex; - let mut read_only = true; - for r in msg.get_requests() { - match r.get_cmd_type() { - CmdType::Get | CmdType::Snap | CmdType::ReadIndex => (), - _ => read_only = false, - } - } + let read_only = msg.get_requests().iter().all(|r| { + matches!( + r.get_cmd_type(), + CmdType::Get | CmdType::Snap | CmdType::ReadIndex, + ) + }); + let region_id = self.region_id(); let allow_replica_read = read_only && msg.get_header().get_replica_read(); let flags = WriteBatchFlags::from_bits_check(msg.get_header().get_flags()); let allow_stale_read = read_only && flags.contains(WriteBatchFlags::STALE_READ); @@ -4685,29 +5170,63 @@ where && !allow_replica_read && !allow_stale_read { - self.ctx.raft_metrics.invalid_proposal.not_leader += 1; + self.ctx.raft_metrics.invalid_proposal.not_leader.inc(); let leader = self.fsm.peer.get_peer_from_cache(leader_id); self.fsm.reset_hibernate_state(GroupState::Chaos); self.register_raft_base_tick(); return Err(Error::NotLeader(region_id, leader)); } - // peer_id must be the same as peer's. 
- if let Err(e) = util::check_peer_id(msg, self.fsm.peer.peer_id()) { - self.ctx.raft_metrics.invalid_proposal.mismatch_peer_id += 1; - return Err(e); + + // Forbid requests when it's a witness unless it's transfer leader + if self.fsm.peer.is_witness() + && !(msg.has_admin_request() + && msg.get_admin_request().get_cmd_type() == AdminCmdType::TransferLeader) + { + self.ctx.raft_metrics.invalid_proposal.witness.inc(); + return Err(Error::IsWitness(self.region_id())); } + + fail_point!("ignore_forbid_leader_to_be_witness", |_| Ok(None)); + + // Forbid requests to switch it into a witness when it's a leader + if self.fsm.peer.is_leader() + && msg.has_admin_request() + && msg.get_admin_request().get_cmd_type() == AdminCmdType::BatchSwitchWitness + && msg + .get_admin_request() + .get_switch_witnesses() + .get_switch_witnesses() + .iter() + .any(|s| s.get_peer_id() == self.fsm.peer.peer.get_id() && s.get_is_witness()) + { + self.ctx.raft_metrics.invalid_proposal.witness.inc(); + return Err(Error::IsWitness(self.region_id())); + } + + // Forbid requests when it becomes to non-witness but not finish applying + // snapshot. + if self.fsm.peer.wait_data { + self.ctx.raft_metrics.invalid_proposal.non_witness.inc(); + return Err(Error::IsWitness(self.region_id())); + } + // check whether the peer is initialized. if !self.fsm.peer.is_initialized() { self.ctx .raft_metrics .invalid_proposal - .region_not_initialized += 1; + .region_not_initialized + .inc(); return Err(Error::RegionNotInitialized(region_id)); } - // If the peer is applying snapshot, it may drop some sending messages, that could - // make clients wait for response until timeout. + // If the peer is applying snapshot, it may drop some sending messages, that + // could make clients wait for response until timeout. 
if self.fsm.peer.is_handling_snapshot() { - self.ctx.raft_metrics.invalid_proposal.is_applying_snapshot += 1; + self.ctx + .raft_metrics + .invalid_proposal + .is_applying_snapshot + .inc(); // TODO: replace to a more suitable error. return Err(Error::Other(box_err!( "{} peer is applying snapshot", @@ -4715,45 +5234,77 @@ where ))); } // Check whether the term is stale. - if let Err(e) = util::check_term(msg, self.fsm.peer.term()) { - self.ctx.raft_metrics.invalid_proposal.stale_command += 1; + if let Err(e) = util::check_term(msg.get_header(), self.fsm.peer.term()) { + self.ctx.raft_metrics.invalid_proposal.stale_command.inc(); return Err(e); } - match util::check_region_epoch(msg, self.fsm.peer.region(), true) { + match util::check_req_region_epoch(msg, self.fsm.peer.region(), true) { Err(Error::EpochNotMatch(m, mut new_regions)) => { - // Attach the region which might be split from the current region. But it doesn't - // matter if the region is not split from the current region. If the region meta - // received by the TiKV driver is newer than the meta cached in the driver, the meta is - // updated. + // Attach the region which might be split from the current region. But it + // doesn't matter if the region is not split from the current region. If the + // region meta received by the TiKV driver is newer than the meta cached in the + // driver, the meta is updated. let requested_version = msg.get_header().get_region_epoch().version; self.collect_sibling_region(requested_version, &mut new_regions); - self.ctx.raft_metrics.invalid_proposal.epoch_not_match += 1; - Err(Error::EpochNotMatch(m, new_regions)) + self.ctx.raft_metrics.invalid_proposal.epoch_not_match.inc(); + return Err(Error::EpochNotMatch(m, new_regions)); + } + Err(e) => return Err(e), + _ => {} + }; + // Check whether the region is in the flashback state and the request could be + // proposed. 
Skip the not prepared error because the + // `self.region().is_in_flashback` may not be the latest right after applying + // the `PrepareFlashback` admin command, we will let it pass here and check in + // the apply phase and because a read-only request doesn't need to be applied, + // so it will be allowed during the flashback progress, for example, a snapshot + // request. + if let Err(e) = util::check_flashback_state( + self.region().is_in_flashback, + self.region().flashback_start_ts, + msg, + region_id, + true, + ) { + match e { + Error::FlashbackInProgress(..) => self + .ctx + .raft_metrics + .invalid_proposal + .flashback_in_progress + .inc(), + Error::FlashbackNotPrepared(_) => self + .ctx + .raft_metrics + .invalid_proposal + .flashback_not_prepared + .inc(), + _ => unreachable!(), } - Err(e) => Err(e), - Ok(()) => Ok(None), + return Err(e); } + + Ok(None) } - /// Propose batched raft commands(if any) first, then propose the given raft command. + /// Proposes pending batch raft commands (if any), then proposes the + /// provided raft command. + #[inline] fn propose_raft_command( &mut self, msg: RaftCmdRequest, cb: Callback, diskfullopt: DiskFullOpt, ) { - if let Some((request, callback)) = - self.fsm.batch_req_builder.build(&mut self.ctx.raft_metrics) - { - self.propose_raft_command_internal(request, callback, DiskFullOpt::NotAllowedOnFull); - } - + // Propose pending commands before processing new one. + self.propose_pending_batch_raft_command(); self.propose_raft_command_internal(msg, cb, diskfullopt); } /// Propose the raft command directly. - /// Note that this function introduces a reorder between this command and batched commands. + /// Note that this function introduces a reorder between this command and + /// batched commands. 
fn propose_raft_command_internal( &mut self, mut msg: RaftCmdRequest, @@ -4766,14 +5317,11 @@ where } if self.ctx.raft_metrics.waterfall_metrics { - if let Some(request_times) = cb.get_request_times() { - let now = TiInstant::now(); - for t in request_times { - self.ctx - .raft_metrics - .wf_batch_wait - .observe(duration_to_sec(now.saturating_duration_since(*t))); - } + let now = Instant::now(); + for tracker in cb.write_trackers() { + tracker.observe(now, &self.ctx.raft_metrics.wf_batch_wait, |t| { + &mut t.metrics.wf_batch_wait_nanos + }); } } @@ -4810,9 +5358,9 @@ where } // Note: - // The peer that is being checked is a leader. It might step down to be a follower later. It - // doesn't matter whether the peer is a leader or not. If it's not a leader, the proposing - // command log entry can't be committed. + // The peer that is being checked is a leader. It might step down to be a + // follower later. It doesn't matter whether the peer is a leader or not. If + // it's not a leader, the proposing command log entry can't be committed. let mut resp = RaftCmdResponse::default(); let term = self.fsm.peer.term(); @@ -4858,7 +5406,8 @@ where collect_cnt -= 1; // For example, A is split into B, A, and then B is split into C, B. if r.get_region_epoch().version >= max_version { - // It doesn't matter if it's a false positive, as it's limited by MAX_REGIONS_IN_ERROR. + // It doesn't matter if it's a false positive, as it's limited by + // MAX_REGIONS_IN_ERROR. collect_cnt += r.get_region_epoch().version - max_version; max_version = r.get_region_epoch().version; } @@ -4879,20 +5428,22 @@ where #[allow(clippy::if_same_then_else)] fn on_raft_gc_log_tick(&mut self, force_compact: bool) { if !self.fsm.peer.is_leader() { - // `compact_cache_to` is called when apply, there is no need to call `compact_to` here, - // snapshot generating has already been cancelled when the role becomes follower. 
+ // `compact_cache_to` is called when apply, there is no need to call + // `compact_to` here, snapshot generating has already been cancelled + // when the role becomes follower. return; } - if !self.fsm.peer.get_store().is_cache_empty() || !self.ctx.cfg.hibernate_regions { + if !self.fsm.peer.get_store().is_entry_cache_empty() || !self.ctx.cfg.hibernate_regions { self.register_raft_gc_log_tick(); } fail_point!("on_raft_log_gc_tick_1", self.fsm.peer_id() == 1, |_| {}); fail_point!("on_raft_gc_log_tick", |_| {}); debug_assert!(!self.fsm.stopped); - // As leader, we would not keep caches for the peers that didn't response heartbeat in the - // last few seconds. That happens probably because another TiKV is down. In this case if we - // do not clean up the cache, it may keep growing. + // As leader, we would not keep caches for the peers that didn't response + // heartbeat in the last few seconds. That happens probably because + // another TiKV is down. In this case if we do not clean up the cache, + // it may keep growing. let drop_cache_duration = self.ctx.cfg.raft_heartbeat_interval() + self.ctx.cfg.raft_entry_cache_life_time.0; let cache_alive_limit = Instant::now() - drop_cache_duration; @@ -4913,21 +5464,31 @@ where // `alive_cache_idx` is only used to gc cache. 
let applied_idx = self.fsm.peer.get_store().applied_index(); let truncated_idx = self.fsm.peer.get_store().truncated_index(); + let first_idx = self.fsm.peer.get_store().first_index(); let last_idx = self.fsm.peer.get_store().last_index(); + + let mut voter_replicated_idx = last_idx; let (mut replicated_idx, mut alive_cache_idx) = (last_idx, last_idx); for (peer_id, p) in self.fsm.peer.raft_group.raft.prs().iter() { + let peer = find_peer_by_id(self.region(), *peer_id).unwrap(); + if !is_learner(peer) && voter_replicated_idx > p.matched { + voter_replicated_idx = p.matched; + } if replicated_idx > p.matched { replicated_idx = p.matched; } if let Some(last_heartbeat) = self.fsm.peer.peer_heartbeats.get(peer_id) { - if alive_cache_idx > p.matched - && p.matched >= truncated_idx - && *last_heartbeat > cache_alive_limit - { - alive_cache_idx = p.matched; + if *last_heartbeat > cache_alive_limit { + if alive_cache_idx > p.matched && p.matched >= truncated_idx { + alive_cache_idx = p.matched; + } else if p.matched == 0 { + // the new peer is still applying snapshot, do not compact cache now + alive_cache_idx = 0; + } } } } + // When an election happened or a new peer is added, replicated_idx can be 0. if replicated_idx > 0 { assert!( @@ -4938,21 +5499,20 @@ where ); REGION_MAX_LOG_LAG.observe((last_idx - replicated_idx) as f64); } + + // leader may call `get_term()` on the latest replicated index, so compact + // entries before `alive_cache_idx` instead of `alive_cache_idx + 1`. 
self.fsm .peer .mut_store() - .maybe_gc_cache(alive_cache_idx, applied_idx); + .compact_entry_cache(std::cmp::min(alive_cache_idx, applied_idx + 1)); if needs_evict_entry_cache(self.ctx.cfg.evict_cache_on_memory_ratio) { - self.fsm.peer.mut_store().evict_cache(true); - if !self.fsm.peer.get_store().cache_is_empty() { + self.fsm.peer.mut_store().evict_entry_cache(true); + if !self.fsm.peer.get_store().is_entry_cache_empty() { self.register_entry_cache_evict_tick(); } } - let mut total_gc_logs = 0; - - let first_idx = self.fsm.peer.get_store().first_index(); - let mut compact_idx = if force_compact && replicated_idx > first_idx { replicated_idx } else if (applied_idx > first_idx @@ -4961,17 +5521,23 @@ where { std::cmp::max(first_idx + (last_idx - first_idx) / 2, replicated_idx) } else if replicated_idx < first_idx || last_idx - first_idx < 3 { - // In the current implementation one compaction can't delete all stale Raft logs. - // There will be at least 3 entries left after one compaction: + // In the current implementation one compaction can't delete all stale Raft + // logs. There will be at least 3 entries left after one compaction: + // ``` // |------------- entries needs to be compacted ----------| // [entries...][the entry at `compact_idx`][the last entry][new compaction entry] // |-------------------- entries will be left ----------------------| - self.ctx.raft_metrics.raft_log_gc_skipped.reserve_log += 1; + // ``` + self.ctx.raft_metrics.raft_log_gc_skipped.reserve_log.inc(); return; } else if replicated_idx - first_idx < self.ctx.cfg.raft_log_gc_threshold && self.fsm.skip_gc_raft_log_ticks < self.ctx.cfg.raft_log_reserve_max_ticks { - self.ctx.raft_metrics.raft_log_gc_skipped.threshold_limit += 1; + self.ctx + .raft_metrics + .raft_log_gc_skipped + .threshold_limit + .inc(); // Logs will only be kept `max_ticks` * `raft_log_gc_tick_interval`. 
self.fsm.skip_gc_raft_log_ticks += 1; self.register_raft_gc_log_tick(); @@ -4987,16 +5553,17 @@ where self.ctx .raft_metrics .raft_log_gc_skipped - .compact_idx_too_small += 1; + .compact_idx_too_small + .inc(); return; } - total_gc_logs += compact_idx - first_idx; // Create a compact log request and notify directly. let region_id = self.fsm.peer.region().get_id(); let peer = self.fsm.peer.peer.clone(); let term = self.fsm.peer.get_index_term(compact_idx); - let request = new_compact_log_request(region_id, peer, compact_idx, term); + let request = + new_compact_log_request(region_id, peer, compact_idx, term, voter_replicated_idx); self.propose_raft_command_internal( request, Callback::None, @@ -5005,7 +5572,7 @@ where self.fsm.skip_gc_raft_log_ticks = 0; self.register_raft_gc_log_tick(); - PEER_GC_RAFT_LOG_COUNTER.inc_by(total_gc_logs); + PEER_GC_RAFT_LOG_COUNTER.inc_by(compact_idx - first_idx); } fn register_entry_cache_evict_tick(&mut self) { @@ -5015,14 +5582,81 @@ where fn on_entry_cache_evict_tick(&mut self) { fail_point!("on_entry_cache_evict_tick", |_| {}); if needs_evict_entry_cache(self.ctx.cfg.evict_cache_on_memory_ratio) { - self.fsm.peer.mut_store().evict_cache(true); + self.fsm.peer.mut_store().evict_entry_cache(true); + if !self.fsm.peer.get_store().is_entry_cache_empty() { + self.register_entry_cache_evict_tick(); + } } - let mut _usage = 0; - if memory_usage_reaches_high_water(&mut _usage) - && !self.fsm.peer.get_store().cache_is_empty() + } + + fn register_check_long_uncommitted_tick(&mut self) { + self.schedule_tick(PeerTick::CheckLongUncommitted) + } + + fn on_check_long_uncommitted_tick(&mut self) { + if !self.fsm.peer.is_leader() || self.fsm.hibernate_state.group_state() == GroupState::Idle { - self.register_entry_cache_evict_tick(); + return; } + self.fsm.peer.check_long_uncommitted_proposals(self.ctx); + self.register_check_long_uncommitted_tick(); + } + + fn on_request_snapshot_tick(&mut self) { + fail_point!("ignore request snapshot", |_| { 
+ self.schedule_tick(PeerTick::RequestSnapshot); + }); + if !self.fsm.peer.wait_data { + return; + } + if self.fsm.peer.is_leader() + || self.fsm.peer.is_handling_snapshot() + || self.fsm.peer.has_pending_snapshot() + { + self.schedule_tick(PeerTick::RequestSnapshot); + return; + } + self.fsm.peer.request_index = self.fsm.peer.raft_group.raft.raft_log.last_index(); + let last_term = self.fsm.peer.get_index_term(self.fsm.peer.request_index); + if last_term == self.fsm.peer.term() { + self.fsm.peer.should_reject_msgappend = true; + if let Err(e) = self.fsm.peer.raft_group.request_snapshot() { + error!( + "failed to request snapshot"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "err" => %e, + ); + } + } else { + // If a leader change occurs after switch to non-witness, it should be + // continue processing `MsgAppend` until `last_term == term`, then retry + // to request snapshot. + self.fsm.peer.should_reject_msgappend = false; + } + // Requesting a snapshot may fail, so register a periodic event as a defense + // until succeeded. 
+ self.schedule_tick(PeerTick::RequestSnapshot); + } + + fn on_request_voter_replicated_index(&mut self) { + if !self.fsm.peer.is_witness() || !self.fsm.peer.has_pending_compact_cmd { + return; + } + if self.fsm.peer.last_compacted_time.elapsed() + > self.ctx.cfg.request_voter_replicated_index_interval.0 + { + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgVoterReplicatedIndexRequest); + let leader_id = self.fsm.peer.leader_id(); + let leader = self.fsm.peer.get_peer_from_cache(leader_id); + if let Some(leader) = leader { + self.fsm + .peer + .send_extra_message(msg, &mut self.ctx.trans, &leader); + } + } + self.register_pull_voter_replicated_index_tick(); } fn register_check_leader_lease_tick(&mut self) { @@ -5039,7 +5673,7 @@ where } fn register_split_region_check_tick(&mut self) { - self.schedule_tick(PeerTick::SplitRegionCheck) + self.schedule_tick(PeerTick::SplitRegionCheck); } #[inline] @@ -5053,13 +5687,13 @@ where return; } - // When restart, the may_skip_split_check will be false. The split check will first - // check the region size, and then check whether the region should split. This - // should work even if we change the region max size. + // When restart, the may_skip_split_check will be false. The split check will + // first check the region size, and then check whether the region should split. + // This should work even if we change the region max size. // If peer says should update approximate size, update region size and check // whether the region should split. - // We assume that `may_skip_split_check` is only set true after the split check task is - // scheduled. + // We assume that `may_skip_split_check` is only set true after the split check + // task is scheduled. 
if self.fsm.peer.may_skip_split_check && self.fsm.peer.compaction_declined_bytes < self.ctx.cfg.region_split_check_diff().0 && self.fsm.peer.size_diff_hint < self.ctx.cfg.region_split_check_diff().0 @@ -5077,19 +5711,20 @@ where return; } - // When Lightning or BR is importing data to TiKV, their ingest-request may fail because of - // region-epoch not matched. So we hope TiKV do not check region size and split region during - // importing. + // When Lightning or BR is importing data to TiKV, their ingest-request may fail + // because of region-epoch not matched. So we hope TiKV do not check region size + // and split region during importing. if self.ctx.importer.get_mode() == SwitchMode::Import { return; } - // bulk insert too fast may cause snapshot stale very soon, worst case it stale before - // sending. so when snapshot is generating or sending, skip split check at most 3 times. - // There is a trade off between region size and snapshot success rate. Split check is - // triggered every 10 seconds. If a snapshot can't be generated in 30 seconds, it might be - // just too large to be generated. Split it into smaller size can help generation. check - // issue 330 for more info. + // bulk insert too fast may cause snapshot stale very soon, worst case it stale + // before sending. so when snapshot is generating or sending, skip split check + // at most 3 times. There is a trade off between region size and snapshot + // success rate. Split check is triggered every 10 seconds. If a snapshot can't + // be generated in 30 seconds, it might be just too large to be generated. Split + // it into smaller size can help generation. check issue 330 for more + // info. 
if self.fsm.peer.get_store().is_generating_snapshot() && self.fsm.skip_split_count < self.region_split_skip_max_count() { @@ -5132,7 +5767,34 @@ where "split_keys" => %KeysInfoFormatter(split_keys.iter()), "source" => source, ); - if let Err(e) = self.validate_split_region(®ion_epoch, &split_keys) { + + if !self.fsm.peer.is_leader() { + // region on this store is no longer leader, skipped. + info!( + "not leader, skip proposing split"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + cb.invoke_with_response(new_error(Error::NotLeader( + self.region_id(), + self.fsm.peer.get_peer_from_cache(self.fsm.peer.leader_id()), + ))); + return; + } + if let Err(e) = util::validate_split_region( + self.fsm.region_id(), + self.fsm.peer_id(), + self.region(), + ®ion_epoch, + &split_keys, + ) { + info!( + "invalid split request"; + "err" => ?e, + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "source" => %source + ); cb.invoke_with_response(new_error(e)); return; } @@ -5162,70 +5824,6 @@ where } } - fn validate_split_region( - &mut self, - epoch: &metapb::RegionEpoch, - split_keys: &[Vec], - ) -> Result<()> { - if split_keys.is_empty() { - error!( - "no split key is specified."; - "region_id" => self.fsm.region_id(), - "peer_id" => self.fsm.peer_id(), - ); - return Err(box_err!("{} no split key is specified.", self.fsm.peer.tag)); - } - for key in split_keys { - if key.is_empty() { - error!( - "split key should not be empty!!!"; - "region_id" => self.fsm.region_id(), - "peer_id" => self.fsm.peer_id(), - ); - return Err(box_err!( - "{} split key should not be empty", - self.fsm.peer.tag - )); - } - } - if !self.fsm.peer.is_leader() { - // region on this store is no longer leader, skipped. 
- info!( - "not leader, skip."; - "region_id" => self.fsm.region_id(), - "peer_id" => self.fsm.peer_id(), - ); - return Err(Error::NotLeader( - self.region_id(), - self.fsm.peer.get_peer_from_cache(self.fsm.peer.leader_id()), - )); - } - - let region = self.fsm.peer.region(); - let latest_epoch = region.get_region_epoch(); - - // This is a little difference for `check_region_epoch` in region split case. - // Here we just need to check `version` because `conf_ver` will be update - // to the latest value of the peer, and then send to PD. - if latest_epoch.get_version() != epoch.get_version() { - info!( - "epoch changed, retry later"; - "region_id" => self.fsm.region_id(), - "peer_id" => self.fsm.peer_id(), - "prev_epoch" => ?region.get_region_epoch(), - "epoch" => ?epoch, - ); - return Err(Error::EpochNotMatch( - format!( - "{} epoch changed {:?} != {:?}, retry later", - self.fsm.peer.tag, latest_epoch, epoch - ), - vec![region.to_owned()], - )); - } - Ok(()) - } - fn on_approximate_region_size(&mut self, size: u64) { self.fsm.peer.approximate_size = Some(size); self.register_split_region_check_tick(); @@ -5389,9 +5987,7 @@ where }; meta.keys.insert(0, region.get_start_key().to_vec()); meta.keys.push(region.get_end_key().to_vec()); - - let stats = new_bucket_stats(&meta); - region_buckets = BucketStat::new(Arc::new(meta), stats); + region_buckets = BucketStat::from_meta(Arc::new(meta)); } let buckets_count = region_buckets.meta.keys.len() - 1; @@ -5432,7 +6028,7 @@ where // generate bucket range list to run split-check (to further split buckets) fn gen_bucket_range_for_update(&self) -> Option> { - if !self.ctx.coprocessor_host.cfg.enable_region_bucket { + if !self.ctx.coprocessor_host.cfg.enable_region_bucket() { return None; } let region_buckets = self.fsm.peer.region_buckets.as_ref()?; @@ -5484,14 +6080,18 @@ where fn on_schedule_half_split_region( &mut self, region_epoch: &metapb::RegionEpoch, + start_key: Option>, + end_key: Option>, policy: CheckPolicy, source: 
&str, _cb: Callback, ) { + let is_key_range = start_key.is_some() && end_key.is_some(); info!( "on half split"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), + "is_key_range" => is_key_range, "policy" => ?policy, "source" => source, ); @@ -5501,6 +6101,7 @@ where "not leader, skip"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), + "is_key_range" => is_key_range, ); return; } @@ -5511,11 +6112,18 @@ where "receive a stale halfsplit message"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), + "is_key_range" => is_key_range, ); return; } - let split_check_bucket_ranges = self.gen_bucket_range_for_update(); + // Do not check the bucket ranges if we want to split the region with a given + // key range, this is to avoid compatibility issues. + let split_check_bucket_ranges = if !is_key_range { + self.gen_bucket_range_for_update() + } else { + None + }; #[cfg(any(test, feature = "testexport"))] { if let Callback::Test { cb } = _cb { @@ -5526,13 +6134,20 @@ where cb(peer_stat); } } - let task = - SplitCheckTask::split_check(region.clone(), false, policy, split_check_bucket_ranges); + let task = SplitCheckTask::split_check_key_range( + region.clone(), + start_key, + end_key, + false, + policy, + split_check_bucket_ranges, + ); if let Err(e) = self.ctx.split_check_scheduler.schedule(task) { error!( "failed to schedule split check"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), + "is_key_range" => is_key_range, "err" => %e, ); } @@ -5557,6 +6172,43 @@ where self.schedule_tick(PeerTick::PdHeartbeat) } + fn register_check_peers_availability_tick(&mut self) { + fail_point!("ignore schedule check peers availability tick", |_| {}); + self.schedule_tick(PeerTick::CheckPeersAvailability) + } + + fn on_check_peers_availability(&mut self) { + let mut invalid_peers: Vec = Vec::new(); + for peer_id in self.fsm.peer.wait_data_peers.iter() { + match self.fsm.peer.get_peer_from_cache(*peer_id) { + 
Some(peer) => { + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgAvailabilityRequest); + self.fsm + .peer + .send_extra_message(msg, &mut self.ctx.trans, &peer); + debug!( + "check peer availability"; + "target peer id" => *peer_id, + ); + } + None => invalid_peers.push(*peer_id), + } + } + // For some reasons, the peer corresponding to the previously saved peer_id + // no longer exists. In order to avoid passing invalid information to pd when + // reporting pending peers and affecting pd scheduling, remove it from the + // `wait_data_peers`. + self.fsm + .peer + .wait_data_peers + .retain(|peer_id| !invalid_peers.contains(peer_id)); + } + + fn register_pull_voter_replicated_index_tick(&mut self) { + self.schedule_tick(PeerTick::RequestVoterReplicatedIndex); + } + fn on_check_peer_stale_state_tick(&mut self) { if self.fsm.peer.pending_remove { return; @@ -5568,6 +6220,19 @@ where return; } + if let Some(ForceLeaderState::ForceLeader { time, .. }) = self.fsm.peer.force_leader { + // Clean up the force leader state after a timeout, since the PD recovery + // process may have been aborted for some reasons. + if time.saturating_elapsed() + > cmp::max( + self.ctx.cfg.peer_stale_state_check_interval.0, + Duration::from_secs(60), + ) + { + self.on_exit_force_leader(); + } + } + if self.ctx.cfg.hibernate_regions { let group_state = self.fsm.hibernate_state.group_state(); if group_state == GroupState::Idle { @@ -5608,8 +6273,9 @@ where // from the cluster or probably destroyed. // Meantime, D, E, F would not reach B, since it's not in the cluster anymore. // In this case, peer B would notice that the leader is missing for a long time, - // and it would check with pd to confirm whether it's still a member of the cluster. - // If not, it destroys itself as a stale peer which is removed out already. + // and it would check with pd to confirm whether it's still a member of the + // cluster. 
If not, it destroys itself as a stale peer which is removed out + // already. let state = self.fsm.peer.check_stale_state(self.ctx); fail_point!("peer_check_stale_state", state != StaleState::Valid, |_| {}); match state { @@ -5667,8 +6333,8 @@ where fn on_reactivate_memory_lock_tick(&mut self) { let mut pessimistic_locks = self.fsm.peer.txn_ext.pessimistic_locks.write(); - // If it is not leader, we needn't reactivate by tick. In-memory pessimistic lock will - // be enabled when this region becomes leader again. + // If it is not leader, we needn't reactivate by tick. In-memory pessimistic + // lock will be enabled when this region becomes leader again. // And this tick is currently only used for the leader transfer failure case. if !self.fsm.peer.is_leader() || pessimistic_locks.status != LocksStatus::TransferringLeader { @@ -5677,8 +6343,8 @@ where self.fsm.reactivate_memory_lock_ticks += 1; let transferring_leader = self.fsm.peer.raft_group.raft.lead_transferee.is_some(); - // `lead_transferee` is not set immediately after the lock status changes. So, we need - // the tick count condition to avoid reactivating too early. + // `lead_transferee` is not set immediately after the lock status changes. So, + // we need the tick count condition to avoid reactivating too early. if !transferring_leader && self.fsm.reactivate_memory_lock_ticks >= self.ctx.cfg.reactive_memory_lock_timeout_tick @@ -5714,6 +6380,7 @@ where "err" => ?e, ); } + // todo: it will delete in next pr. region_buckets.stats = new_bucket_stats(®ion_buckets.meta); self.register_report_region_buckets_tick(); @@ -5787,8 +6454,8 @@ where Some(self.fsm.peer.approximate_size.unwrap_or_default() + size); self.fsm.peer.approximate_keys = Some(self.fsm.peer.approximate_keys.unwrap_or_default() + keys); - // The ingested file may be overlapped with the data in engine, so we need to check it - // again to get the accurate value. 
+ // The ingested file may be overlapped with the data in engine, so we need to + // check it again to get the accurate value. self.fsm.peer.may_skip_split_check = false; if self.fsm.peer.is_leader() { self.on_pd_heartbeat_tick(); @@ -5797,23 +6464,82 @@ where } fn on_transfer_leader(&mut self, term: u64) { - // If the term has changed between proposing and executing the TransferLeader request, - // ignore it because this request may be stale. + // If the term has changed between proposing and executing the TransferLeader + // request, ignore it because this request may be stale. if term != self.fsm.peer.term() { return; } - // As the leader can propose the TransferLeader request successfully, the disk of - // the leader is probably not full. - self.fsm.peer.execute_transfer_leader( - self.ctx, - self.fsm.peer.leader_id(), - DiskUsage::Normal, - true, - ); + self.fsm.peer.ack_transfer_leader_msg(true); self.fsm.has_ready = true; } - /// Verify and store the hash to state. return true means the hash has been stored successfully. + fn on_set_flashback_state(&mut self, region: metapb::Region) { + // Update the region meta. + self.update_region((|| { + #[cfg(feature = "failpoints")] + fail_point!("keep_peer_fsm_flashback_state_false", |_| { + let mut region = region.clone(); + region.is_in_flashback = false; + region + }); + region + })()); + // Let the leader lease to None to ensure that local reads are not executed. 
+ self.fsm.peer.leader_lease_mut().expire_remote_lease(); + } + + fn on_ready_batch_switch_witness(&mut self, sw: SwitchWitness) { + { + let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.set_region( + &self.ctx.coprocessor_host, + sw.region, + &mut self.fsm.peer, + RegionChangeReason::SwitchWitness, + ); + } + for s in sw.switches { + let (peer_id, is_witness) = (s.get_peer_id(), s.get_is_witness()); + if self.fsm.peer_id() == peer_id { + if is_witness { + self.fsm.peer.raft_group.set_priority(-1); + if !self.fsm.peer.is_leader() { + let _ = self.fsm.peer.get_store().clear_data(); + } else { + // Avoid calling `clear_data` as the region worker may be scanning snapshot, + // to avoid problems (although no problems were found by testing). + self.fsm.peer.delay_clean_data = true; + } + } else { + self.fsm + .peer + .update_read_progress(self.ctx, ReadProgress::WaitData(true)); + self.fsm.peer.wait_data = true; + self.on_request_snapshot_tick(); + } + self.fsm.peer.peer.is_witness = is_witness; + continue; + } + if !is_witness && !self.fsm.peer.wait_data_peers.contains(&peer_id) { + self.fsm.peer.wait_data_peers.push(peer_id); + } + } + if self.fsm.peer.is_leader() { + info!( + "notify pd with change peer region"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "region" => ?self.fsm.peer.region(), + ); + self.fsm.peer.heartbeat_pd(self.ctx); + if !self.fsm.peer.wait_data_peers.is_empty() { + self.register_check_peers_availability_tick(); + } + } + } + + /// Verify and store the hash to state. return true means the hash has been + /// stored successfully. // TODO: Consider context in the function. fn verify_and_store_hash( &mut self, @@ -5863,8 +6589,9 @@ where if self.fsm.peer.consistency_state.index != INVALID_INDEX && !self.fsm.peer.consistency_state.hash.is_empty() { - // Maybe computing is too slow or computed result is dropped due to channel full. - // If computing is too slow, miss count will be increased twice. 
+ // Maybe computing is too slow or computed result is dropped due to channel + // full. If computing is too slow, miss count will be increased + // twice. REGION_HASH_COUNTER.verify.miss.inc(); warn!( "hash belongs to wrong index, skip."; @@ -5887,15 +6614,17 @@ where } } -/// Checks merge target, returns whether the source peer should be destroyed and whether the source peer is -/// merged to this target peer. +/// Checks merge target, returns whether the source peer should be destroyed and +/// whether the source peer is merged to this target peer. /// /// It returns (`can_destroy`, `merge_to_this_peer`). /// -/// `can_destroy` is true when there is a network isolation which leads to a follower of a merge target -/// Region's log falls behind and then receive a snapshot with epoch version after merge. +/// `can_destroy` is true when there is a network isolation which leads to a +/// follower of a merge target Region's log falls behind and then receive a +/// snapshot with epoch version after merge. /// -/// `merge_to_this_peer` is true when `can_destroy` is true and the source peer is merged to this target peer. +/// `merge_to_this_peer` is true when `can_destroy` is true and the source peer +/// is merged to this target peer. pub fn maybe_destroy_source( meta: &StoreMeta, target_region_id: u64, @@ -5912,18 +6641,19 @@ pub fn maybe_destroy_source( region_epoch, target_region.get_region_epoch(), ); - // The target peer will move on, namely, it will apply a snapshot generated after merge, - // so destroy source peer. + // The target peer will move on, namely, it will apply a snapshot generated + // after merge, so destroy source peer. 
if region_epoch.get_version() > target_region.get_region_epoch().get_version() { return ( true, target_peer_id - == util::find_peer(target_region, meta.store_id.unwrap()) + == find_peer(target_region, meta.store_id.unwrap()) .unwrap() .get_id(), ); } - // Wait till the target peer has caught up logs and source peer will be destroyed at that time. + // Wait till the target peer has caught up logs and source peer will be + // destroyed at that time. return (false, false); } } @@ -5972,6 +6702,7 @@ fn new_compact_log_request( peer: metapb::Peer, compact_index: u64, compact_term: u64, + voter_replicated_index: u64, ) -> RaftCmdRequest { let mut request = new_admin_request(region_id, peer); @@ -5979,6 +6710,9 @@ fn new_compact_log_request( admin.set_cmd_type(AdminCmdType::CompactLog); admin.mut_compact_log().set_compact_index(compact_index); admin.mut_compact_log().set_compact_term(compact_term); + admin + .mut_compact_log() + .set_voter_replicated_index(voter_replicated_index); request.set_admin_request(admin); request } @@ -6087,30 +6821,6 @@ where } } -impl AbstractPeer for PeerFsm { - fn meta_peer(&self) -> &metapb::Peer { - &self.peer.peer - } - fn group_state(&self) -> GroupState { - self.hibernate_state.group_state() - } - fn region(&self) -> &metapb::Region { - self.peer.raft_group.store().region() - } - fn apply_state(&self) -> &RaftApplyState { - self.peer.raft_group.store().apply_state() - } - fn raft_status(&self) -> raft::Status<'_> { - self.peer.raft_group.status() - } - fn raft_commit_index(&self) -> u64 { - self.peer.raft_group.store().commit_index() - } - fn pending_merge_state(&self) -> Option<&MergeState> { - self.peer.pending_merge_state.as_ref() - } -} - mod memtrace { use memory_trace_macros::MemoryTraceHelper; diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 54f4f45f9ab..03c0688e8f2 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs 
@@ -10,10 +10,7 @@ use std::{ }, mem, ops::{Deref, DerefMut}, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, Mutex, - }, + sync::{atomic::Ordering, Arc, Mutex}, time::{Duration, Instant}, u64, }; @@ -22,14 +19,16 @@ use batch_system::{ BasicMailbox, BatchRouter, BatchSystem, Config as BatchSystemConfig, Fsm, HandleResult, HandlerBuilder, PollHandler, Priority, }; +use causal_ts::CausalTsProviderImpl; use collections::{HashMap, HashMapEntry, HashSet}; use concurrency_manager::ConcurrencyManager; -use crossbeam::channel::{unbounded, Sender, TryRecvError, TrySendError}; +use crossbeam::channel::{TryRecvError, TrySendError}; use engine_traits::{ CompactedEvent, DeleteStrategy, Engines, KvEngine, Mutable, PerfContextKind, RaftEngine, RaftLogBatch, Range, WriteBatch, WriteOptions, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use fail::fail_point; +use file_system::{IoType, WithIoType}; use futures::{compat::Future01CompatExt, FutureExt}; use grpcio_health::HealthService; use keys::{self, data_end_key, data_key, enc_end_key, enc_start_key}; @@ -38,23 +37,26 @@ use kvproto::{ metapb::{self, Region, RegionEpoch}, pdpb::{self, QueryStats, StoreStats}, raft_cmdpb::{AdminCmdType, AdminRequest}, - raft_serverpb::{ExtraMessageType, PeerState, RaftMessage, RegionLocalState}, + raft_serverpb::{ExtraMessage, ExtraMessageType, PeerState, RaftMessage, RegionLocalState}, replication_modepb::{ReplicationMode, ReplicationStatus}, }; use pd_client::{Feature, FeatureGate, PdClient}; use protobuf::Message; use raft::StateRole; +use resource_control::{channel::unbounded, ResourceGroupManager}; use resource_metering::CollectorRegHandle; use sst_importer::SstImporter; use tikv_alloc::trace::TraceEvent; use tikv_util::{ - box_err, box_try, + box_try, config::{Tracker, VersionTrack}, debug, defer, error, future::poll_future_notify, info, is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, - slow_log, sys as sys_util, + slow_log, + store::{find_peer, region_on_stores}, + sys as 
sys_util, sys::disk::{get_disk_status, DiskUsage}, time::{duration_to_sec, Instant as TiInstant}, timer::SteadyTimer, @@ -66,15 +68,17 @@ use time::{self, Timespec}; use crate::{ bytes_capacity, - coprocessor::{ - split_observer::SplitObserver, BoxAdminObserver, CoprocessorHost, RegionChangeEvent, - RegionChangeReason, - }, + coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason}, store::{ - async_io::write::{StoreWriters, Worker as WriteWorker, WriteMsg}, + async_io::{ + read::{ReadRunner, ReadTask}, + write::{StoreWriters, StoreWritersContext, Worker as WriteWorker, WriteMsg}, + write_router::WriteSenders, + }, config::Config, fsm::{ create_apply_batch_system, + life::handle_tombstone_message_on_learner, metrics::*, peer::{ maybe_destroy_source, new_admin_request, PeerFsm, PeerFsmDelegate, SenderFsmPair, @@ -82,7 +86,7 @@ use crate::{ ApplyBatchSystem, ApplyNotifier, ApplyPollerBuilder, ApplyRes, ApplyRouter, ApplyTaskRes, }, - local_metrics::{RaftMetrics, RaftReadyMetrics}, + local_metrics::RaftMetrics, memory::*, metrics::*, peer_storage, @@ -92,21 +96,20 @@ use crate::{ worker::{ AutoSplitController, CleanupRunner, CleanupSstRunner, CleanupSstTask, CleanupTask, CompactRunner, CompactTask, ConsistencyCheckRunner, ConsistencyCheckTask, - GcSnapshotRunner, GcSnapshotTask, PdRunner, RaftlogFetchRunner, RaftlogFetchTask, - RaftlogGcRunner, RaftlogGcTask, ReadDelegate, RefreshConfigRunner, RefreshConfigTask, - RegionRunner, RegionTask, SplitCheckTask, + GcSnapshotRunner, GcSnapshotTask, PdRunner, RaftlogGcRunner, RaftlogGcTask, + ReadDelegate, RefreshConfigRunner, RefreshConfigTask, RegionRunner, RegionTask, + SplitCheckTask, }, Callback, CasualMessage, GlobalReplicationState, InspectedRaftMessage, MergeResultKind, PdTask, PeerMsg, PeerTick, RaftCommand, SignificantMsg, SnapManager, StoreMsg, StoreTick, }, - Result, + Error, Result, }; type Key = Vec; pub const PENDING_MSG_CAP: usize = 100; -const UNREACHABLE_BACKOFF: Duration = 
Duration::from_secs(10); -const ENTRY_CACHE_EVICT_TICK_DURATION: Duration = Duration::from_secs(1); +pub const ENTRY_CACHE_EVICT_TICK_DURATION: Duration = Duration::from_secs(1); pub const MULTI_FILES_SNAPSHOT_FEATURE: Feature = Feature::require(6, 1, 0); // it only makes sense for large region pub struct StoreInfo { @@ -115,8 +118,16 @@ pub struct StoreInfo { pub capacity: u64, } +/// A trait that provide the meta information that can be accessed outside +/// of raftstore. +pub trait StoreRegionMeta: Send { + fn store_id(&self) -> u64; + fn reader(&self, region_id: u64) -> Option<&ReadDelegate>; + fn region_read_progress(&self) -> &RegionReadProgressRegistry; + fn search_region(&self, start_key: &[u8], end_key: &[u8], visitor: impl FnMut(&Region)); +} + pub struct StoreMeta { - /// store id pub store_id: Option, /// region_end_key -> region_id pub region_ranges: BTreeMap, u64>, @@ -124,25 +135,30 @@ pub struct StoreMeta { pub regions: HashMap, /// region_id -> reader pub readers: HashMap, - /// `MsgRequestPreVote`, `MsgRequestVote` or `MsgAppend` messages from newly split Regions shouldn't be - /// dropped if there is no such Region in this store now. So the messages are recorded temporarily and - /// will be handled later. + /// `MsgRequestPreVote`, `MsgRequestVote` or `MsgAppend` messages from newly + /// split Regions shouldn't be dropped if there is no such Region in this + /// store now. So the messages are recorded temporarily and will be handled + /// later. pub pending_msgs: RingQueue, /// The regions with pending snapshots. pub pending_snapshot_regions: Vec, - /// A marker used to indicate the peer of a Region has received a merge target message and waits to be destroyed. - /// target_region_id -> (source_region_id -> merge_target_region) + /// A marker used to indicate the peer of a Region has received a merge + /// target message and waits to be destroyed. 
target_region_id -> + /// (source_region_id -> merge_target_region) pub pending_merge_targets: HashMap>, - /// An inverse mapping of `pending_merge_targets` used to let source peer help target peer to clean up related entry. - /// source_region_id -> target_region_id + /// An inverse mapping of `pending_merge_targets` used to let source peer + /// help target peer to clean up related entry. source_region_id -> + /// target_region_id pub targets_map: HashMap, - /// `atomic_snap_regions` and `destroyed_region_for_snap` are used for making destroy overlapped regions - /// and apply snapshot atomically. + /// `atomic_snap_regions` and `destroyed_region_for_snap` are used for + /// making destroy overlapped regions and apply snapshot atomically. /// region_id -> wait_destroy_regions_map(source_region_id -> is_ready) - /// A target peer must wait for all source peer to ready before applying snapshot. + /// A target peer must wait for all source peer to ready before applying + /// snapshot. pub atomic_snap_regions: HashMap>, /// source_region_id -> need_atomic - /// Used for reminding the source peer to switch to ready in `atomic_snap_regions`. + /// Used for reminding the source peer to switch to ready in + /// `atomic_snap_regions`. 
pub destroyed_region_for_snap: HashMap, /// region_id -> `RegionReadProgress` pub region_read_progress: RegionReadProgressRegistry, @@ -150,6 +166,39 @@ pub struct StoreMeta { pub damaged_ranges: HashMap, Vec)>, } +impl StoreRegionMeta for StoreMeta { + #[inline] + fn store_id(&self) -> u64 { + self.store_id.unwrap() + } + + #[inline] + fn search_region(&self, start_key: &[u8], end_key: &[u8], mut visitor: impl FnMut(&Region)) { + let start_key = data_key(start_key); + for (_, id) in self + .region_ranges + .range((Excluded(start_key), Unbounded::>)) + { + let region = &self.regions[id]; + if end_key.is_empty() || end_key > region.get_start_key() { + visitor(region); + } else { + break; + } + } + } + + #[inline] + fn region_read_progress(&self) -> &RegionReadProgressRegistry { + &self.region_read_progress + } + + #[inline] + fn reader(&self, region_id: u64) -> Option<&ReadDelegate> { + self.readers.get(®ion_id) + } +} + impl StoreMeta { pub fn new(vote_capacity: usize) -> StoreMeta { StoreMeta { @@ -191,7 +240,8 @@ impl StoreMeta { /// end_key > file.smallestkey /// start_key <= file.largestkey pub fn update_overlap_damaged_ranges(&mut self, fname: &str, start: &[u8], end: &[u8]) -> bool { - // `region_ranges` is promised to have no overlap so just check the first region. + // `region_ranges` is promised to have no overlap so just check the first + // region. 
if let Some((_, id)) = self .region_ranges .range((Excluded(start.to_owned()), Unbounded::>)) @@ -286,16 +336,21 @@ where { fn notify(&self, apply_res: Vec>) { for r in apply_res { - self.router.try_send( - r.region_id, + let region_id = r.region_id; + if let Err(e) = self.router.force_send( + region_id, PeerMsg::ApplyRes { res: ApplyTaskRes::Apply(r), }, - ); + ) { + error!("failed to send apply result"; "region_id" => region_id, "err" => ?e); + } } } fn notify_one(&self, region_id: u64, msg: PeerMsg) { - self.router.try_send(region_id, msg); + if let Err(e) = self.router.force_send(region_id, msg) { + error!("failed to notify apply msg"; "region_id" => region_id, "err" => ?e); + } } fn clone_box(&self) -> Box> { @@ -424,6 +479,22 @@ pub struct PeerTickBatch { pub wait_duration: Duration, } +impl PeerTickBatch { + #[inline] + pub fn schedule(&mut self, timer: &SteadyTimer) { + if self.ticks.is_empty() { + return; + } + let peer_ticks = mem::take(&mut self.ticks); + let f = timer.delay(self.wait_duration).compat().map(move |_| { + for tick in peer_ticks { + tick(); + } + }); + poll_future_notify(f); + } +} + impl Clone for PeerTickBatch { fn clone(&self) -> PeerTickBatch { PeerTickBatch { @@ -446,7 +517,7 @@ where // handle Compact, CleanupSst task pub cleanup_scheduler: Scheduler, pub raftlog_gc_scheduler: Scheduler, - pub raftlog_fetch_scheduler: Scheduler, + pub raftlog_fetch_scheduler: Scheduler>, pub region_scheduler: Scheduler>, pub apply_router: ApplyRouter, pub router: RaftRouter, @@ -455,11 +526,12 @@ where pub feature_gate: FeatureGate, /// region_id -> (peer_id, is_splitting) /// Used for handling race between splitting and creating new peer. - /// An uninitialized peer can be replaced to the one from splitting iff they are exactly the same peer. + /// An uninitialized peer can be replaced to the one from splitting iff they + /// are exactly the same peer. 
/// /// WARNING: - /// To avoid deadlock, if you want to use `store_meta` and `pending_create_peers` together, - /// the lock sequence MUST BE: + /// To avoid deadlock, if you want to use `store_meta` and + /// `pending_create_peers` together, the lock sequence MUST BE: /// 1. lock the store_meta. /// 2. lock the pending_create_peers. pub pending_create_peers: Arc>>, @@ -469,8 +541,8 @@ where pub timer: SteadyTimer, pub trans: T, /// WARNING: - /// To avoid deadlock, if you want to use `store_meta` and `global_replication_state` together, - /// the lock sequence MUST BE: + /// To avoid deadlock, if you want to use `store_meta` and + /// `global_replication_state` together, the lock sequence MUST BE: /// 1. lock the store_meta. /// 2. lock the global_replication_state. pub global_replication_state: Arc>, @@ -481,7 +553,8 @@ where pub ready_count: usize, pub has_ready: bool, pub current_time: Option, - pub perf_context: EK::PerfContext, + pub raft_perf_context: ER::PerfContext, + pub kv_perf_context: EK::PerfContext, pub tick_batch: Vec, pub node_start_time: Option, /// Disk usage for the store itself. @@ -491,9 +564,8 @@ where /// Disk usage for other stores. The store itself is not included. /// Only contains items which is not `DiskUsage::Normal`. 
pub store_disk_usages: HashMap, - pub write_senders: Vec>>, + pub write_senders: WriteSenders, pub sync_write_worker: Option, T>>, - pub io_reschedule_concurrent_count: Arc, pub pending_latency_inspect: Vec, } @@ -527,6 +599,15 @@ where self.cfg.reactive_memory_lock_tick_interval.0; self.tick_batch[PeerTick::ReportBuckets as usize].wait_duration = self.cfg.report_region_buckets_tick_interval.0; + self.tick_batch[PeerTick::CheckLongUncommitted as usize].wait_duration = + self.cfg.check_long_uncommitted_interval.0; + self.tick_batch[PeerTick::CheckPeersAvailability as usize].wait_duration = + self.cfg.check_peers_availability_interval.0; + self.tick_batch[PeerTick::RequestSnapshot as usize].wait_duration = + self.cfg.check_request_snapshot_interval.0; + // TODO: make it reasonable + self.tick_batch[PeerTick::RequestVoterReplicatedIndex as usize].wait_duration = + self.cfg.raft_log_gc_tick_interval.0 * 2; } } @@ -570,7 +651,7 @@ where "msg_type" => ?msg_type, ); - self.raft_metrics.message_dropped.stale_msg += 1; + self.raft_metrics.message_dropped.stale_msg.inc(); let mut gc_msg = RaftMessage::default(); gc_msg.set_region_id(region_id); @@ -598,7 +679,12 @@ struct Store { stopped: bool, start_time: Option, consistency_check_time: HashMap, - last_unreachable_report: HashMap, + store_reachability: HashMap, +} + +struct StoreReachability { + last_broadcast: Instant, + received_message_count: u64, } pub struct StoreFsm @@ -622,7 +708,7 @@ where stopped: false, start_time: None, consistency_check_time: HashMap::default(), - last_unreachable_report: HashMap::default(), + store_reachability: HashMap::default(), }, receiver: rx, }); @@ -651,7 +737,7 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> StoreFsmDelegate<'a, EK, ER, T> { fn on_tick(&mut self, tick: StoreTick) { - let t = TiInstant::now_coarse(); + let timer = TiInstant::now_coarse(); match tick { StoreTick::PdStoreHeartbeat => self.on_pd_store_heartbeat_tick(), StoreTick::SnapGc => 
self.on_snap_mgr_gc(), @@ -660,10 +746,12 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> StoreTick::ConsistencyCheck => self.on_consistency_check_tick(), StoreTick::CleanupImportSst => self.on_cleanup_import_sst_tick(), } - let elapsed = t.saturating_elapsed(); - RAFT_EVENT_DURATION + let elapsed = timer.saturating_elapsed(); + self.ctx + .raft_metrics + .event_time .get(tick.tag()) - .observe(duration_to_sec(elapsed) as f64); + .observe(duration_to_sec(elapsed)); slow_log!( elapsed, "[store {}] handle timeout {:?}", @@ -673,15 +761,29 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> } fn handle_msgs(&mut self, msgs: &mut Vec>) { + let timer = TiInstant::now_coarse(); for m in msgs.drain(..) { match m { StoreMsg::Tick(tick) => self.on_tick(tick), StoreMsg::RaftMessage(msg) => { + if !self.ctx.coprocessor_host.on_raft_message(&msg.msg) { + continue; + } if let Err(e) = self.on_raft_message(msg) { - error!(?e; - "handle raft message failed"; - "store_id" => self.fsm.store.id, - ); + if matches!(&e, Error::RegionNotRegistered { .. }) { + // This may happen in normal cases when add-peer runs slowly + // occasionally after a region split. Avoid printing error + // log here, which may confuse users. 
+ info!("handle raft message failed"; + "err" => ?e, + "store_id" => self.fsm.store.id, + ); + } else { + error!(?e; + "handle raft message failed"; + "store_id" => self.fsm.store.id, + ); + } } } StoreMsg::CompactedEvent(event) => self.on_compaction_finished(event), @@ -711,8 +813,16 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> drop(syncer); } StoreMsg::GcSnapshotFinish => self.register_snap_mgr_gc_tick(), + StoreMsg::AwakenRegions { abnormal_stores } => { + self.on_wake_up_regions(abnormal_stores); + } } } + self.ctx + .raft_metrics + .event_time + .store_msg + .observe(timer.saturating_elapsed_secs()); } fn start(&mut self, store: metapb::Store) { @@ -737,7 +847,6 @@ pub struct RaftPoller>, peer_msg_buf: Vec>, - previous_metrics: RaftReadyMetrics, timer: TiInstant, poll_ctx: PollContext, messages_per_tick: usize, @@ -745,12 +854,17 @@ pub struct RaftPoller RaftPoller { fn flush_events(&mut self) { self.flush_ticks(); - self.poll_ctx.raft_metrics.flush(); + self.poll_ctx.raft_metrics.maybe_flush(); self.poll_ctx.store_stat.flush(); MEMTRACE_PEERS.trace(mem::take(&mut self.trace_event)); @@ -759,21 +873,7 @@ impl RaftPoller { fn flush_ticks(&mut self) { for t in PeerTick::get_all_ticks() { let idx = *t as usize; - if self.poll_ctx.tick_batch[idx].ticks.is_empty() { - continue; - } - let peer_ticks = mem::take(&mut self.poll_ctx.tick_batch[idx].ticks); - let f = self - .poll_ctx - .timer - .delay(self.poll_ctx.tick_batch[idx].wait_duration) - .compat() - .map(move |_| { - for tick in peer_ticks { - tick(); - } - }); - poll_future_notify(f); + self.poll_ctx.tick_batch[idx].schedule(&self.poll_ctx.timer); } } } @@ -785,7 +885,10 @@ impl PollHandler, St where for<'a> F: FnOnce(&'a BatchSystemConfig), { - self.previous_metrics = self.poll_ctx.raft_metrics.ready.clone(); + fail_point!("begin_raft_poller"); + self.previous_append = self.poll_ctx.raft_metrics.ready.append.get(); + self.previous_message = 
self.poll_ctx.raft_metrics.ready.message.get(); + self.previous_snapshot = self.poll_ctx.raft_metrics.ready.snapshot.get(); self.poll_ctx.pending_count = 0; self.poll_ctx.ready_count = 0; self.poll_ctx.has_ready = false; @@ -817,6 +920,8 @@ impl PollHandler, St self.poll_ctx.update_ticks_timeout(); update_cfg(&incoming.store_batch_system); } + // update store writers if necessary + self.poll_ctx.write_senders.refresh(); } fn handle_control(&mut self, store: &mut StoreFsm) -> Option { @@ -892,7 +997,8 @@ impl PollHandler, St let mut delegate = PeerFsmDelegate::new(peer, &mut self.poll_ctx); delegate.handle_msgs(&mut self.peer_msg_buf); - // No readiness is generated and using sync write, skipping calling ready and release early. + // No readiness is generated and using sync write, skipping calling ready and + // release early. if !delegate.collect_ready() && self.poll_ctx.sync_write_worker.is_some() { if let HandleResult::StopAt { skip_end, .. } = &mut handle_result { *skip_end = true; @@ -961,13 +1067,20 @@ impl PollHandler, St } } } else { - let writer_id = rand::random::() % self.poll_ctx.cfg.store_io_pool_size; - if let Err(err) = - self.poll_ctx.write_senders[writer_id].try_send(WriteMsg::LatencyInspect { + // Use the valid size of async-ios for generating `writer_id` when the local + // senders haven't been updated by `poller.begin(). 
+ let writer_id = rand::random::() + % std::cmp::min( + self.poll_ctx.cfg.store_io_pool_size, + self.poll_ctx.write_senders.size(), + ); + if let Err(err) = self.poll_ctx.write_senders[writer_id].try_send( + WriteMsg::LatencyInspect { send_time: write_begin, inspector: latency_inspect, - }) - { + }, + None, + ) { warn!("send latency inspecting to write workers failed"; "err" => ?err); } } @@ -994,17 +1107,20 @@ impl PollHandler, St .raft_metrics .ready .append - .saturating_sub(self.previous_metrics.append), + .get() + .saturating_sub(self.previous_append), self.poll_ctx .raft_metrics .ready .message - .saturating_sub(self.previous_metrics.message), + .get() + .saturating_sub(self.previous_message), self.poll_ctx .raft_metrics .ready .snapshot - .saturating_sub(self.previous_metrics.snapshot), + .get() + .saturating_sub(self.previous_snapshot), ); } @@ -1041,7 +1157,7 @@ pub struct RaftPollerBuilder { split_check_scheduler: Scheduler, cleanup_scheduler: Scheduler, raftlog_gc_scheduler: Scheduler, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, pub region_scheduler: Scheduler>, apply_router: ApplyRouter, pub router: RaftRouter, @@ -1055,8 +1171,7 @@ pub struct RaftPollerBuilder { pub engines: Engines, global_replication_state: Arc>, feature_gate: FeatureGate, - write_senders: Vec>>, - io_reschedule_concurrent_count: Arc, + write_senders: WriteSenders, } impl RaftPollerBuilder { @@ -1081,7 +1196,7 @@ impl RaftPollerBuilder { let mut merging_count = 0; let mut meta = self.store_meta.lock().unwrap(); let mut replication_state = self.global_replication_state.lock().unwrap(); - kv_engine.scan_cf(CF_RAFT, start_key, end_key, false, |key, value| { + kv_engine.scan(CF_RAFT, start_key, end_key, false, |key, value| { let (region_id, suffix) = box_try!(keys::decode_region_meta_key(key)); if suffix != keys::REGION_STATE_SUFFIX { return Ok(true); @@ -1119,8 +1234,9 @@ impl RaftPollerBuilder { self.raftlog_fetch_scheduler.clone(), 
self.engines.clone(), region, + local_state.get_state() == PeerState::Unavailable, )); - peer.peer.init_replication_mode(&mut *replication_state); + peer.peer.init_replication_mode(&mut replication_state); if local_state.get_state() == PeerState::Merging { info!("region is merging"; "region" => ?region, "store_id" => store_id); merging_count += 1; @@ -1159,8 +1275,9 @@ impl RaftPollerBuilder { self.raftlog_fetch_scheduler.clone(), self.engines.clone(), ®ion, + false, )?; - peer.peer.init_replication_mode(&mut *replication_state); + peer.peer.init_replication_mode(&mut replication_state); peer.schedule_applying_snapshot(); meta.region_ranges .insert(enc_end_key(®ion), region.get_id()); @@ -1215,8 +1332,14 @@ impl RaftPollerBuilder { last_start_key = keys::enc_end_key(region); } ranges.push((last_start_key, keys::DATA_MAX_KEY.to_vec())); + let ranges: Vec<_> = ranges + .iter() + .map(|(start, end)| Range::new(start, end)) + .collect(); - self.engines.kv.roughly_cleanup_ranges(&ranges)?; + self.engines + .kv + .delete_ranges_cfs(DeleteStrategy::DeleteFiles, &ranges)?; info!( "cleans up garbage data"; @@ -1239,11 +1362,12 @@ where fn build(&mut self, _: Priority) -> RaftPoller { let sync_write_worker = if self.write_senders.is_empty() { - let (_, rx) = unbounded(); + let (_, rx) = unbounded(None); Some(WriteWorker::new( self.store.get_id(), "sync-writer".to_string(), - self.engines.clone(), + self.engines.raft.clone(), + Some(self.engines.kv.clone()), rx, self.router.clone(), self.trans.clone(), @@ -1280,10 +1404,14 @@ where ready_count: 0, has_ready: false, current_time: None, - perf_context: self - .engines - .kv - .get_perf_context(self.cfg.value().perf_level, PerfContextKind::RaftstoreStore), + raft_perf_context: ER::get_perf_context( + self.cfg.value().perf_level, + PerfContextKind::RaftstoreStore, + ), + kv_perf_context: EK::get_perf_context( + self.cfg.value().perf_level, + PerfContextKind::RaftstoreStore, + ), tick_batch: vec![PeerTickBatch::default(); 
PeerTick::VARIANT_COUNT], node_start_time: Some(TiInstant::now_coarse()), feature_gate: self.feature_gate.clone(), @@ -1291,7 +1419,6 @@ where store_disk_usages: Default::default(), write_senders: self.write_senders.clone(), sync_write_worker, - io_reschedule_concurrent_count: self.io_reschedule_concurrent_count.clone(), pending_latency_inspect: vec![], }; ctx.update_ticks_timeout(); @@ -1300,7 +1427,6 @@ where tag: tag.clone(), store_msg_buf: Vec::with_capacity(ctx.cfg.messages_per_tick), peer_msg_buf: Vec::with_capacity(ctx.cfg.messages_per_tick), - previous_metrics: ctx.raft_metrics.ready.clone(), timer: TiInstant::now(), messages_per_tick: ctx.cfg.messages_per_tick, poll_ctx: ctx, @@ -1308,6 +1434,9 @@ where trace_event: TraceEvent::default(), last_flush_time: TiInstant::now(), need_flush_events: false, + previous_append: 0, + previous_message: 0, + previous_snapshot: 0, } } } @@ -1342,7 +1471,6 @@ where global_replication_state: self.global_replication_state.clone(), feature_gate: self.feature_gate.clone(), write_senders: self.write_senders.clone(), - io_reschedule_concurrent_count: self.io_reschedule_concurrent_count.clone(), } } } @@ -1356,9 +1484,9 @@ struct Workers { // blocking operation, which can take an extensive amount of time. cleanup_worker: Worker, region_worker: Worker, - // Used for calling `purge_expired_files`, which can be time-consuming for certain - // engine implementations. - purge_worker: Worker, + // Used for calling `manual_purge` if the specific engine implementation requires it + // (`need_manual_purge`). 
+ purge_worker: Option, raftlog_fetch_worker: Worker, @@ -1405,7 +1533,7 @@ impl RaftBatchSystem { mgr: SnapManager, pd_worker: LazyWorker>, store_meta: Arc>, - mut coprocessor_host: CoprocessorHost, + coprocessor_host: CoprocessorHost, importer: Arc, split_check_scheduler: Scheduler, background_worker: Worker, @@ -1414,21 +1542,43 @@ impl RaftBatchSystem { concurrency_manager: ConcurrencyManager, collector_reg_handle: CollectorRegHandle, health_service: Option, + causal_ts_provider: Option>, // used for rawkv apiv2 ) -> Result<()> { assert!(self.workers.is_none()); // TODO: we can get cluster meta regularly too later. - - // TODO load coprocessors from configuration - coprocessor_host - .registry - .register_admin_observer(100, BoxAdminObserver::new(SplitObserver)); + let purge_worker = if engines.raft.need_manual_purge() + && !cfg.value().raft_engine_purge_interval.0.is_zero() + { + let worker = Worker::new("purge-worker"); + let raft_clone = engines.raft.clone(); + let router_clone = self.router(); + worker.spawn_interval_task(cfg.value().raft_engine_purge_interval.0, move || { + let _guard = WithIoType::new(IoType::RewriteLog); + match raft_clone.manual_purge() { + Ok(regions) => { + for region_id in regions { + let _ = router_clone.send( + region_id, + PeerMsg::CasualMessage(CasualMessage::ForceCompactRaftLogs), + ); + } + } + Err(e) => { + warn!("purge expired files"; "err" => %e); + } + }; + }); + Some(worker) + } else { + None + }; let workers = Workers { pd_worker, background_worker, cleanup_worker: Worker::new("cleanup-worker"), region_worker: Worker::new("region-worker"), - purge_worker: Worker::new("purge-worker"), + purge_worker, raftlog_fetch_worker: Worker::new("raftlog-fetch-worker"), coprocessor_host: coprocessor_host.clone(), refresh_config_worker: LazyWorker::new("refreash-config-worker"), @@ -1437,9 +1587,7 @@ impl RaftBatchSystem { let region_runner = RegionRunner::new( engines.kv.clone(), mgr.clone(), - cfg.value().snap_apply_batch_size.0 as 
usize, - cfg.value().use_delete_range, - cfg.value().snap_generator_pool_size, + cfg.clone(), workers.coprocessor_host.clone(), self.router(), Some(Arc::clone(&pd_client)), @@ -1455,30 +1603,10 @@ impl RaftBatchSystem { let raftlog_gc_scheduler = workers .background_worker .start_with_timer("raft-gc-worker", raftlog_gc_runner); - let router_clone = self.router(); - let engines_clone = engines.clone(); - workers.purge_worker.spawn_interval_task( - cfg.value().raft_engine_purge_interval.0, - move || { - match engines_clone.raft.purge_expired_files() { - Ok(regions) => { - for region_id in regions { - let _ = router_clone.send( - region_id, - PeerMsg::CasualMessage(CasualMessage::ForceCompactRaftLogs), - ); - } - } - Err(e) => { - warn!("purge expired files"; "err" => %e); - } - }; - }, - ); let raftlog_fetch_scheduler = workers.raftlog_fetch_worker.start( "raftlog-fetch-worker", - RaftlogFetchRunner::new(self.router.clone(), engines.raft.clone()), + ReadRunner::new(self.router.clone(), engines.raft.clone()), ); let compact_runner = CompactRunner::new(engines.kv.clone()); @@ -1504,8 +1632,14 @@ impl RaftBatchSystem { .background_worker .start("consistency-check", consistency_check_runner); - self.store_writers - .spawn(meta.get_id(), &engines, &self.router, &trans, &cfg)?; + self.store_writers.spawn( + meta.get_id(), + engines.raft.clone(), + Some(engines.kv.clone()), + &self.router, + &trans, + &cfg, + )?; let region_read_progress = store_meta.lock().unwrap().region_read_progress.clone(); let mut builder = RaftPollerBuilder { @@ -1530,8 +1664,7 @@ impl RaftBatchSystem { store_meta, pending_create_peers: Arc::new(Mutex::new(HashMap::default())), feature_gate: pd_client.feature_gate().clone(), - write_senders: self.store_writers.senders().clone(), - io_reschedule_concurrent_count: Arc::new(AtomicUsize::new(0)), + write_senders: self.store_writers.senders(), }; let region_peers = builder.init()?; self.start_system::( @@ -1545,6 +1678,7 @@ impl RaftBatchSystem { 
collector_reg_handle, region_read_progress, health_service, + causal_ts_provider, )?; Ok(()) } @@ -1561,6 +1695,7 @@ impl RaftBatchSystem { collector_reg_handle: CollectorRegHandle, region_read_progress: RegionReadProgressRegistry, health_service: Option, + causal_ts_provider: Option>, // used for rawkv apiv2 ) -> Result<()> { let cfg = builder.cfg.value().clone(); let store = builder.store.clone(); @@ -1593,6 +1728,7 @@ impl RaftBatchSystem { let (raft_builder, apply_builder) = (builder.clone(), apply_poller_builder.clone()); let tag = format!("raftstore-{}", store.get_id()); + let coprocessor_host = builder.coprocessor_host.clone(); self.system.spawn(tag, builder); let mut mailboxes = Vec::with_capacity(region_peers.len()); let mut address = Vec::with_capacity(region_peers.len()); @@ -1619,6 +1755,15 @@ impl RaftBatchSystem { .spawn("apply".to_owned(), apply_poller_builder); let refresh_config_runner = RefreshConfigRunner::new( + StoreWritersContext { + store_id: store.get_id(), + notifier: self.router.clone(), + raft_engine: raft_builder.engines.raft.clone(), + kv_engine: Some(raft_builder.engines.kv.clone()), + transfer: raft_builder.trans.clone(), + cfg: raft_builder.cfg.clone(), + }, + self.store_writers.clone(), self.apply_router.router.clone(), self.router.router.clone(), self.apply_system.build_pool_state(apply_builder), @@ -1632,7 +1777,6 @@ impl RaftBatchSystem { Arc::clone(&pd_client), self.router.clone(), workers.pd_worker.scheduler(), - cfg.pd_store_heartbeat_tick_interval.0, auto_split_controller, concurrency_manager, snap_mgr, @@ -1640,6 +1784,8 @@ impl RaftBatchSystem { collector_reg_handle, region_read_progress, health_service, + coprocessor_host, + causal_ts_provider, ); assert!(workers.pd_worker.start_with_timer(pd_runner)); @@ -1675,7 +1821,9 @@ impl RaftBatchSystem { workers.cleanup_worker.stop(); workers.region_worker.stop(); workers.background_worker.stop(); - workers.purge_worker.stop(); + if let Some(w) = workers.purge_worker { + w.stop(); 
+ } workers.refresh_config_worker.stop(); workers.raftlog_fetch_worker.stop(); } @@ -1683,11 +1831,21 @@ impl RaftBatchSystem { pub fn create_raft_batch_system( cfg: &Config, + resource_manager: &Option>, ) -> (RaftRouter, RaftBatchSystem) { let (store_tx, store_fsm) = StoreFsm::new(cfg); - let (apply_router, apply_system) = create_apply_batch_system(cfg); - let (router, system) = - batch_system::create_system(&cfg.store_batch_system, store_tx, store_fsm); + let (apply_router, apply_system) = create_apply_batch_system( + cfg, + resource_manager + .as_ref() + .map(|m| m.derive_controller("apply".to_owned(), false)), + ); + let (router, system) = batch_system::create_system( + &cfg.store_batch_system, + store_tx, + store_fsm, + None, // Do not do priority scheduling for store batch system + ); let raft_router = RaftRouter { router }; let system = RaftBatchSystem { system, @@ -1695,7 +1853,11 @@ pub fn create_raft_batch_system( apply_router, apply_system, router: raft_router.clone(), - store_writers: StoreWriters::new(), + store_writers: StoreWriters::new( + resource_manager + .as_ref() + .map(|m| m.derive_controller("store-writer".to_owned(), false)), + ), }; (raft_router, system) } @@ -1732,12 +1894,15 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER if local_state.get_state() != PeerState::Tombstone { // Maybe split, but not registered yet. 
if !util::is_first_message(msg.get_message()) { - self.ctx.raft_metrics.message_dropped.region_nonexistent += 1; - return Err(box_err!( - "[region {}] region not exist but not tombstone: {:?}", + self.ctx + .raft_metrics + .message_dropped + .region_nonexistent + .inc(); + return Err(Error::RegionNotRegistered { region_id, - local_state - )); + local_state, + }); } info!( "region doesn't exist yet, wait for it to be split"; @@ -1760,7 +1925,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER "msg_type" => ?msg_type, ); - let merge_target = if let Some(peer) = util::find_peer(region, from_store_id) { + let merge_target = if let Some(peer) = find_peer(region, from_store_id) { // Maybe the target is promoted from learner to voter, but the follower // doesn't know it. So we only compare peer id. if peer.get_id() < msg.get_from_peer().get_id() { @@ -1785,7 +1950,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } // The region in this peer is already destroyed if util::is_epoch_stale(from_epoch, region_epoch) { - self.ctx.raft_metrics.message_dropped.region_tombstone_peer += 1; + self.ctx + .raft_metrics + .message_dropped + .region_tombstone_peer + .inc(); info!( "tombstone peer receives a stale message"; "region_id" => region_id, @@ -1793,13 +1962,13 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER "current_region_epoch" => ?region_epoch, "msg_type" => ?msg_type, ); - if util::find_peer(region, from_store_id).is_none() { + if find_peer(region, from_store_id).is_none() { self.ctx.handle_stale_msg(msg, region_epoch.clone(), None); } else { let mut need_gc_msg = util::is_vote_msg(msg.get_message()); if msg.has_extra_msg() { - // A learner can't vote so it sends the check-stale-peer msg to others to find out whether - // it is removed due to conf change or merge. 
+ // A learner can't vote so it sends the check-stale-peer msg to others to find + // out whether it is removed due to conf change or merge. need_gc_msg |= msg.get_extra_msg().get_type() == ExtraMessageType::MsgCheckStalePeer; // For backward compatibility @@ -1827,13 +1996,16 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER return Ok(CheckMsgStatus::DropMsg); } // A tombstone peer may not apply the conf change log which removes itself. - // In this case, the local epoch is stale and the local peer can be found from region. - // We can compare the local peer id with to_peer_id to verify whether it is correct to create a new peer. - if let Some(local_peer_id) = - util::find_peer(region, self.ctx.store_id()).map(|r| r.get_id()) - { + // In this case, the local epoch is stale and the local peer can be found from + // region. We can compare the local peer id with to_peer_id to verify whether it + // is correct to create a new peer. + if let Some(local_peer_id) = find_peer(region, self.ctx.store_id()).map(|r| r.get_id()) { if to_peer_id <= local_peer_id { - self.ctx.raft_metrics.message_dropped.region_tombstone_peer += 1; + self.ctx + .raft_metrics + .message_dropped + .region_tombstone_peer + .inc(); info!( "tombstone peer receives a stale message, local_peer_id >= to_peer_id in msg"; "region_id" => region_id, @@ -1881,7 +2053,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER "to_store_id" => msg.get_to_peer().get_store_id(), "region_id" => region_id, ); - self.ctx.raft_metrics.message_dropped.mismatch_store_id += 1; + self.ctx + .raft_metrics + .message_dropped + .mismatch_store_id + .inc(); return Ok(()); } @@ -1890,9 +2066,30 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER "missing epoch in raft message, ignore it"; "region_id" => region_id, ); - self.ctx.raft_metrics.message_dropped.mismatch_region_epoch += 1; + self.ctx + .raft_metrics + .message_dropped + 
.mismatch_region_epoch + .inc(); + return Ok(()); + } + + // To make learner (e.g. tiflash engine) compatible with raftstore v2, + // it needs to respond with GcPeerResponse. + if msg.get_is_tombstone() && self.ctx.cfg.enable_v2_compatible_learner { + if let Some(msg) = + handle_tombstone_message_on_learner(&self.ctx.engines.kv, self.fsm.store.id, msg) + { + let _ = self.ctx.trans.send(msg); + } + // else { + // TODO: we should create the peer and destroy immediately to leave + // a tombstone record, otherwise it leaks removed_record + // and merged_record. + // } return Ok(()); } + if msg.get_is_tombstone() || msg.has_merge_target() { // Target tombstone peer doesn't exist, so ignore it. return Ok(()); @@ -1960,7 +2157,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER "region_id" => region_id, "msg_type" => ?msg_type, ); - self.ctx.raft_metrics.message_dropped.stale_msg += 1; + self.ctx.raft_metrics.message_dropped.stale_msg.inc(); return Ok(false); } @@ -1973,7 +2170,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } let res = self.maybe_create_peer_internal(region_id, msg, is_local_first); - // If failed, i.e. Err or Ok(false), remove this peer data from `pending_create_peers`. + // If failed, i.e. Err or Ok(false), remove this peer data from + // `pending_create_peers`. if res.as_ref().map_or(true, |b| !*b) && is_local_first { let mut pending_create_peers = self.ctx.pending_create_peers.lock().unwrap(); if let Some(status) = pending_create_peers.get(&region_id) { @@ -2014,13 +2212,16 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER let pending_create_peers = self.ctx.pending_create_peers.lock().unwrap(); match pending_create_peers.get(&region_id) { Some(status) if *status == (msg.get_to_peer().get_id(), false) => (), - // If changed, it means this peer has been/will be replaced from the new one from splitting.
+ // If changed, it means this peer has been/will be replaced by the new one from + // splitting. _ => return Ok(false), } - // Note that `StoreMeta` lock is held and status is (peer_id, false) in `pending_create_peers` now. - // If this peer is created from splitting latter and then status in `pending_create_peers` is changed, - // that peer creation in `on_ready_split_region` must be executed **after** current peer creation - // because of the `StoreMeta` lock. + // Note that `StoreMeta` lock is held and status is (peer_id, false) + // in `pending_create_peers` now. If this peer is created from + // splitting later and then status in `pending_create_peers` is + // changed, that peer creation in `on_ready_split_region` must be + // executed **after** current peer creation because of the + // `StoreMeta` lock. } if meta.overlap_damaged_range( @@ -2089,8 +2290,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER is_overlapped = true; if msg.get_region_epoch().get_version() > exist_region.get_region_epoch().get_version() { - // If new region's epoch version is greater than exist region's, the exist region - // may has been merged/splitted already. + // If new region's epoch version is greater than exist region's, the exist + // region may have been merged/split already. 
let _ = self.ctx.router.force_send( exist_region.get_id(), PeerMsg::CasualMessage(CasualMessage::RegionOverlapped), @@ -2099,7 +2300,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } if is_overlapped { - self.ctx.raft_metrics.message_dropped.region_overlap += 1; + self.ctx.raft_metrics.message_dropped.region_overlap.inc(); return Ok(false); } @@ -2132,7 +2333,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER // Now all checking passed let mut replication_state = self.ctx.global_replication_state.lock().unwrap(); - peer.peer.init_replication_mode(&mut *replication_state); + peer.peer.init_replication_mode(&mut replication_state); drop(replication_state); peer.peer.local_first_replicate = is_local_first; @@ -2294,6 +2495,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER let snap_stats = self.ctx.snap_mgr.stats(); stats.set_sending_snap_count(snap_stats.sending_count as u32); stats.set_receiving_snap_count(snap_stats.receiving_count as u32); + stats.set_snapshot_stats(snap_stats.stats.into()); + STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC .with_label_values(&["sending"]) .set(snap_stats.sending_count as i64); @@ -2309,14 +2512,14 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .global_stat .stat .engine_total_bytes_written - .swap(0, Ordering::SeqCst), + .swap(0, Ordering::Relaxed), ); stats.set_keys_written( self.ctx .global_stat .stat .engine_total_keys_written - .swap(0, Ordering::SeqCst), + .swap(0, Ordering::Relaxed), ); stats.set_is_busy( @@ -2324,7 +2527,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .global_stat .stat .is_busy - .swap(false, Ordering::SeqCst), + .swap(false, Ordering::Relaxed), ); let mut query_stats = QueryStats::default(); @@ -2333,29 +2536,29 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .global_stat .stat .engine_total_query_put - 
.swap(0, Ordering::SeqCst), + .swap(0, Ordering::Relaxed), ); query_stats.set_delete( self.ctx .global_stat .stat .engine_total_query_delete - .swap(0, Ordering::SeqCst), + .swap(0, Ordering::Relaxed), ); query_stats.set_delete_range( self.ctx .global_stat .stat .engine_total_query_delete_range - .swap(0, Ordering::SeqCst), + .swap(0, Ordering::Relaxed), ); stats.set_query_stats(query_stats); - let store_info = StoreInfo { + let store_info = Some(StoreInfo { kv_engine: self.ctx.engines.kv.clone(), raft_engine: self.ctx.engines.raft.clone(), capacity: self.ctx.cfg.capacity.0, - }; + }); let task = PdTask::StoreHeartbeat { stats, @@ -2438,6 +2641,45 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.register_compact_lock_cf_tick(); } + fn on_wake_up_regions(&self, abnormal_stores: Vec) { + info!("try to wake up all hibernated regions in this store"; + "to_all" => abnormal_stores.is_empty()); + let meta = self.ctx.store_meta.lock().unwrap(); + for region_id in meta.regions.keys() { + let region = &meta.regions[region_id]; + // Check whether the current region is not found on abnormal stores. If so, + // this region is not the target to be awaken. + if !region_on_stores(region, &abnormal_stores) { + continue; + } + let peer = { + match find_peer(region, self.ctx.store_id()) { + None => continue, + Some(p) => p.clone(), + } + }; + { + // Send MsgRegionWakeUp to Peer for awakening hibernated regions. 
+ let mut message = RaftMessage::default(); + message.set_region_id(*region_id); + message.set_from_peer(peer.clone()); + message.set_to_peer(peer); + message.set_region_epoch(region.get_region_epoch().clone()); + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgRegionWakeUp); + msg.forcely_awaken = true; + message.set_extra_msg(msg); + if let Err(e) = self.ctx.router.send_raft_message(message) { + error!( + "send awaken region message failed"; + "region_id" => region_id, + "err" => ?e + ); + } + } + } + } + fn register_pd_store_heartbeat_tick(&self) { self.ctx.schedule_store_tick( StoreTick::PdStoreHeartbeat, @@ -2531,9 +2773,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } - // When there is an import job running, the region which this sst belongs may has not been - // split from the origin region because the apply thread is so busy that it can not apply - // SplitRequest as soon as possible. So we can not delete this sst file. + // When there is an import job running, the region which this sst belongs may + // has not been split from the origin region because the apply thread is so busy + // that it can not apply SplitRequest as soon as possible. So we can not + // delete this sst file. 
if !validate_ssts.is_empty() && self.ctx.importer.get_mode() != SwitchMode::Import { let task = CleanupSstTask::ValidateSst { ssts: validate_ssts, @@ -2586,7 +2829,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER if target_region_id == 0 { return; } - match util::find_peer(&meta.regions[&target_region_id], self.ctx.store_id()) { + match find_peer(&meta.regions[&target_region_id], self.ctx.store_id()) { None => return, Some(p) => p.clone(), } @@ -2655,22 +2898,36 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER fn on_store_unreachable(&mut self, store_id: u64) { let now = Instant::now(); - if self - .fsm - .store - .last_unreachable_report - .get(&store_id) - .map_or(UNREACHABLE_BACKOFF, |t| now.saturating_duration_since(*t)) - < UNREACHABLE_BACKOFF - { - return; - } + let unreachable_backoff = self.ctx.cfg.unreachable_backoff.0; + let new_messages = MESSAGE_RECV_BY_STORE + .with_label_values(&[&format!("{}", store_id)]) + .get(); + match self.fsm.store.store_reachability.entry(store_id) { + HashMapEntry::Vacant(x) => { + x.insert(StoreReachability { + last_broadcast: now, + received_message_count: new_messages, + }); + } + HashMapEntry::Occupied(x) => { + let ob = x.into_mut(); + if now.saturating_duration_since(ob.last_broadcast) < unreachable_backoff + // If there are no new messages come from `store_id`, it's not + // necessary to do redundant broadcasts. + || (new_messages <= ob.received_message_count && new_messages > 0) + { + return; + } + ob.last_broadcast = now; + ob.received_message_count = new_messages; + } + }; + info!( "broadcasting unreachable"; "store_id" => self.fsm.store.id, "unreachable_store_id" => store_id, ); - self.fsm.store.last_unreachable_report.insert(store_id, now); // It's possible to acquire the lock and only send notification to // involved regions. However loop over all the regions can take a // lot of time, which may block other operations. 
@@ -2740,6 +2997,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.raftlog_fetch_scheduler.clone(), self.ctx.engines.clone(), &region, + false, ) { Ok((sender, peer)) => (sender, peer), Err(e) => { @@ -2752,7 +3010,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } }; let mut replication_state = self.ctx.global_replication_state.lock().unwrap(); - peer.peer.init_replication_mode(&mut *replication_state); + peer.peer.init_replication_mode(&mut replication_state); drop(replication_state); peer.peer.activate(self.ctx); @@ -2782,7 +3040,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } drop(meta); - if let Err(e) = self.ctx.engines.kv.delete_all_in_range( + if let Err(e) = self.ctx.engines.kv.delete_ranges_cfs( DeleteStrategy::DeleteByKey, &[Range::new(&start_key, &end_key)], ) { diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index d6e6dc265bc..baf63814416 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -6,111 +6,51 @@ use std::sync::{Arc, Mutex}; use collections::HashSet; use prometheus::local::LocalHistogram; use raft::eraftpb::MessageType; +use tikv_util::time::{Duration, Instant}; +use tracker::{Tracker, TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; use super::metrics::*; -/// The buffered metrics counters for raft ready handling.
-#[derive(Debug, Default, Clone)] -pub struct RaftReadyMetrics { - pub message: u64, - pub commit: u64, - pub append: u64, - pub snapshot: u64, - pub pending_region: u64, - pub has_ready_region: u64, -} - -impl RaftReadyMetrics { - /// Flushes all metrics - fn flush(&mut self) { - // reset all buffered metrics once they have been added - if self.message > 0 { - STORE_RAFT_READY_COUNTER.message.inc_by(self.message); - self.message = 0; - } - if self.commit > 0 { - STORE_RAFT_READY_COUNTER.commit.inc_by(self.commit); - self.commit = 0; - } - if self.append > 0 { - STORE_RAFT_READY_COUNTER.append.inc_by(self.append); - self.append = 0; - } - if self.snapshot > 0 { - STORE_RAFT_READY_COUNTER.snapshot.inc_by(self.snapshot); - self.snapshot = 0; - } - if self.pending_region > 0 { - STORE_RAFT_READY_COUNTER - .pending_region - .inc_by(self.pending_region); - self.pending_region = 0; - } - if self.has_ready_region > 0 { - STORE_RAFT_READY_COUNTER - .has_ready_region - .inc_by(self.has_ready_region); - self.has_ready_region = 0; - } - } -} +const METRICS_FLUSH_INTERVAL: u64 = 10_000; // 10s -pub type SendStatus = [u64; 2]; - -macro_rules! flush_send_status { - ($metrics:ident, $self:ident) => {{ - if $self.$metrics[0] > 0 { - STORE_RAFT_SENT_MESSAGE_COUNTER - .$metrics - .drop - .inc_by($self.$metrics[0]); - $self.$metrics[0] = 0; - } - if $self.$metrics[1] > 0 { - STORE_RAFT_SENT_MESSAGE_COUNTER - .$metrics - .accept - .inc_by($self.$metrics[1]); - $self.$metrics[1] = 0; +macro_rules! set_send_status { + ($metrics:expr, $success:ident) => {{ + if $success { + $metrics.accept.inc(); + } else { + $metrics.drop.inc(); } }}; } -/// The buffered metrics counters for raft message. 
-#[derive(Debug, Default, Clone)] -pub struct RaftSendMessageMetrics { - pub append: SendStatus, - pub append_resp: SendStatus, - pub prevote: SendStatus, - pub prevote_resp: SendStatus, - pub vote: SendStatus, - pub vote_resp: SendStatus, - pub snapshot: SendStatus, - pub heartbeat: SendStatus, - pub heartbeat_resp: SendStatus, - pub transfer_leader: SendStatus, - pub timeout_now: SendStatus, - pub read_index: SendStatus, - pub read_index_resp: SendStatus, +pub struct RaftSendMessageMetrics(RaftSentMessageCounterVec); + +impl Default for RaftSendMessageMetrics { + fn default() -> Self { + Self(RaftSentMessageCounterVec::from( + &STORE_RAFT_SENT_MESSAGE_COUNTER_VEC, + )) + } } impl RaftSendMessageMetrics { pub fn add(&mut self, msg_type: MessageType, success: bool) { - let i = success as usize; match msg_type { - MessageType::MsgAppend => self.append[i] += 1, - MessageType::MsgAppendResponse => self.append_resp[i] += 1, - MessageType::MsgRequestPreVote => self.prevote[i] += 1, - MessageType::MsgRequestPreVoteResponse => self.prevote_resp[i] += 1, - MessageType::MsgRequestVote => self.vote[i] += 1, - MessageType::MsgRequestVoteResponse => self.vote_resp[i] += 1, - MessageType::MsgSnapshot => self.snapshot[i] += 1, - MessageType::MsgHeartbeat => self.heartbeat[i] += 1, - MessageType::MsgHeartbeatResponse => self.heartbeat_resp[i] += 1, - MessageType::MsgTransferLeader => self.transfer_leader[i] += 1, - MessageType::MsgReadIndex => self.read_index[i] += 1, - MessageType::MsgReadIndexResp => self.read_index_resp[i] += 1, - MessageType::MsgTimeoutNow => self.timeout_now[i] += 1, + MessageType::MsgAppend => set_send_status!(self.0.append, success), + MessageType::MsgAppendResponse => set_send_status!(self.0.append_resp, success), + MessageType::MsgRequestPreVote => set_send_status!(self.0.prevote, success), + MessageType::MsgRequestPreVoteResponse => { + set_send_status!(self.0.prevote_resp, success) + } + MessageType::MsgRequestVote => set_send_status!(self.0.vote, 
success), + MessageType::MsgRequestVoteResponse => set_send_status!(self.0.vote_resp, success), + MessageType::MsgSnapshot => set_send_status!(self.0.snapshot, success), + MessageType::MsgHeartbeat => set_send_status!(self.0.heartbeat, success), + MessageType::MsgHeartbeatResponse => set_send_status!(self.0.heartbeat_resp, success), + MessageType::MsgTransferLeader => set_send_status!(self.0.transfer_leader, success), + MessageType::MsgReadIndex => set_send_status!(self.0.read_index, success), + MessageType::MsgReadIndexResp => set_send_status!(self.0.read_index_resp, success), + MessageType::MsgTimeoutNow => set_send_status!(self.0.timeout_now, success), // We do not care about these message types for metrics. // Explicitly declare them so when we add new message types we are forced to // decide. @@ -122,346 +62,114 @@ impl RaftSendMessageMetrics { | MessageType::MsgCheckQuorum => {} } } - /// Flushes all metrics - pub fn flush(&mut self) { - // reset all buffered metrics once they have been added - flush_send_status!(append, self); - flush_send_status!(append_resp, self); - flush_send_status!(prevote, self); - flush_send_status!(prevote_resp, self); - flush_send_status!(vote, self); - flush_send_status!(vote_resp, self); - flush_send_status!(snapshot, self); - flush_send_status!(heartbeat, self); - flush_send_status!(heartbeat_resp, self); - flush_send_status!(transfer_leader, self); - flush_send_status!(timeout_now, self); - flush_send_status!(read_index, self); - flush_send_status!(read_index_resp, self); - } -} - -#[derive(Debug, Default, Clone)] -pub struct RaftMessageDropMetrics { - pub mismatch_store_id: u64, - pub mismatch_region_epoch: u64, - pub stale_msg: u64, - pub region_overlap: u64, - pub region_no_peer: u64, - pub region_tombstone_peer: u64, - pub region_nonexistent: u64, - pub applying_snap: u64, - pub disk_full: u64, -} - -impl RaftMessageDropMetrics { - fn flush(&mut self) { - if self.mismatch_store_id > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER 
- .mismatch_store_id - .inc_by(self.mismatch_store_id); - self.mismatch_store_id = 0; - } - if self.mismatch_region_epoch > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - .mismatch_region_epoch - .inc_by(self.mismatch_region_epoch); - self.mismatch_region_epoch = 0; - } - if self.stale_msg > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - .stale_msg - .inc_by(self.stale_msg); - self.stale_msg = 0; - } - if self.region_overlap > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - .region_overlap - .inc_by(self.region_overlap); - self.region_overlap = 0; - } - if self.region_no_peer > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - .region_no_peer - .inc_by(self.region_no_peer); - self.region_no_peer = 0; - } - if self.region_tombstone_peer > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - .region_tombstone_peer - .inc_by(self.region_tombstone_peer); - self.region_tombstone_peer = 0; - } - if self.region_nonexistent > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - .region_nonexistent - .inc_by(self.region_nonexistent); - self.region_nonexistent = 0; - } - if self.applying_snap > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - .applying_snap - .inc_by(self.applying_snap); - self.applying_snap = 0; - } - if self.disk_full > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - .disk_full - .inc_by(self.disk_full); - self.disk_full = 0; - } - } -} - -/// The buffered metrics counters for raft propose. 
-#[derive(Clone)] -pub struct RaftProposeMetrics { - pub all: u64, - pub local_read: u64, - pub read_index: u64, - pub unsafe_read_index: u64, - pub dropped_read_index: u64, - pub normal: u64, - pub batch: usize, - pub transfer_leader: u64, - pub conf_change: u64, - pub request_wait_time: LocalHistogram, -} - -impl Default for RaftProposeMetrics { - fn default() -> RaftProposeMetrics { - RaftProposeMetrics { - all: 0, - local_read: 0, - read_index: 0, - unsafe_read_index: 0, - normal: 0, - transfer_leader: 0, - conf_change: 0, - batch: 0, - dropped_read_index: 0, - request_wait_time: REQUEST_WAIT_TIME_HISTOGRAM.local(), - } - } -} - -impl RaftProposeMetrics { - /// Flushes all metrics - fn flush(&mut self) { - // reset all buffered metrics once they have been added - if self.all > 0 { - PEER_PROPOSAL_COUNTER.all.inc_by(self.all); - self.all = 0; - } - if self.local_read > 0 { - PEER_PROPOSAL_COUNTER.local_read.inc_by(self.local_read); - self.local_read = 0; - } - if self.read_index > 0 { - PEER_PROPOSAL_COUNTER.read_index.inc_by(self.read_index); - self.read_index = 0; - } - if self.unsafe_read_index > 0 { - PEER_PROPOSAL_COUNTER - .unsafe_read_index - .inc_by(self.unsafe_read_index); - self.unsafe_read_index = 0; - } - if self.dropped_read_index > 0 { - PEER_PROPOSAL_COUNTER - .dropped_read_index - .inc_by(self.dropped_read_index); - self.dropped_read_index = 0; - } - if self.normal > 0 { - PEER_PROPOSAL_COUNTER.normal.inc_by(self.normal); - self.normal = 0; - } - if self.transfer_leader > 0 { - PEER_PROPOSAL_COUNTER - .transfer_leader - .inc_by(self.transfer_leader); - self.transfer_leader = 0; - } - if self.conf_change > 0 { - PEER_PROPOSAL_COUNTER.conf_change.inc_by(self.conf_change); - self.conf_change = 0; - } - if self.batch > 0 { - PEER_PROPOSAL_COUNTER.batch.inc_by(self.batch as u64); - self.batch = 0; - } - self.request_wait_time.flush(); - } -} - -/// The buffered metrics counter for invalid propose -#[derive(Clone, Default)] -pub struct 
RaftInvalidProposeMetrics { - pub mismatch_store_id: u64, - pub region_not_found: u64, - pub not_leader: u64, - pub mismatch_peer_id: u64, - pub stale_command: u64, - pub epoch_not_match: u64, - pub read_index_no_leader: u64, - pub region_not_initialized: u64, - pub is_applying_snapshot: u64, -} - -impl RaftInvalidProposeMetrics { - fn flush(&mut self) { - if self.mismatch_store_id > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .mismatch_store_id - .inc_by(self.mismatch_store_id); - self.mismatch_store_id = 0; - } - if self.region_not_found > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .region_not_found - .inc_by(self.region_not_found); - self.region_not_found = 0; - } - if self.not_leader > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .not_leader - .inc_by(self.not_leader); - self.not_leader = 0; - } - if self.mismatch_peer_id > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .mismatch_peer_id - .inc_by(self.mismatch_peer_id); - self.mismatch_peer_id = 0; - } - if self.stale_command > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .stale_command - .inc_by(self.stale_command); - self.stale_command = 0; - } - if self.epoch_not_match > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .epoch_not_match - .inc_by(self.epoch_not_match); - self.epoch_not_match = 0; - } - if self.read_index_no_leader > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .read_index_no_leader - .inc_by(self.read_index_no_leader); - self.read_index_no_leader = 0; - } - if self.region_not_initialized > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .region_not_initialized - .inc_by(self.region_not_initialized); - self.region_not_initialized = 0; - } - if self.is_applying_snapshot > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .is_applying_snapshot - .inc_by(self.is_applying_snapshot); - self.is_applying_snapshot = 0; - } - } -} -#[derive(Clone, Default)] -pub struct RaftLogGcSkippedMetrics { - pub reserve_log: u64, - pub threshold_limit: u64, - pub compact_idx_too_small: u64, -} - -impl RaftLogGcSkippedMetrics { - fn flush(&mut self) { - if self.reserve_log > 0 { - 
RAFT_LOG_GC_SKIPPED.reserve_log.inc_by(self.reserve_log); - self.reserve_log = 0; - } - if self.threshold_limit > 0 { - RAFT_LOG_GC_SKIPPED - .threshold_limit - .inc_by(self.threshold_limit); - self.threshold_limit = 0; - } - if self.compact_idx_too_small > 0 { - RAFT_LOG_GC_SKIPPED - .compact_idx_too_small - .inc_by(self.compact_idx_too_small); - self.compact_idx_too_small = 0; - } + pub fn flush(&mut self) { + self.0.flush(); } } /// The buffered metrics counters for raft. -#[derive(Clone)] pub struct RaftMetrics { - pub store_time: LocalHistogram, - pub ready: RaftReadyMetrics, + // local counter + pub ready: RaftReadyCounterVec, pub send_message: RaftSendMessageMetrics, - pub message_dropped: RaftMessageDropMetrics, - pub propose: RaftProposeMetrics, + pub message_dropped: RaftDroppedMessageCounterVec, + pub propose: RaftProposalCounterVec, + pub invalid_proposal: RaftInvalidProposalCounterVec, + pub raft_log_gc_skipped: RaftLogGcSkippedCounterVec, + + // local histogram + pub store_time: LocalHistogram, + pub propose_wait_time: LocalHistogram, pub process_ready: LocalHistogram, + pub event_time: RaftEventDurationVec, + pub peer_msg_len: LocalHistogram, pub commit_log: LocalHistogram, - pub leader_missing: Arc>>, - pub invalid_proposal: RaftInvalidProposeMetrics, pub write_block_wait: LocalHistogram, + pub propose_log_size: LocalHistogram, + + // waterfall metrics pub waterfall_metrics: bool, pub wf_batch_wait: LocalHistogram, pub wf_send_to_queue: LocalHistogram, + pub wf_send_proposal: LocalHistogram, pub wf_persist_log: LocalHistogram, pub wf_commit_log: LocalHistogram, pub wf_commit_not_persist_log: LocalHistogram, - pub raft_log_gc_skipped: RaftLogGcSkippedMetrics, + + pub leader_missing: Arc>>, + + last_flush_time: Instant, } impl RaftMetrics { pub fn new(waterfall_metrics: bool) -> Self { Self { + ready: RaftReadyCounterVec::from(&STORE_RAFT_READY_COUNTER_VEC), + send_message: RaftSendMessageMetrics::default(), + message_dropped: 
RaftDroppedMessageCounterVec::from( + &STORE_RAFT_DROPPED_MESSAGE_COUNTER_VEC, + ), + propose: RaftProposalCounterVec::from(&PEER_PROPOSAL_COUNTER_VEC), + invalid_proposal: RaftInvalidProposalCounterVec::from( + &RAFT_INVALID_PROPOSAL_COUNTER_VEC, + ), + raft_log_gc_skipped: RaftLogGcSkippedCounterVec::from(&RAFT_LOG_GC_SKIPPED_VEC), store_time: STORE_TIME_HISTOGRAM.local(), - ready: Default::default(), - send_message: Default::default(), - message_dropped: Default::default(), - propose: Default::default(), + propose_wait_time: REQUEST_WAIT_TIME_HISTOGRAM.local(), process_ready: PEER_RAFT_PROCESS_DURATION .with_label_values(&["ready"]) .local(), + event_time: RaftEventDurationVec::from(&RAFT_EVENT_DURATION_VEC), + peer_msg_len: PEER_MSG_LEN.local(), commit_log: PEER_COMMIT_LOG_HISTOGRAM.local(), - leader_missing: Arc::default(), - invalid_proposal: Default::default(), write_block_wait: STORE_WRITE_MSG_BLOCK_WAIT_DURATION_HISTOGRAM.local(), + propose_log_size: PEER_PROPOSE_LOG_SIZE_HISTOGRAM.local(), waterfall_metrics, wf_batch_wait: STORE_WF_BATCH_WAIT_DURATION_HISTOGRAM.local(), wf_send_to_queue: STORE_WF_SEND_TO_QUEUE_DURATION_HISTOGRAM.local(), + wf_send_proposal: STORE_WF_SEND_PROPOSAL_DURATION_HISTOGRAM.local(), wf_persist_log: STORE_WF_PERSIST_LOG_DURATION_HISTOGRAM.local(), wf_commit_log: STORE_WF_COMMIT_LOG_DURATION_HISTOGRAM.local(), wf_commit_not_persist_log: STORE_WF_COMMIT_NOT_PERSIST_LOG_DURATION_HISTOGRAM.local(), - raft_log_gc_skipped: RaftLogGcSkippedMetrics::default(), + leader_missing: Arc::default(), + last_flush_time: Instant::now_coarse(), } } - /// Flushs all metrics - pub fn flush(&mut self) { - self.store_time.flush(); + /// Flushes all metrics + pub fn maybe_flush(&mut self) { + if self.last_flush_time.saturating_elapsed() < Duration::from_millis(METRICS_FLUSH_INTERVAL) + { + return; + } + self.last_flush_time = Instant::now_coarse(); + self.ready.flush(); self.send_message.flush(); + self.message_dropped.flush(); self.propose.flush(); + 
self.invalid_proposal.flush(); + self.raft_log_gc_skipped.flush(); + + self.store_time.flush(); + self.propose_wait_time.flush(); self.process_ready.flush(); + self.event_time.flush(); + self.peer_msg_len.flush(); self.commit_log.flush(); - self.message_dropped.flush(); - self.invalid_proposal.flush(); self.write_block_wait.flush(); - self.raft_log_gc_skipped.flush(); + self.propose_log_size.flush(); + if self.waterfall_metrics { self.wf_batch_wait.flush(); self.wf_send_to_queue.flush(); + self.wf_send_proposal.flush(); self.wf_persist_log.flush(); self.wf_commit_log.flush(); self.wf_commit_not_persist_log.flush(); } + let mut missing = self.leader_missing.lock().unwrap(); LEADER_MISSING.set(missing.len() as i64); missing.clear(); @@ -496,3 +204,65 @@ impl StoreWriteMetrics { } } } + +/// Tracker for the durations of a raftstore request. +/// If a global tracker is not available, it will fall back to an Instant. +#[derive(Debug, Clone, Copy)] +pub struct TimeTracker { + token: TrackerToken, + start: std::time::Instant, +} + +impl Default for TimeTracker { + #[inline] + fn default() -> Self { + let token = tracker::get_tls_tracker_token(); + let start = std::time::Instant::now(); + let tracker = TimeTracker { token, start }; + if token == INVALID_TRACKER_TOKEN { + return tracker; + } + + GLOBAL_TRACKERS.with_tracker(token, |tracker| { + tracker.metrics.write_instant = Some(start); + }); + tracker + } +} + +impl TimeTracker { + #[inline] + pub fn as_tracker_token(&self) -> Option<TrackerToken> { + if self.token == INVALID_TRACKER_TOKEN { + None + } else { + Some(self.token) + } + } + + #[inline] + pub fn observe( + &self, + now: std::time::Instant, + local_metric: &LocalHistogram, + tracker_metric: impl FnOnce(&mut Tracker) -> &mut u64, + ) -> u64 { + let dur = now.saturating_duration_since(self.start); + local_metric.observe(dur.as_secs_f64()); + if self.token == INVALID_TRACKER_TOKEN { + return 0; + } + GLOBAL_TRACKERS.with_tracker(self.token, |tracker| { + let metric = 
tracker_metric(tracker); + if *metric == 0 { + *metric = dur.as_nanos() as u64; + } + }); + dur.as_nanos() as u64 + } + + #[inline] + pub fn reset(&mut self, start: std::time::Instant) { + self.start = start; + } +} diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index 3a4426fcbcb..c69875ae998 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -15,17 +15,6 @@ make_auto_flush_static_metric! { write_thread_wait, db_mutex_lock_nanos, } - pub label_enum ProposalType { - all, - local_read, - read_index, - unsafe_read_index, - normal, - transfer_leader, - conf_change, - batch, - dropped_read_index, - } pub label_enum WriteCmdType { put, @@ -44,7 +33,10 @@ make_auto_flush_static_metric! { commit_merge, rollback_merge, compact, - transfer_leader + transfer_leader, + prepare_flashback, + finish_flashback, + batch_switch_witness : "batch-switch-witness", } pub label_enum AdminCmdStatus { @@ -53,47 +45,11 @@ make_auto_flush_static_metric! { success, } - pub label_enum RaftReadyType { - message, - commit, - append, - snapshot, - pending_region, - has_ready_region, - } - - pub label_enum MessageCounterType { - append, - append_resp, - prevote, - prevote_resp, - vote, - vote_resp, - snapshot, - heartbeat, - heartbeat_resp, - transfer_leader, - timeout_now, - read_index, - read_index_resp, - } - - pub label_enum RaftDroppedMessage { - mismatch_store_id, - mismatch_region_epoch, - stale_msg, - region_overlap, - region_no_peer, - region_tombstone_peer, - region_nonexistent, - applying_snap, - disk_full, - } - pub label_enum SnapValidationType { stale, decode, epoch, + cancel, } pub label_enum RegionHashType { @@ -126,25 +82,10 @@ make_auto_flush_static_metric! 
{ fetch_unused, } - pub label_enum RaftInvalidProposal { - mismatch_store_id, - region_not_found, - not_leader, - mismatch_peer_id, - stale_command, - epoch_not_match, - read_index_no_leader, - region_not_initialized, - is_applying_snapshot, - } - pub label_enum RaftEventDurationType { - compact_check, - pd_store_heartbeat, - snap_gc, - compact_lock_cf, - consistency_check, - cleanup_import_sst, - raft_engine_purge, + pub label_enum WarmUpEntryCacheType { + started, + timeout, + finished, } pub label_enum CompactionGuardAction { @@ -154,26 +95,14 @@ make_auto_flush_static_metric! { skip_partition, } - pub label_enum SendStatus { - accept, - drop, + pub struct RaftEntryFetches : LocalIntCounter { + "type" => RaftEntryType } - pub label_enum RaftLogGcSkippedReason { - reserve_log, - compact_idx_too_small, - threshold_limit, + pub struct WarmUpEntryCacheCounter : LocalIntCounter { + "type" => WarmUpEntryCacheType } - pub struct RaftEventDuration : LocalHistogram { - "type" => RaftEventDurationType - } - pub struct RaftInvalidProposalCount : LocalIntCounter { - "type" => RaftInvalidProposal - } - pub struct RaftEntryFetches : LocalIntCounter { - "type" => RaftEntryType - } pub struct SnapCf : LocalHistogram { "type" => CfNames, } @@ -184,9 +113,6 @@ make_auto_flush_static_metric! { "type" => RegionHashType, "result" => RegionHashResult, } - pub struct ProposalVec: LocalIntCounter { - "type" => ProposalType, - } pub struct AdminCmdVec : LocalIntCounter { "type" => AdminCmdType, @@ -197,19 +123,6 @@ make_auto_flush_static_metric! { "type" => WriteCmdType, } - pub struct RaftReadyVec : LocalIntCounter { - "type" => RaftReadyType, - } - - pub struct MessageCounterVec : LocalIntCounter { - "type" => MessageCounterType, - "status" => SendStatus, - } - - pub struct RaftDropedVec : LocalIntCounter { - "type" => RaftDroppedMessage, - } - pub struct SnapValidVec : LocalIntCounter { "type" => SnapValidationType } @@ -221,19 +134,167 @@ make_auto_flush_static_metric! 
{ "cf" => CfNames, "type" => CompactionGuardAction, } - - pub struct RaftLogGcSkippedVec: LocalIntCounter { - "reason" => RaftLogGcSkippedReason, - } } make_static_metric! { + pub label_enum RaftReadyType { + message, + commit, + append, + snapshot, + pending_region, + has_ready_region, + } + + pub label_enum RaftSentMessageCounterType { + append, + append_resp, + prevote, + prevote_resp, + vote, + vote_resp, + snapshot, + heartbeat, + heartbeat_resp, + transfer_leader, + timeout_now, + read_index, + read_index_resp, + } + + pub label_enum SendStatus { + accept, + drop, + } + + pub label_enum RaftDroppedMessage { + mismatch_store_id, + mismatch_region_epoch, + mismatch_witness_snapshot, + stale_msg, + region_overlap, + region_no_peer, + region_tombstone_peer, + region_nonexistent, + applying_snap, + disk_full, + non_witness, + recovery, + } + + pub label_enum ProposalType { + all, + local_read, + read_index, + unsafe_read_index, + normal, + transfer_leader, + conf_change, + batch, + dropped_read_index, + } + + pub label_enum RaftInvalidProposal { + mismatch_store_id, + region_not_found, + not_leader, + mismatch_peer_id, + stale_command, + epoch_not_match, + read_index_no_leader, + region_not_initialized, + is_applying_snapshot, + force_leader, + witness, + flashback_in_progress, + flashback_not_prepared, + non_witness, + } + + pub label_enum RaftEventDurationType { + compact_check, + pd_store_heartbeat, + snap_gc, + compact_lock_cf, + consistency_check, + cleanup_import_sst, + raft_engine_purge, + peer_msg, + store_msg, + } + + pub label_enum RaftLogGcSkippedReason { + reserve_log, + compact_idx_too_small, + threshold_limit, + } + + pub label_enum LoadBaseSplitEventType { + // Workload fits the QPS threshold or byte threshold. + load_fit, + // Workload fits the CPU threshold. + cpu_load_fit, + // The statistical key is empty. + empty_statistical_key, + // Split info has been collected, ready to split. 
+ ready_to_split, + // Split info has not been collected yet, not ready to split. + not_ready_to_split, + // The number of sampled keys does not meet the threshold. + no_enough_sampled_key, + // The number of sampled keys located on left and right does not meet the threshold. + no_enough_lr_key, + // The number of balanced keys does not meet the score. + no_balance_key, + // The number of contained keys does not meet the score. + no_uncross_key, + // Split info for the top hot CPU region has been collected, ready to split. + ready_to_split_cpu_top, + // Hottest key range for the top hot CPU region could not be found. + empty_hottest_key_range, + // The top hot CPU region could not be split. + unable_to_split_cpu_top, + } + pub struct HibernatedPeerStateGauge: IntGauge { "state" => { awaken, hibernated, }, } + + pub struct RaftReadyCounterVec : LocalIntCounter { + "type" => RaftReadyType, + } + + pub struct RaftSentMessageCounterVec : LocalIntCounter { + "type" => RaftSentMessageCounterType, + "status" => SendStatus, + } + + pub struct RaftDroppedMessageCounterVec : LocalIntCounter { + "type" => RaftDroppedMessage, + } + + pub struct RaftProposalCounterVec: LocalIntCounter { + "type" => ProposalType, + } + + pub struct RaftInvalidProposalCounterVec : LocalIntCounter { + "type" => RaftInvalidProposal + } + + pub struct RaftEventDurationVec : LocalHistogram { + "type" => RaftEventDurationType + } + + pub struct RaftLogGcSkippedCounterVec: LocalIntCounter { + "reason" => RaftLogGcSkippedReason, + } + + pub struct LoadBaseSplitEventCounterVec: IntCounter { + "type" => LoadBaseSplitEventType, + } } lazy_static! { @@ -324,6 +385,12 @@ lazy_static! 
{ "Bucketed histogram of proposals' send to write queue duration.", exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); + pub static ref STORE_WF_SEND_PROPOSAL_DURATION_HISTOGRAM: Histogram = + register_histogram!( + "tikv_raftstore_store_wf_send_proposal_duration_seconds", + "Bucketed histogram of proposals' waterfall send duration", + exponential_buckets(1e-6, 2.0, 26).unwrap() + ).unwrap(); pub static ref STORE_WF_BEFORE_WRITE_DURATION_HISTOGRAM: Histogram = register_histogram!( "tikv_raftstore_store_wf_before_write_duration_seconds", @@ -367,8 +434,6 @@ lazy_static! { "Total number of proposal made.", &["type"] ).unwrap(); - pub static ref PEER_PROPOSAL_COUNTER: ProposalVec = - auto_flush_from!(PEER_PROPOSAL_COUNTER_VEC, ProposalVec); pub static ref PEER_ADMIN_CMD_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( @@ -392,21 +457,21 @@ lazy_static! { register_histogram!( "tikv_raftstore_commit_log_duration_seconds", "Bucketed histogram of peer commits logs duration.", - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); pub static ref STORE_APPLY_LOG_HISTOGRAM: Histogram = register_histogram!( "tikv_raftstore_apply_log_duration_seconds", "Bucketed histogram of peer applying log duration.", - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); pub static ref APPLY_TASK_WAIT_TIME_HISTOGRAM: Histogram = register_histogram!( "tikv_raftstore_apply_wait_time_duration_secs", "Bucketed histogram of apply task wait time duration.", - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); pub static ref STORE_RAFT_READY_COUNTER_VEC: IntCounterVec = @@ -415,8 +480,6 @@ lazy_static! 
{ "Total number of raft ready handled.", &["type"] ).unwrap(); - pub static ref STORE_RAFT_READY_COUNTER: RaftReadyVec = - auto_flush_from!(STORE_RAFT_READY_COUNTER_VEC, RaftReadyVec); pub static ref STORE_RAFT_SENT_MESSAGE_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( @@ -424,8 +487,6 @@ lazy_static! { "Total number of raft ready sent messages.", &["type", "status"] ).unwrap(); - pub static ref STORE_RAFT_SENT_MESSAGE_COUNTER: MessageCounterVec = - auto_flush_from!(STORE_RAFT_SENT_MESSAGE_COUNTER_VEC, MessageCounterVec); pub static ref STORE_RAFT_DROPPED_MESSAGE_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( @@ -433,8 +494,6 @@ lazy_static! { "Total number of raft dropped messages.", &["type"] ).unwrap(); - pub static ref STORE_RAFT_DROPPED_MESSAGE_COUNTER: RaftDropedVec = - auto_flush_from!(STORE_RAFT_DROPPED_MESSAGE_COUNTER_VEC, RaftDropedVec); pub static ref STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC: IntGaugeVec = register_int_gauge_vec!( @@ -457,7 +516,7 @@ lazy_static! { "tikv_raftstore_raft_process_duration_secs", "Bucketed histogram of peer processing raft duration.", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); pub static ref PEER_PROPOSE_LOG_SIZE_HISTOGRAM: Histogram = @@ -488,7 +547,7 @@ lazy_static! { register_histogram!( "tikv_raftstore_request_wait_time_duration_secs", "Bucketed histogram of request wait time duration.", - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); pub static ref PEER_GC_RAFT_LOG_COUNTER: IntCounter = @@ -568,6 +627,23 @@ lazy_static! { pub static ref RAFT_ENTRY_FETCHES: RaftEntryFetches = auto_flush_from!(RAFT_ENTRY_FETCHES_VEC, RaftEntryFetches); + // The max task duration can be a few minutes. 
+ pub static ref RAFT_ENTRY_FETCHES_TASK_DURATION_HISTOGRAM: Histogram = + register_histogram!( + "tikv_raftstore_entry_fetches_task_duration_seconds", + "Bucketed histogram of raft entry fetches task duration.", + exponential_buckets(0.0005, 2.0, 21).unwrap() // 500us ~ 8.7m + ).unwrap(); + + pub static ref WARM_UP_ENTRY_CACHE_COUNTER_VEC: IntCounterVec = + register_int_counter_vec!( + "tikv_raftstore_prefill_entry_cache_total", + "Total number of prefill entry cache.", + &["type"] + ).unwrap(); + pub static ref WARM_UP_ENTRY_CACHE_COUNTER: WarmUpEntryCacheCounter = + auto_flush_from!(WARM_UP_ENTRY_CACHE_COUNTER_VEC, WarmUpEntryCacheCounter); + pub static ref LEADER_MISSING: IntGauge = register_int_gauge!( "tikv_raftstore_leader_missing", @@ -587,8 +663,6 @@ lazy_static! { "Total number of raft invalid proposal.", &["type"] ).unwrap(); - pub static ref RAFT_INVALID_PROPOSAL_COUNTER: RaftInvalidProposalCount = - auto_flush_from!(RAFT_INVALID_PROPOSAL_COUNTER_VEC, RaftInvalidProposalCount); pub static ref RAFT_EVENT_DURATION_VEC: HistogramVec = register_histogram_vec!( @@ -597,8 +671,13 @@ lazy_static! { &["type"], exponential_buckets(0.001, 1.59, 20).unwrap() // max 10s ).unwrap(); - pub static ref RAFT_EVENT_DURATION: RaftEventDuration = - auto_flush_from!(RAFT_EVENT_DURATION_VEC, RaftEventDuration); + + pub static ref PEER_MSG_LEN: Histogram = + register_histogram!( + "tikv_raftstore_peer_msg_len", + "Length of peer msg.", + exponential_buckets(1.0, 2.0, 20).unwrap() // max 1000s + ).unwrap(); pub static ref RAFT_READ_INDEX_PENDING_DURATION: Histogram = register_histogram!( @@ -613,28 +692,6 @@ lazy_static! { "Pending read index count." 
).unwrap(); - pub static ref APPLY_PERF_CONTEXT_TIME_HISTOGRAM: HistogramVec = - register_histogram_vec!( - "tikv_raftstore_apply_perf_context_time_duration_secs", - "Bucketed histogram of request wait time duration.", - &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() - ).unwrap(); - - pub static ref STORE_PERF_CONTEXT_TIME_HISTOGRAM: HistogramVec = - register_histogram_vec!( - "tikv_raftstore_store_perf_context_time_duration_secs", - "Bucketed histogram of request wait time duration.", - &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() - ).unwrap(); - - pub static ref APPLY_PERF_CONTEXT_TIME_HISTOGRAM_STATIC: PerfContextTimeDuration= - auto_flush_from!(APPLY_PERF_CONTEXT_TIME_HISTOGRAM, PerfContextTimeDuration); - - pub static ref STORE_PERF_CONTEXT_TIME_HISTOGRAM_STATIC: PerfContextTimeDuration= - auto_flush_from!(STORE_PERF_CONTEXT_TIME_HISTOGRAM, PerfContextTimeDuration); - pub static ref READ_QPS_TOPN: GaugeVec = register_gauge_vec!( "tikv_read_qps_topn", @@ -642,8 +699,9 @@ lazy_static! { &["order"] ).unwrap(); - pub static ref LOAD_BASE_SPLIT_EVENT: IntCounterVec = - register_int_counter_vec!( + pub static ref LOAD_BASE_SPLIT_EVENT: LoadBaseSplitEventCounterVec = + register_static_int_counter_vec!( + LoadBaseSplitEventCounterVec, "tikv_load_base_split_event", "Load base split event.", &["type"] @@ -656,6 +714,11 @@ lazy_static! { linear_buckets(0.0, 0.05, 20).unwrap() ).unwrap(); + pub static ref LOAD_BASE_SPLIT_DURATION_HISTOGRAM : Histogram = register_histogram!( + "tikv_load_base_split_duration_seconds", + "Histogram of the time load base split costs in seconds" + ).unwrap(); + pub static ref QUERY_REGION_VEC: HistogramVec = register_histogram_vec!( "tikv_query_region", "Histogram of query", @@ -663,11 +726,10 @@ lazy_static! { exponential_buckets(8.0, 2.0, 24).unwrap() ).unwrap(); - pub static ref RAFT_ENTRIES_CACHES_GAUGE: IntGauge = register_int_gauge!( "tikv_raft_entries_caches", "Total memory size of raft entries caches." 
- ).unwrap(); + ).unwrap(); pub static ref RAFT_ENTRIES_EVICT_BYTES: IntCounter = register_int_counter!( "tikv_raft_entries_evict_bytes", @@ -712,18 +774,105 @@ lazy_static! { "tikv_raftstore_inspect_duration_seconds", "Bucketed histogram of inspect duration.", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); pub static ref STORE_SLOW_SCORE_GAUGE: Gauge = register_gauge!("tikv_raftstore_slow_score", "Slow score of the store.").unwrap(); + pub static ref STORE_SLOW_TREND_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend", "Slow trend changing rate").unwrap(); + + pub static ref STORE_SLOW_TREND_L0_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_l0", "Slow trend L0 window avg value.").unwrap(); + pub static ref STORE_SLOW_TREND_L1_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_l1", "Slow trend L1 window avg value.").unwrap(); + pub static ref STORE_SLOW_TREND_L2_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_l2", "Slow trend L2 window avg value.").unwrap(); + + pub static ref STORE_SLOW_TREND_L0_L1_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_l0_l1", "Slow trend changing rate: L0/L1.").unwrap(); + pub static ref STORE_SLOW_TREND_L1_L2_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_l1_l2", "Slow trend changing rate: L1/L2.").unwrap(); + + pub static ref STORE_SLOW_TREND_L1_MARGIN_ERROR_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_l1_margin_error", "Slow trend: L1 margin error range").unwrap(); + pub static ref STORE_SLOW_TREND_L2_MARGIN_ERROR_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_l2_margin_error", "Slow trend: L2 margin error range").unwrap(); + + pub static ref STORE_SLOW_TREND_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC: IntGaugeVec = + register_int_gauge_vec!( + "tikv_raftstore_slow_trend_margin_error_gap", + "Slow trend: the gap between margin window time and current sampling time", + 
&["window"] + ).unwrap(); + + pub static ref STORE_SLOW_TREND_MISC_GAUGE_VEC: IntGaugeVec = + register_int_gauge_vec!( + "tikv_raftstore_slow_trend_misc", + "Slow trend uncatelogued gauge(s)", + &["type"] + ).unwrap(); + + pub static ref STORE_SLOW_TREND_RESULT_VALUE_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result_value", "Store slow trend result meantime value").unwrap(); + pub static ref STORE_SLOW_TREND_RESULT_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result", "Store slow trend result changing rate").unwrap(); + + pub static ref STORE_SLOW_TREND_RESULT_L0_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result_l0", "Slow trend result L0 window avg value.").unwrap(); + pub static ref STORE_SLOW_TREND_RESULT_L1_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result_l1", "Slow trend result L1 window avg value.").unwrap(); + pub static ref STORE_SLOW_TREND_RESULT_L2_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result_l2", "Slow trend result L2 window avg value.").unwrap(); + + pub static ref STORE_SLOW_TREND_RESULT_L0_L1_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result_l0_l1", "Slow trend result changing rate: L0/L1.").unwrap(); + pub static ref STORE_SLOW_TREND_RESULT_L1_L2_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result_l1_l2", "Slow trend result changing rate: L1/L2.").unwrap(); + + pub static ref STORE_SLOW_TREND_RESULT_L1_MARGIN_ERROR_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result_l1_margin_error", "Slow trend result: L1 margin error range").unwrap(); + pub static ref STORE_SLOW_TREND_RESULT_L2_MARGIN_ERROR_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result_l2_margin_error", "Slow trend result: L2 margin error range").unwrap(); + + pub static ref STORE_SLOW_TREND_RESULT_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC: IntGaugeVec = + register_int_gauge_vec!( + "tikv_raftstore_slow_trend_result_margin_error_gap", + "Slow trend 
result: the gap between margin window time and current sampling time", + &["window"] + ).unwrap(); + + pub static ref STORE_SLOW_TREND_RESULT_MISC_GAUGE_VEC: IntGaugeVec = + register_int_gauge_vec!( + "tikv_raftstore_slow_trend_result_misc", + "Slow trend result uncatelogued gauge(s)", + &["type"] + ).unwrap(); + pub static ref RAFT_LOG_GC_SKIPPED_VEC: IntCounterVec = register_int_counter_vec!( "tikv_raftstore_raft_log_gc_skipped", "Total number of skipped raft log gc.", &["reason"] ) .unwrap(); - pub static ref RAFT_LOG_GC_SKIPPED: RaftLogGcSkippedVec = - auto_flush_from!(RAFT_LOG_GC_SKIPPED_VEC, RaftLogGcSkippedVec); + + pub static ref RAFT_APPLYING_SST_GAUGE: IntGaugeVec = register_int_gauge_vec!( + "tikv_raft_applying_sst", + "Sum of applying sst.", + &["type"] + ).unwrap(); + + pub static ref SNAPSHOT_LIMIT_GENERATE_BYTES: IntCounter = register_int_counter!( + "tikv_snapshot_limit_generate_bytes", + "Total snapshot generate limit used", + ) + .unwrap(); + + pub static ref MESSAGE_RECV_BY_STORE: IntCounterVec = register_int_counter_vec!( + "tikv_raftstore_message_recv_by_store", + "Messages received by store", + &["store"] + ) + .unwrap(); } diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index b1b8da54e2b..7a2c04e2450 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -2,10 +2,15 @@ pub mod cmd_resp; pub mod config; +pub mod entry_storage; pub mod fsm; +pub mod local_metrics; pub mod memory; pub mod metrics; pub mod msg; +mod peer; +mod read_queue; +pub mod region_meta; pub mod transport; #[macro_use] pub mod util; @@ -14,26 +19,33 @@ mod async_io; mod bootstrap; mod compaction_guard; mod hibernate_state; -mod local_metrics; -mod peer; mod peer_storage; -mod read_queue; mod region_snapshot; mod replication_mode; -mod snap; +pub mod simple_write; +pub mod snap; mod txn_ext; mod worker; #[cfg(any(test, feature = "testexport"))] pub use self::msg::PeerInternalStat; pub use 
self::{ + async_io::{ + read::{AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, ReadTask}, + write::{ + write_to_db_for_test, PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, + WriteTask, + }, + write_router::{WriteRouter, WriteRouterContext, WriteSenders}, + }, bootstrap::{ bootstrap_store, clear_prepare_bootstrap_cluster, clear_prepare_bootstrap_key, initial_region, prepare_bootstrap_cluster, }, compaction_guard::CompactionGuardGeneratorFactory, config::Config, - fsm::{DestroyPeerJob, RaftRouter, StoreInfo}, + entry_storage::{EntryStorage, RaftlogFetchResult, MAX_INIT_ENTRY_COUNT}, + fsm::{check_sst_for_ingestion, DestroyPeerJob, RaftRouter, StoreInfo}, hibernate_state::{GroupState, HibernateState}, memory::*, metrics::RAFT_ENTRY_FETCHES_VEC, @@ -42,28 +54,38 @@ pub use self::{ PeerTick, RaftCmdExtraOpts, RaftCommand, ReadCallback, ReadResponse, SignificantMsg, StoreMsg, StoreTick, WriteCallback, WriteResponse, }, - peer::{AbstractPeer, Peer, PeerStat, ProposalContext, RequestInspector, RequestPolicy}, + peer::{ + can_amend_read, get_sync_log_from_request, make_transfer_leader_response, + propose_read_index, should_renew_lease, Peer, PeerStat, ProposalContext, ProposalQueue, + RequestInspector, RequestPolicy, SnapshotRecoveryWaitApplySyncer, + TRANSFER_LEADER_COMMAND_REPLY_CTX, + }, peer_storage::{ clear_meta, do_snapshot, write_initial_apply_state, write_initial_raft_state, - write_peer_state, PeerStorage, RaftlogFetchResult, SnapState, INIT_EPOCH_CONF_VER, - INIT_EPOCH_VER, MAX_INIT_ENTRY_COUNT, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, + write_peer_state, PeerStorage, SnapState, INIT_EPOCH_CONF_VER, INIT_EPOCH_VER, + RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }, - read_queue::ReadIndexContext, + read_queue::{ReadIndexContext, ReadIndexQueue, ReadIndexRequest}, region_snapshot::{RegionIterator, RegionSnapshot}, replication_mode::{GlobalReplicationState, StoreGroup}, snap::{ check_abort, copy_snapshot, snap_io::{apply_sst_cf_file, 
build_sst_cf_file_list}, ApplyOptions, CfFile, Error as SnapError, SnapEntry, SnapKey, SnapManager, - SnapManagerBuilder, Snapshot, SnapshotStatistics, + SnapManagerBuilder, Snapshot, SnapshotStatistics, TabletSnapKey, TabletSnapManager, }, transport::{CasualRouter, ProposalRouter, SignificantRouter, StoreRouter, Transport}, txn_ext::{LocksStatus, PeerPessimisticLocks, PessimisticLockPair, TxnExt}, util::{RegionReadProgress, RegionReadProgressRegistry}, worker::{ - AutoSplitController, Bucket, BucketRange, CheckLeaderRunner, CheckLeaderTask, - FlowStatistics, FlowStatsReporter, KeyEntry, LocalReader, PdTask, QueryStats, ReadDelegate, - ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, SplitConfig, - SplitConfigManager, TrackVer, WriteStats, + metrics as worker_metrics, AutoSplitController, Bucket, BucketRange, CachedReadDelegate, + CheckLeaderRunner, CheckLeaderTask, FlowStatistics, FlowStatsReporter, KeyEntry, + LocalReadContext, LocalReader, LocalReaderCore, PdStatsMonitor, PdTask, ReadDelegate, + ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, + SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, SplitInfo, + StoreMetaDelegate, StoreStatsReporter, TrackVer, WriteStats, + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, DEFAULT_BIG_REGION_BYTE_THRESHOLD, + DEFAULT_BIG_REGION_QPS_THRESHOLD, DEFAULT_BYTE_THRESHOLD, DEFAULT_QPS_THRESHOLD, + NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, REGION_CPU_OVERLOAD_THRESHOLD_RATIO, }, }; diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 46903771344..c36e9880694 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -7,7 +7,9 @@ use std::{borrow::Cow, fmt}; use collections::HashSet; use engine_traits::{CompactedEvent, KvEngine, Snapshot}; +use futures::channel::mpsc::UnboundedSender; use kvproto::{ + brpb::CheckAdminResponse, import_sstpb::SstMeta, 
kvrpcpb::{DiskFullOpt, ExtraOp as TxnExtraOp}, metapb, @@ -19,21 +21,24 @@ use kvproto::{ }; #[cfg(any(test, feature = "testexport"))] use pd_client::BucketMeta; -use raft::{GetEntriesContext, SnapshotStatus}; +use raft::SnapshotStatus; +use resource_control::ResourceMetered; use smallvec::{smallvec, SmallVec}; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; +use tracker::{get_tls_tracker_token, TrackerToken}; -use super::{AbstractPeer, RegionSnapshot}; +use super::{local_metrics::TimeTracker, region_meta::RegionMeta, FetchedLogs, RegionSnapshot}; use crate::store::{ fsm::apply::{CatchUpLogs, ChangeObserver, TaskRes as ApplyTaskRes}, metrics::RaftEventDurationType, peer::{ - UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, - UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryWaitApplySyncer, + SnapshotRecoveryWaitApplySyncer, UnsafeRecoveryExecutePlanSyncer, + UnsafeRecoveryFillOutReportSyncer, UnsafeRecoveryForceLeaderSyncer, + UnsafeRecoveryWaitApplySyncer, }, util::{KeysInfoFormatter, LatencyInspector}, worker::{Bucket, BucketRange}, - RaftlogFetchResult, SnapKey, + SnapKey, }; #[derive(Debug)] @@ -72,33 +77,41 @@ where } } -pub type ReadCallback = Box) + Send>; -pub type WriteCallback = Box; +pub type BoxReadCallback = Box) + Send>; +pub type BoxWriteCallback = Box; pub type ExtCallback = Box; + #[cfg(any(test, feature = "testexport"))] pub type TestCallback = Box; /// Variants of callbacks for `Msg`. /// - `Read`: a callback for read only requests including `StatusRequest`, -/// `GetRequest` and `SnapRequest` +/// `GetRequest` and `SnapRequest` /// - `Write`: a callback for write only requests including `AdminRequest` -/// `PutRequest`, `DeleteRequest` and `DeleteRangeRequest`. +/// `PutRequest`, `DeleteRequest` and `DeleteRangeRequest`. pub enum Callback { /// No callback. None, /// Read callback. - Read(ReadCallback), + Read { + cb: BoxReadCallback, + + tracker: TrackerToken, + }, /// Write callback. 
Write { - cb: WriteCallback, - /// `proposed_cb` is called after a request is proposed to the raft group successfully. - /// It's used to notify the caller to move on early because it's very likely the request - /// will be applied to the raftstore. + cb: BoxWriteCallback, + /// `proposed_cb` is called after a request is proposed to the raft + /// group successfully. It's used to notify the caller to move on early + /// because it's very likely the request will be applied to the + /// raftstore. proposed_cb: Option, - /// `committed_cb` is called after a request is committed and before it's being applied, and - /// it's guaranteed that the request will be successfully applied soon. + /// `committed_cb` is called after a request is committed and before + /// it's being applied, and it's guaranteed that the request will be + /// successfully applied soon. committed_cb: Option, - request_times: SmallVec<[Instant; 4]>, + + trackers: SmallVec<[TimeTracker; 4]>, }, #[cfg(any(test, feature = "testexport"))] /// Test purpose callback @@ -111,40 +124,40 @@ impl Callback where S: Snapshot, { - pub fn write(cb: WriteCallback) -> Self { + pub fn read(cb: BoxReadCallback) -> Self { + let tracker = get_tls_tracker_token(); + Callback::Read { cb, tracker } + } + + pub fn write(cb: BoxWriteCallback) -> Self { Self::write_ext(cb, None, None) } pub fn write_ext( - cb: WriteCallback, + cb: BoxWriteCallback, proposed_cb: Option, committed_cb: Option, ) -> Self { + let tracker = TimeTracker::default(); + Callback::Write { cb, proposed_cb, committed_cb, - request_times: smallvec![Instant::now()], - } - } - - pub fn get_request_times(&self) -> Option<&SmallVec<[Instant; 4]>> { - match self { - Callback::Write { request_times, .. } => Some(request_times), - _ => None, + trackers: smallvec![tracker], } } pub fn invoke_with_response(self, resp: RaftCmdResponse) { match self { Callback::None => (), - Callback::Read(read) => { + Callback::Read { cb, .. 
} => { let resp = ReadResponse { response: resp, snapshot: None, txn_extra_op: TxnExtraOp::Noop, }; - read(resp); + cb(resp); } Callback::Write { cb, .. } => { let resp = WriteResponse { response: resp }; @@ -155,38 +168,189 @@ where } } - pub fn has_proposed_cb(&mut self) -> bool { - if let Callback::Write { proposed_cb, .. } = self { - proposed_cb.is_some() - } else { - false - } + pub fn has_proposed_cb(&self) -> bool { + let Callback::Write { proposed_cb, .. } = self else { return false; }; + proposed_cb.is_some() } pub fn invoke_proposed(&mut self) { - if let Callback::Write { proposed_cb, .. } = self { - if let Some(cb) = proposed_cb.take() { - cb() - } + let Callback::Write { proposed_cb, .. } = self else { return; }; + if let Some(cb) = proposed_cb.take() { + cb(); } } pub fn invoke_committed(&mut self) { - if let Callback::Write { committed_cb, .. } = self { - if let Some(cb) = committed_cb.take() { - cb() - } + let Callback::Write { committed_cb, .. } = self else { return; }; + if let Some(cb) = committed_cb.take() { + cb(); } } pub fn invoke_read(self, args: ReadResponse) { match self { - Callback::Read(read) => read(args), - other => panic!("expect Callback::Read(..), got {:?}", other), + Callback::Read { cb, .. } => cb(args), + other => panic!("expect Callback::read(..), got {:?}", other), + } + } + + pub fn take_proposed_cb(&mut self) -> Option { + let Callback::Write { proposed_cb, .. } = self else { return None; }; + proposed_cb.take() + } + + pub fn take_committed_cb(&mut self) -> Option { + let Callback::Write { committed_cb, .. 
} = self else { return None; }; + committed_cb.take() + } +} + +pub trait ReadCallback: ErrorCallback { + type Response; + + fn set_result(self, result: Self::Response); + fn read_tracker(&self) -> Option; +} + +pub trait WriteCallback: ErrorCallback { + type Response; + + fn notify_proposed(&mut self); + fn notify_committed(&mut self); + + type TimeTrackerListRef<'a>: IntoIterator + where + Self: 'a; + fn write_trackers(&self) -> Self::TimeTrackerListRef<'_>; + + type TimeTrackerListMut<'a>: IntoIterator + where + Self: 'a; + fn write_trackers_mut(&mut self) -> Self::TimeTrackerListMut<'_>; + fn set_result(self, result: Self::Response); +} + +pub trait ErrorCallback: Send { + fn report_error(self, err: RaftCmdResponse); + fn is_none(&self) -> bool; +} + +impl ErrorCallback for Vec { + #[inline] + fn report_error(self, err: RaftCmdResponse) { + for cb in self { + cb.report_error(err.clone()); } } - pub fn is_none(&self) -> bool { + #[inline] + fn is_none(&self) -> bool { + self.iter().all(|c| c.is_none()) + } +} + +impl ReadCallback for Callback { + type Response = ReadResponse; + + #[inline] + fn set_result(self, result: Self::Response) { + self.invoke_read(result); + } + + fn read_tracker(&self) -> Option { + let Callback::Read { tracker, .. } = self else { return None; }; + Some(*tracker) + } +} + +impl WriteCallback for Callback { + type Response = RaftCmdResponse; + + #[inline] + fn notify_proposed(&mut self) { + self.invoke_proposed(); + } + + #[inline] + fn notify_committed(&mut self) { + self.invoke_committed(); + } + + type TimeTrackerListRef<'a> = impl IntoIterator; + #[inline] + fn write_trackers(&self) -> Self::TimeTrackerListRef<'_> { + let trackers = match self { + Callback::Write { trackers, .. 
} => Some(trackers), + _ => None, + }; + trackers.into_iter().flatten() + } + + type TimeTrackerListMut<'a> = impl IntoIterator; + #[inline] + fn write_trackers_mut(&mut self) -> Self::TimeTrackerListMut<'_> { + let trackers = match self { + Callback::Write { trackers, .. } => Some(trackers), + _ => None, + }; + trackers.into_iter().flatten() + } + + #[inline] + fn set_result(self, result: Self::Response) { + self.invoke_with_response(result); + } +} + +impl WriteCallback for Vec +where + C: WriteCallback + 'static, + C::Response: Clone, +{ + type Response = C::Response; + + #[inline] + fn notify_proposed(&mut self) { + for c in self { + c.notify_proposed(); + } + } + + #[inline] + fn notify_committed(&mut self) { + for c in self { + c.notify_committed(); + } + } + + type TimeTrackerListRef<'a> = impl Iterator + 'a; + #[inline] + fn write_trackers(&self) -> Self::TimeTrackerListRef<'_> { + self.iter().flat_map(|c| c.write_trackers()) + } + + type TimeTrackerListMut<'a> = impl Iterator + 'a; + #[inline] + fn write_trackers_mut(&mut self) -> Self::TimeTrackerListMut<'_> { + self.iter_mut().flat_map(|c| c.write_trackers_mut()) + } + + #[inline] + fn set_result(self, result: Self::Response) { + for c in self { + c.set_result(result.clone()); + } + } +} + +impl ErrorCallback for Callback { + #[inline] + fn report_error(self, err: RaftCmdResponse) { + self.invoke_with_response(err); + } + + #[inline] + fn is_none(&self) -> bool { matches!(self, Callback::None) } } @@ -198,7 +362,7 @@ where fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Callback::None => write!(fmt, "Callback::None"), - Callback::Read(_) => write!(fmt, "Callback::Read(..)"), + Callback::Read { .. } => write!(fmt, "Callback::Read(..)"), Callback::Write { .. } => write!(fmt, "Callback::Write(..)"), #[cfg(any(test, feature = "testexport"))] Callback::Test { .. 
} => write!(fmt, "Callback::Test(..)"), @@ -206,7 +370,7 @@ where } } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Hash)] #[repr(u8)] pub enum PeerTick { Raft = 0, @@ -219,6 +383,10 @@ pub enum PeerTick { CheckLeaderLease = 7, ReactivateMemoryLock = 8, ReportBuckets = 9, + CheckLongUncommitted = 10, + CheckPeersAvailability = 11, + RequestSnapshot = 12, + RequestVoterReplicatedIndex = 13, } impl PeerTick { @@ -237,6 +405,10 @@ impl PeerTick { PeerTick::CheckLeaderLease => "check_leader_lease", PeerTick::ReactivateMemoryLock => "reactivate_memory_lock", PeerTick::ReportBuckets => "report_buckets", + PeerTick::CheckLongUncommitted => "check_long_uncommitted", + PeerTick::CheckPeersAvailability => "check_peers_availability", + PeerTick::RequestSnapshot => "request_snapshot", + PeerTick::RequestVoterReplicatedIndex => "request_voter_replicated_index", } } @@ -252,6 +424,10 @@ impl PeerTick { PeerTick::CheckLeaderLease, PeerTick::ReactivateMemoryLock, PeerTick::ReportBuckets, + PeerTick::CheckLongUncommitted, + PeerTick::CheckPeersAvailability, + PeerTick::RequestSnapshot, + PeerTick::RequestVoterReplicatedIndex, ]; TICKS } @@ -286,18 +462,20 @@ pub enum MergeResultKind { /// Its target peer applys `CommitMerge` log. FromTargetLog, /// Its target peer receives snapshot. - /// In step 1, this peer should mark `pending_move` is true and destroy its apply fsm. - /// Then its target peer will remove this peer data and apply snapshot atomically. + /// In step 1, this peer should mark `pending_move` is true and destroy its + /// apply fsm. Then its target peer will remove this peer data and apply + /// snapshot atomically. FromTargetSnapshotStep1, /// In step 2, this peer should destroy its peer fsm. FromTargetSnapshotStep2, - /// This peer is no longer needed by its target peer so it can be destroyed by itself. - /// It happens if and only if its target peer has been removed by conf change. 
+ /// This peer is no longer needed by its target peer so it can be destroyed + /// by itself. It happens if and only if its target peer has been removed by + /// conf change. Stale, } -/// Some significant messages sent to raftstore. Raftstore will dispatch these messages to Raft -/// groups to update some important internal status. +/// Some significant messages sent to raftstore. Raftstore will dispatch these +/// messages to Raft groups to update some important internal status. #[derive(Debug)] pub enum SignificantMsg where @@ -329,7 +507,7 @@ where store_id: u64, group_id: u64, }, - /// Capture the changes of the region. + /// Capture changes of a region. CaptureChange { cmd: ChangeObserver, region_epoch: RegionEpoch, @@ -338,10 +516,7 @@ where LeaderCallback(Callback), RaftLogGcFlushed, // Reports the result of asynchronous Raft logs fetching. - RaftlogFetched { - context: GetEntriesContext, - res: Box, - }, + RaftlogFetched(FetchedLogs), EnterForceLeaderState { syncer: UnsafeRecoveryForceLeaderSyncer, failed_stores: HashSet, @@ -354,6 +529,8 @@ where UnsafeRecoveryDestroy(UnsafeRecoveryExecutePlanSyncer), UnsafeRecoveryWaitApply(UnsafeRecoveryWaitApplySyncer), UnsafeRecoveryFillOutReport(UnsafeRecoveryFillOutReportSyncer), + SnapshotRecoveryWaitApply(SnapshotRecoveryWaitApplySyncer), + CheckPendingAdmin(UnboundedSender), } /// Message that will be sent to a peer. @@ -377,7 +554,8 @@ pub enum CasualMessage { hash: Vec, }, - /// Approximate size of target region. This message can only be sent by split-check thread. + /// Approximate size of target region. This message can only be sent by + /// split-check thread. RegionApproximateSize { size: u64, }, @@ -389,9 +567,13 @@ pub enum CasualMessage { CompactionDeclinedBytes { bytes: u64, }, - /// Half split the target region. + /// Half split the target region with the given key range. + /// If the key range is not provided, the region's start key + /// and end key will be used by default. 
HalfSplitRegion { region_epoch: RegionEpoch, + start_key: Option>, + end_key: Option>, policy: CheckPolicy, source: &'static str, cb: Callback, @@ -412,7 +594,7 @@ pub enum CasualMessage { ForceCompactRaftLogs, /// A message to access peer's internal state. - AccessPeer(Box), + AccessPeer(Box), /// Region info from PD QueryRegionLeaderResp { @@ -553,6 +735,7 @@ pub struct InspectedRaftMessage { } /// Message that can be sent to a peer. +#[allow(clippy::large_enum_variant)] pub enum PeerMsg { /// Raft message is the message sent between raft nodes in the same /// raft group. Messages need to be redirected to raftstore if target @@ -562,15 +745,16 @@ pub enum PeerMsg { /// leader of the target raft group. If it's failed to be sent, callback /// usually needs to be called before dropping in case of resource leak. RaftCommand(RaftCommand), - /// Tick is periodical task. If target peer doesn't exist there is a potential - /// that the raft node will not work anymore. + /// Tick is periodical task. If target peer doesn't exist there is a + /// potential that the raft node will not work anymore. Tick(PeerTick), /// Result of applying committed entries. The message can't be lost. ApplyRes { res: ApplyTaskRes, }, - /// Message that can't be lost but rarely created. If they are lost, real bad - /// things happen like some peers will be considered dead in the group. + /// Message that can't be lost but rarely created. If they are lost, real + /// bad things happen like some peers will be considered dead in the + /// group. SignificantMsg(SignificantMsg), /// Start the FSM. Start, @@ -589,6 +773,8 @@ pub enum PeerMsg { Destroy(u64), } +impl ResourceMetered for PeerMsg {} + impl fmt::Debug for PeerMsg { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match self { @@ -619,6 +805,18 @@ impl fmt::Debug for PeerMsg { } } +impl PeerMsg { + /// For some specific kind of messages, it's actually acceptable if failed + /// to send it by `significant_send`. 
This function determine if the + /// current message is acceptable to fail. + pub fn is_send_failure_ignorable(&self) -> bool { + matches!( + self, + PeerMsg::SignificantMsg(SignificantMsg::CaptureChange { .. }) + ) + } +} + pub enum StoreMsg where EK: KvEngine, @@ -629,8 +827,8 @@ where invalid_ssts: Vec, }, - // Clear region size and keys for all regions in the range, so we can force them to re-calculate - // their size later. + // Clear region size and keys for all regions in the range, so we can force them to + // re-calculate their size later. ClearRegionSizeInRange { start_key: Vec, end_key: Vec, @@ -666,8 +864,14 @@ where }, GcSnapshotFinish, + + AwakenRegions { + abnormal_stores: Vec, + }, } +impl ResourceMetered for StoreMsg {} + impl fmt::Debug for StoreMsg where EK: KvEngine, @@ -699,6 +903,7 @@ where write!(fmt, "UnsafeRecoveryCreatePeer") } StoreMsg::GcSnapshotFinish => write!(fmt, "GcSnapshotFinish"), + StoreMsg::AwakenRegions { .. } => write!(fmt, "AwakenRegions"), } } } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 9c480182943..8dc69a0def4 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -8,6 +8,7 @@ use std::{ fmt, mem, sync::{ atomic::{AtomicUsize, Ordering}, + mpsc::SyncSender, Arc, Mutex, }, time::{Duration, Instant}, @@ -23,16 +24,15 @@ use engine_traits::{ }; use error_code::ErrorCodeExt; use fail::fail_point; -use getset::Getters; +use getset::{Getters, MutGetters}; use kvproto::{ errorpb, kvrpcpb::{DiskFullOpt, ExtraOp as TxnExtraOp, LockInfo}, metapb::{self, PeerRole}, pdpb::{self, PeerStats}, raft_cmdpb::{ - self, AdminCmdType, AdminResponse, ChangePeerRequest, CmdType, CommitMergeRequest, - PutRequest, RaftCmdRequest, RaftCmdResponse, Request, TransferLeaderRequest, - TransferLeaderResponse, + self, AdminCmdType, AdminResponse, CmdType, CommitMergeRequest, PutRequest, RaftCmdRequest, + RaftCmdResponse, Request, TransferLeaderRequest, 
TransferLeaderResponse, }, raft_serverpb::{ ExtraMessage, ExtraMessageType, MergeState, PeerState, RaftApplyState, RaftMessage, @@ -46,11 +46,10 @@ use pd_client::{BucketStat, INVALID_ID}; use protobuf::Message; use raft::{ self, - eraftpb::{self, ConfChangeType, Entry, EntryType, MessageType}, - Changer, GetEntriesContext, LightReady, ProgressState, ProgressTracker, RawNode, Ready, - SnapshotStatus, StateRole, INVALID_INDEX, NO_LIMIT, + eraftpb::{self, Entry, EntryType, MessageType}, + GetEntriesContext, LightReady, ProgressState, RawNode, Ready, SnapshotStatus, StateRole, + INVALID_INDEX, NO_LIMIT, }; -use raft_proto::ConfChangeI; use rand::seq::SliceRandom; use smallvec::SmallVec; use tikv_alloc::trace::TraceEvent; @@ -59,33 +58,35 @@ use tikv_util::{ codec::number::decode_u64, debug, error, info, sys::disk::DiskUsage, - time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant, InstantExt, ThreadReadId}, + time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant, InstantExt}, warn, worker::Scheduler, Either, }; -use time::Timespec; -use txn_types::WriteBatchFlags; +use time::{Duration as TimeDuration, Timespec}; +use tracker::GLOBAL_TRACKERS; +use txn_types::{TimeStamp, WriteBatchFlags}; use uuid::Uuid; use super::{ cmd_resp, - local_metrics::{RaftMetrics, RaftReadyMetrics}, + local_metrics::RaftMetrics, metrics::*, peer_storage::{write_peer_state, CheckApplyingSnapStatus, HandleReadyResult, PeerStorage}, read_queue::{ReadIndexQueue, ReadIndexRequest}, transport::Transport, util::{ - self, check_region_epoch, is_initial_msg, AdminCmdEpochState, ChangePeerI, ConfChangeKind, - Lease, LeaseState, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER, + self, check_req_region_epoch, is_initial_msg, AdminCmdEpochState, ChangePeerI, + ConfChangeKind, Lease, LeaseState, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER, }, - DestroyPeerJob, + DestroyPeerJob, LocalReadContext, }; use crate::{ coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason, 
RoleChange}, errors::RAFTSTORE_IS_BUSY, + router::RaftStoreRouter, store::{ - async_io::{write::WriteMsg, write_router::WriteRouter}, + async_io::{read::ReadTask, write::WriteMsg, write_router::WriteRouter}, fsm::{ apply::{self, CatchUpLogs}, store::{PollContext, RaftRouter}, @@ -93,26 +94,29 @@ use crate::{ }, hibernate_state::GroupState, memory::{needs_evict_entry_cache, MEMTRACE_RAFT_ENTRIES}, - msg::{PeerMsg, RaftCommand, SignificantMsg, StoreMsg}, + msg::{CasualMessage, ErrorCallback, PeerMsg, RaftCommand, SignificantMsg, StoreMsg}, + peer_storage::HandleSnapshotResult, txn_ext::LocksStatus, util::{admin_cmd_epoch_lookup, RegionReadProgress}, worker::{ - HeartbeatTask, RaftlogFetchTask, RaftlogGcTask, ReadDelegate, ReadExecutor, - ReadProgress, RegionTask, SplitCheckTask, + HeartbeatTask, RaftlogGcTask, ReadDelegate, ReadExecutor, ReadProgress, RegionTask, + SplitCheckTask, }, - Callback, Config, GlobalReplicationState, PdTask, ReadIndexContext, ReadResponse, TxnExt, - RAFT_INIT_LOG_INDEX, + Callback, Config, GlobalReplicationState, PdTask, ReadCallback, ReadIndexContext, + ReadResponse, TxnExt, WriteCallback, RAFT_INIT_LOG_INDEX, }, Error, Result, }; const SHRINK_CACHE_CAPACITY: usize = 64; -const MIN_BCAST_WAKE_UP_INTERVAL: u64 = 1_000; // 1s +// 1s +const MIN_BCAST_WAKE_UP_INTERVAL: u64 = 1_000; const REGION_READ_PROGRESS_CAP: usize = 128; -const MAX_COMMITTED_SIZE_PER_READY: u64 = 16 * 1024 * 1024; +#[doc(hidden)] +pub const MAX_COMMITTED_SIZE_PER_READY: u64 = 16 * 1024 * 1024; /// The returned states of the peer after checking whether it is stale -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq)] pub enum StaleState { Valid, ToValidate, @@ -120,37 +124,36 @@ pub enum StaleState { } #[derive(Debug)] -struct ProposalQueue -where - S: Snapshot, -{ - tag: String, - queue: VecDeque>, +pub struct ProposalQueue { + region_id: u64, + peer_id: u64, + queue: VecDeque>, } -impl ProposalQueue { - fn new(tag: String) -> ProposalQueue { +impl ProposalQueue { 
+ pub fn new(region_id: u64, peer_id: u64) -> ProposalQueue { ProposalQueue { - tag, + region_id, + peer_id, queue: VecDeque::new(), } } - /// Find the request times of given index. - /// Caller should check if term is matched before using request times. - fn find_request_times(&self, index: u64) -> Option<(u64, &SmallVec<[TiInstant; 4]>)> { + /// Find the trackers of given index. + /// Caller should check if term is matched before using trackers. + pub fn find_trackers(&self, index: u64) -> Option<(u64, C::TimeTrackerListRef<'_>)> { self.queue .binary_search_by_key(&index, |p: &Proposal<_>| p.index) .ok() - .and_then(|i| { - self.queue[i] - .cb - .get_request_times() - .map(|ts| (self.queue[i].term, ts)) - }) + .map(|i| (self.queue[i].term, self.queue[i].cb.write_trackers())) + } + + #[inline] + pub fn queue_mut(&mut self) -> &mut VecDeque> { + &mut self.queue } - fn find_propose_time(&self, term: u64, index: u64) -> Option { + pub fn find_propose_time(&self, term: u64, index: u64) -> Option { self.queue .binary_search_by_key(&(term, index), |p: &Proposal<_>| (p.term, p.index)) .ok() @@ -158,7 +161,7 @@ impl ProposalQueue { } // Find proposal in front or at the given term and index - fn pop(&mut self, term: u64, index: u64) -> Option> { + pub fn pop(&mut self, term: u64, index: u64) -> Option> { self.queue.pop_front().and_then(|p| { // Comparing the term first then the index, because the term is // increasing among all log entries and the index is increasing @@ -173,15 +176,20 @@ impl ProposalQueue { /// Find proposal at the given term and index and notify stale proposals /// in front that term and index - fn find_proposal(&mut self, term: u64, index: u64, current_term: u64) -> Option> { + pub fn find_proposal( + &mut self, + term: u64, + index: u64, + current_term: u64, + ) -> Option> { while let Some(p) = self.pop(term, index) { if p.term == term { if p.index == index { return if p.cb.is_none() { None } else { Some(p) }; } else { panic!( - "{} unexpected 
callback at term {}, found index {}, expected {}", - self.tag, term, p.index, index + "[region {}] {} unexpected callback at term {}, found index {}, expected {}", + self.region_id, self.peer_id, term, p.index, index ); } } else { @@ -191,7 +199,12 @@ impl ProposalQueue { None } - fn push(&mut self, p: Proposal) { + #[inline] + pub fn oldest(&self) -> Option<&Proposal> { + self.queue.front() + } + + pub fn push(&mut self, p: Proposal) { if let Some(f) = self.queue.back() { // The term must be increasing among all log entries and the index // must be increasing inside a given term @@ -200,18 +213,18 @@ impl ProposalQueue { self.queue.push_back(p); } - fn is_empty(&self) -> bool { + pub fn is_empty(&self) -> bool { self.queue.is_empty() } - fn gc(&mut self) { + pub fn gc(&mut self) { if self.queue.capacity() > SHRINK_CACHE_CAPACITY && self.queue.len() < SHRINK_CACHE_CAPACITY { self.queue.shrink_to_fit(); } } - fn back(&self) -> Option<&Proposal> { + fn back(&self) -> Option<&Proposal> { self.queue.back() } } @@ -295,8 +308,9 @@ impl ProposedAdminCmd { } struct CmdEpochChecker { - // Although it's a deque, because of the characteristics of the settings from `admin_cmd_epoch_lookup`, - // the max size of admin cmd is 2, i.e. split/merge and change peer. + // Although it's a deque, because of the characteristics of the settings from + // `admin_cmd_epoch_lookup`, the max size of admin cmd is 2, i.e. split/merge and change + // peer. proposed_admin_cmd: VecDeque>, term: u64, } @@ -323,10 +337,11 @@ impl CmdEpochChecker { } } - /// Check if the proposal can be proposed on the basis of its epoch and previous proposed admin cmds. + /// Check if the proposal can be proposed on the basis of its epoch and + /// previous proposed admin cmds. /// - /// Returns None if passing the epoch check, otherwise returns a index which is the last - /// admin cmd index conflicted with this proposal. 
+ /// Returns None if passing the epoch check, otherwise returns a index which + /// is the last admin cmd index conflicted with this proposal. fn propose_check_epoch(&mut self, req: &RaftCmdRequest, term: u64) -> Option { self.maybe_update_term(term); let (check_ver, check_conf_ver) = if !req.has_admin_request() { @@ -394,7 +409,7 @@ impl CmdEpochChecker { vec![region.to_owned()], )); cmd_resp::bind_term(&mut resp, term); - cb.invoke_with_response(resp); + cb.report_error(resp); } } else { break; @@ -453,6 +468,7 @@ pub struct PersistSnapshotResult { pub prev_region: metapb::Region, pub region: metapb::Region, pub destroy_regions: Vec, + pub for_witness: bool, } #[derive(Debug)] @@ -472,12 +488,13 @@ pub struct ReadyResult { #[derive(Debug)] /// ForceLeader process would be: -/// 1. If it's hibernated, enter wait ticks state, and wake up the peer -/// 2. Enter pre force leader state, become candidate and send request vote to all peers -/// 3. Wait for the responses of the request vote, no reject should be received. -/// 4. Enter force leader state, become leader without leader lease -/// 5. Execute recovery plan(some remove-peer commands) -/// 6. After the plan steps are all applied, exit force leader state +/// - If it's hibernated, enter wait ticks state, and wake up the peer +/// - Enter pre force leader state, become candidate and send request vote to +/// all peers +/// - Wait for the responses of the request vote, no reject should be received. +/// - Enter force leader state, become leader without leader lease +/// - Execute recovery plan(some remove-peer commands) +/// - After the plan steps are all applied, exit force leader state pub enum ForceLeaderState { WaitTicks { syncer: UnsafeRecoveryForceLeaderSyncer, @@ -494,32 +511,34 @@ pub enum ForceLeaderState { }, } -// Following shared states are used while reporting to PD for unsafe recovery and shared among -// all the regions per their life cycle. 
+// Following shared states are used while reporting to PD for unsafe recovery +// and shared among all the regions per their life cycle. // The work flow is like: -// 1. report phase -// start_unsafe_recovery_report -// -> broadcast wait-apply commands -// -> wait for all the peers' apply indices meet their targets -// -> broadcast fill out report commands -// -> wait for all the peers fill out the reports for themselves -// -> send a store report (through store heartbeat) -// 2. force leader phase -// dispatch force leader commands -// -> wait for all the peers that received the command become force leader -// -> start_unsafe_recovery_report -// 3. plan execution phase -// dispatch recovery plans -// -> wait for all the creates, deletes and demotes to finish, for the demotes, -// procedures are: -// -> exit joint state if it is already in joint state -// -> demote failed voters, and promote self to be a voter if it is a learner -// -> exit joint state -// -> start_unsafe_recovery_report - -// Intends to use RAII to sync unsafe recovery procedures between peers, in addition to that, -// it uses a closure to avoid having a raft router as a member variable, which is statically -// dispatched, thus needs to propagate the generics everywhere. +// 1. report phase +// - start_unsafe_recovery_report +// - broadcast wait-apply commands +// - wait for all the peers' apply indices meet their targets +// - broadcast fill out report commands +// - wait for all the peers fill out the reports for themselves +// - send a store report (through store heartbeat) +// 2. force leader phase +// - dispatch force leader commands +// - wait for all the peers that received the command become force leader +// - start_unsafe_recovery_report +// 3. 
plan execution phase +// - dispatch recovery plans +// - wait for all the creates, deletes and demotes to finish, for the +// demotes, procedures are: +// - exit joint state if it is already in joint state +// - demote failed voters, and promote self to be a voter if it is a +// learner +// - exit joint state +// - start_unsafe_recovery_report +// +// Intends to use RAII to sync unsafe recovery procedures between peers, in +// addition to that, it uses a closure to avoid having a raft router as a member +// variable, which is statically dispatched, thus needs to propagate the +// generics everywhere. pub struct InvokeClosureOnDrop(Box); impl fmt::Debug for InvokeClosureOnDrop { @@ -546,6 +565,85 @@ pub fn start_unsafe_recovery_report( }); } +// Propose a read index request to the raft group, return the request id and +// whether this request had dropped silently +// #[RaftstoreCommon], copied from Peer::propose_read_index +pub fn propose_read_index( + raft_group: &mut RawNode, + request: Option<&raft_cmdpb::ReadIndexRequest>, + locked: Option<&LockInfo>, +) -> (Uuid, bool) { + let last_pending_read_count = raft_group.raft.pending_read_count(); + let last_ready_read_count = raft_group.raft.ready_read_count(); + + let id = Uuid::new_v4(); + raft_group.read_index(ReadIndexContext::fields_to_bytes(id, request, locked)); + + let pending_read_count = raft_group.raft.pending_read_count(); + let ready_read_count = raft_group.raft.ready_read_count(); + ( + id, + pending_read_count == last_pending_read_count && ready_read_count == last_ready_read_count, + ) +} + +pub fn should_renew_lease( + is_leader: bool, + is_splitting: bool, + is_merging: bool, + has_force_leader: bool, +) -> bool { + // A splitting leader should not renew its lease. + // Because we split regions asynchronous, the leader may read stale results + // if splitting runs slow on the leader. + // A merging leader should not renew its lease. 
+ // Because we merge regions asynchronous, the leader may read stale results + // if commit merge runs slow on sibling peers. + // when it enters force leader mode, should not renew lease. + is_leader && !is_splitting && !is_merging && !has_force_leader +} + +// check if the request can be amended to the last pending read? +// return true if it can. +pub fn can_amend_read( + last_pending_read: Option<&ReadIndexRequest>, + req: &RaftCmdRequest, + lease_state: LeaseState, + max_lease: TimeDuration, + now: Timespec, +) -> bool { + match lease_state { + // Here, combining the new read request with the previous one even if the lease expired + // is ok because in this case, the previous read index must be sent out with a valid + // lease instead of a suspect lease. So there must no pending transfer-leader + // proposals before or after the previous read index, and the lease can be renewed + // when get heartbeat responses. + LeaseState::Valid | LeaseState::Expired => { + if let Some(read) = last_pending_read { + let is_read_index_request = req + .get_requests() + .get(0) + .map(|req| req.has_read_index()) + .unwrap_or_default(); + // A read index request or a read with addition request always needs the + // response of checking memory lock for async + // commit, so we cannot apply the optimization here + if !is_read_index_request + && read.addition_request.is_none() + && read.propose_time + max_lease > now + { + return true; + } + } + } + // If the current lease is suspect, new read requests can't be appended into + // `pending_reads` because if the leader is transferred, the latest read could + // be dirty. 
_ => {} + } + false +} + #[derive(Clone, Debug)] pub struct UnsafeRecoveryForceLeaderSyncer(Arc); @@ -591,6 +689,40 @@ impl UnsafeRecoveryExecutePlanSyncer { *self.abort.lock().unwrap() = true; } } +// Syncer only send to leader in 2nd BR restore +#[derive(Clone, Debug)] +pub struct SnapshotRecoveryWaitApplySyncer { + _closure: Arc, + abort: Arc>, +} + +impl SnapshotRecoveryWaitApplySyncer { + pub fn new(region_id: u64, sender: SyncSender) -> Self { + let thread_safe_router = Mutex::new(sender); + let abort = Arc::new(Mutex::new(false)); + let abort_clone = abort.clone(); + let closure = InvokeClosureOnDrop(Box::new(move || { + info!("region {} wait apply finished", region_id); + if *abort_clone.lock().unwrap() { + warn!("wait apply aborted"); + return; + } + let router_ptr = thread_safe_router.lock().unwrap(); + + _ = router_ptr.send(region_id).map_err(|_| { + warn!("reply waitapply states failure."); + }); + })); + SnapshotRecoveryWaitApplySyncer { + _closure: Arc::new(closure), + abort, + } + } + + pub fn abort(&self) { + *self.abort.lock().unwrap() = true; + } +} #[derive(Clone, Debug)] pub struct UnsafeRecoveryWaitApplySyncer { @@ -675,6 +807,18 @@ impl UnsafeRecoveryFillOutReportSyncer { } } +pub enum SnapshotRecoveryState { + // This state is set by the leader peer fsm. Once set, it sync and check leader commit index + // and force forward to last index once follower appended and then it also is checked + // every time this peer applies the last index, if the last index is met, this state is + // reset / dropped. The syncer is dropped and send the response to the invoker, triggers + // the next step of recovery process. + WaitLogApplyToLast { + target_index: u64, + syncer: SnapshotRecoveryWaitApplySyncer, + }, +} + pub enum UnsafeRecoveryState { // Stores the state that is necessary for the wait apply stage of unsafe recovery process. // This state is set by the peer fsm. 
Once set, it is checked every time this peer applies a @@ -696,7 +840,7 @@ pub enum UnsafeRecoveryState { Destroy(UnsafeRecoveryExecutePlanSyncer), } -#[derive(Getters)] +#[derive(Getters, MutGetters)] pub struct Peer where EK: KvEngine, @@ -718,12 +862,19 @@ where peer_cache: RefCell>, /// Record the last instant of each peer's heartbeat response. pub peer_heartbeats: HashMap, + /// Record the waiting data status of each follower or learner peer. + pub wait_data_peers: Vec, - proposals: ProposalQueue, + proposals: ProposalQueue>, leader_missing_time: Option, - #[getset(get = "pub")] + #[getset(get = "pub", get_mut = "pub")] leader_lease: Lease, - pending_reads: ReadIndexQueue, + pending_reads: ReadIndexQueue>, + /// Threshold of long uncommitted proposals. + /// + /// Note that this is a dynamically changing value. Check the + /// `has_long_uncommitted_proposals` method for details. + long_uncommitted_threshold: Duration, /// If it fails to send messages to leader. pub leader_unreachable: bool, @@ -731,15 +882,42 @@ where pub should_wake_up: bool, /// Whether this peer is destroyed asynchronously. /// If it's true, - /// 1. when merging, its data in storeMeta will be removed early by the target peer. - /// 2. all read requests must be rejected. + /// - when merging, its data in storeMeta will be removed early by the + /// target peer. + /// - all read requests must be rejected. pub pending_remove: bool, + /// Currently it's used to indicate whether the witness -> non-witness + /// conversion operation is complete. The meaning of completion is that + /// this peer must contain the applied data, then PD can consider that + /// the conversion operation is complete, and can continue to schedule + /// other operators to prevent the existence of multiple witnesses in + /// the same time period. 
+ pub wait_data: bool, + + /// When the witness becomes non-witness, it needs to actively request a + /// snapshot from the leader, but the request may fail, so we need to save + /// the request index for retrying. + pub request_index: u64, + + /// It's used to identify the situation where the region worker is + /// generating and sending snapshots when the newly elected leader by Raft + /// applies the switch witness cmd which was committed before the election. This + /// flag will prevent immediate data clearing and will be cleared after + /// the successful transfer of leadership. + pub delay_clean_data: bool, + + /// When the witness becomes non-witness, it needs to actively request a + /// snapshot from the leader. In order to avoid log lag, we need to reject + /// the leader's `MsgAppend` request unless the `term` of the `last index` + /// is less than the peer's current `term`. + pub should_reject_msgappend: bool, /// Force leader state is only used in online recovery when the majority of - /// peers are missing. In this state, it forces one peer to become leader out - /// of accordance with Raft election rule, and forbids any read/write proposals. - /// With that, we can further propose remove failed-nodes conf-change, to make - /// the Raft group forms majority and works normally later on. + /// peers are missing. In this state, it forces one peer to become leader + /// out of accordance with Raft election rule, and forbids any + /// read/write proposals. With that, we can further propose remove + /// failed-nodes conf-change, to make the Raft group forms majority and + /// works normally later on. /// /// For details, see the comment of `ForceLeaderState`. pub force_leader: Option, @@ -756,16 +934,17 @@ where /// The count of deleted keys since last reset. delete_keys_hint: u64, /// An inaccurate difference in region size after compaction. 
- /// It is used to trigger check split to update approximate size and keys after space reclamation - /// of deleted entries. + /// It is used to trigger check split to update approximate size and keys + /// after space reclamation of deleted entries. pub compaction_declined_bytes: u64, /// Approximate size of the region. pub approximate_size: Option, /// Approximate keys of the region. pub approximate_keys: Option, - /// Whether this region has scheduled a split check task. If we just splitted - /// the region or ingested one file which may be overlapped with the existed data, - /// reset the flag so that the region can be splitted again. + /// Whether this region has scheduled a split check task. If we just + /// splitted the region or ingested one file which may be overlapped + /// with the existed data, reset the flag so that the region can be + /// splitted again. pub may_skip_split_check: bool, /// The state for consistency check. @@ -775,19 +954,32 @@ where pub pending_request_snapshot_count: Arc, /// The index of last scheduled committed raft log. pub last_applying_idx: u64, - /// The index of last compacted raft log. It is used for the next compact log task. + /// The index of last compacted raft log. It is used for the next compact + /// log task. pub last_compacted_idx: u64, + /// Record the time of the last raft log compact, the witness should query + /// the leader periodically whether `voter_replicated_index` is updated + /// if CompactLog admin command isn't triggered for a while. + pub last_compacted_time: Instant, + /// When the peer is witness, and there is any voter lagging behind, the + /// log truncation of the witness shouldn't be triggered even if it's + /// force mode, and this item will be set to `true`, after all pending + /// compact cmds have been handled, it will be set to `false`. + pub has_pending_compact_cmd: bool, /// The index of the latest urgent proposal index. 
last_urgent_proposal_idx: u64, /// The index of the latest committed split command. last_committed_split_idx: u64, + /// The index of last sent snapshot + last_sent_snapshot_idx: u64, /// Approximate size of logs that is applied but not compacted yet. pub raft_log_size_hint: u64, /// The write fence index. - /// If there are pessimistic locks, PrepareMerge can be proposed after applying to - /// this index. When a pending PrepareMerge exists, no more write commands should be proposed. - /// This avoids proposing pessimistic locks that are already deleted before PrepareMerge. + /// If there are pessimistic locks, PrepareMerge can be proposed after + /// applying to this index. When a pending PrepareMerge exists, no more + /// write commands should be proposed. This avoids proposing pessimistic + /// locks that are already deleted before PrepareMerge. pub prepare_merge_fence: u64, pub pending_prepare_merge: Option, @@ -813,8 +1005,8 @@ where pub replication_mode_version: u64, /// The required replication state at current version. pub dr_auto_sync_state: DrAutoSyncState, - /// A flag that caches sync state. It's set to true when required replication - /// state is reached for current region. + /// A flag that caches sync state. It's set to true when required + /// replication state is reached for current region. pub replication_sync: bool, /// The known newest conf version and its corresponding peer list @@ -862,9 +1054,10 @@ where /// region buckets. pub region_buckets: Option, pub last_region_buckets: Option, - /// lead_transferee if the peer is in a leadership transferring. + /// lead_transferee if this peer(leader) is in a leadership transferring. 
pub lead_transferee: u64, pub unsafe_recovery_state: Option, + pub snapshot_recovery_state: Option, } impl Peer @@ -876,12 +1069,14 @@ where store_id: u64, cfg: &Config, region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, engines: Engines, region: &metapb::Region, peer: metapb::Peer, + wait_data: bool, ) -> Result> { - if peer.get_id() == raft::INVALID_ID { + let peer_id = peer.get_id(); + if peer_id == raft::INVALID_ID { return Err(box_err!("invalid peer id")); } @@ -895,7 +1090,6 @@ where peer.get_id(), tag.clone(), )?; - let applied_index = ps.applied_index(); let raft_cfg = raft::Config { @@ -911,21 +1105,29 @@ where skip_bcast_commit: true, pre_vote: cfg.prevote, max_committed_size_per_ready: MAX_COMMITTED_SIZE_PER_READY, + priority: if peer.is_witness { -1 } else { 0 }, ..Default::default() }; let logger = slog_global::get_global().new(slog::o!("region_id" => region.get_id())); let raft_group = RawNode::new(&raft_cfg, ps, &logger)?; + let last_index = raft_group.store().last_index(); + // In order to avoid excessive log accumulation due to the loss of pending + // compaction cmds after the witness is restarted, it will actively pull + // voter_request_index once at start. 
+ let has_pending_compact_cmd = peer.is_witness; let mut peer = Peer { peer, region_id: region.get_id(), raft_group, raft_max_inflight_msgs: cfg.raft_max_inflight_msgs, - proposals: ProposalQueue::new(tag.clone()), - pending_reads: Default::default(), + proposals: ProposalQueue::new(region.get_id(), peer_id), + pending_reads: ReadIndexQueue::new(tag.clone()), + long_uncommitted_threshold: cfg.long_uncommitted_base_threshold.0, peer_cache: RefCell::new(HashMap::default()), peer_heartbeats: HashMap::default(), + wait_data_peers: Vec::default(), peers_start_pending_time: vec![], down_peer_ids: vec![], size_diff_hint: 0, @@ -936,6 +1138,10 @@ where compaction_declined_bytes: 0, leader_unreachable: false, pending_remove: false, + wait_data, + request_index: last_index, + delay_clean_data: false, + should_reject_msgappend: false, should_wake_up: false, force_leader: None, pending_merge_state: None, @@ -948,8 +1154,11 @@ where tag: tag.clone(), last_applying_idx: applied_index, last_compacted_idx: 0, + last_compacted_time: Instant::now(), + has_pending_compact_cmd, last_urgent_proposal_idx: u64::MAX, last_committed_split_idx: 0, + last_sent_snapshot_idx: 0, consistency_state: ConsistencyState { last_check_time: Instant::now(), index: INVALID_INDEX, @@ -981,7 +1190,7 @@ where region, applied_index, REGION_READ_PROGRESS_CAP, - tag.clone(), + peer_id, )), memtrace_raft_entries: 0, write_router: WriteRouter::new(tag), @@ -994,6 +1203,7 @@ where last_region_buckets: None, lead_transferee: raft::INVALID_ID, unsafe_recovery_state: None, + snapshot_recovery_state: None, }; // If this region has only one peer and I am the one, campaign directly. 
@@ -1001,6 +1211,9 @@ where peer.raft_group.campaign()?; } + let persisted_index = peer.raft_group.raft.raft_log.persisted; + peer.mut_store().update_cache_persisted(persisted_index); + Ok(peer) } @@ -1088,9 +1301,10 @@ where pub fn maybe_append_merge_entries(&mut self, merge: &CommitMergeRequest) -> Option { let mut entries = merge.get_entries(); if entries.is_empty() { - // Though the entries is empty, it is possible that one source peer has caught up the logs - // but commit index is not updated. If other source peers are already destroyed, so the raft - // group will not make any progress, namely the source peer can not get the latest commit index anymore. + // Though the entries is empty, it is possible that one source peer has caught + // up the logs but commit index is not updated. If other source peers are + // already destroyed, so the raft group will not make any progress, namely the + // source peer can not get the latest commit index anymore. // Here update the commit index to let source apply rest uncommitted entries. return if merge.get_commit() > self.raft_group.raft.raft_log.committed { self.raft_group.raft.raft_log.commit_to(merge.get_commit()); @@ -1109,9 +1323,9 @@ where "commit_index" => self.raft_group.raft.raft_log.committed, ); if log_idx < self.raft_group.raft.raft_log.committed { - // There are maybe some logs not included in CommitMergeRequest's entries, like CompactLog, - // so the commit index may exceed the last index of the entires from CommitMergeRequest. - // If that, no need to append + // There are maybe some logs not included in CommitMergeRequest's entries, like + // CompactLog, so the commit index may exceed the last index of the entires from + // CommitMergeRequest. 
If that, no need to append if self.raft_group.raft.raft_log.committed - log_idx >= entries.len() as u64 { return None; } @@ -1122,9 +1336,10 @@ where let last_log = entries.last().unwrap(); if last_log.term > self.term() { - // Hack: In normal flow, when leader sends the entries, it will use a term that's not less - // than the last log term. And follower will update its states correctly. For merge, we append - // the log without raft, so we have to take care of term explicitly to get correct metadata. + // Hack: In normal flow, when leader sends the entries, it will use a term + // that's not less than the last log term. And follower will update its states + // correctly. For merge, we append the log without raft, so we have to take care + // of term explicitly to get correct metadata. info!( "become follower for new logs"; "new_log_term" => last_log.term, @@ -1145,7 +1360,8 @@ where .map(|(_, last_index)| last_index) } - /// Tries to destroy itself. Returns a job (if needed) to do more cleaning tasks. + /// Tries to destroy itself. Returns a job (if needed) to do more cleaning + /// tasks. pub fn maybe_destroy(&mut self, ctx: &PollContext) -> Option { if self.pending_remove { info!( @@ -1189,15 +1405,15 @@ where // There is no applying snapshot or snapshot is canceled so the `apply_snap_ctx` // should be set to None. - // 1. If the snapshot is canceled, the `apply_snap_ctx` should be None. - // Remember the snapshot should not be canceled and the context should - // be None only after applying snapshot in normal case. But here is safe - // becasue this peer is about to destroy and `pending_remove` will be true, - // namely no more ready will be fetched. - // 2. If there is no applying snapshot, the `apply_snap_ctx` should also be None. - // It's possible that the snapshot was canceled successfully before but - // `cancel_applying_snap` returns false. If so, at this time, `apply_snap_ctx` - // is Some and should be set to None. 
+ // - If the snapshot is canceled, the `apply_snap_ctx` should be None. Remember + // the snapshot should not be canceled and the context should be None only + // after applying snapshot in normal case. But here is safe because this peer + // is about to destroy and `pending_remove` will be true, namely no more ready + // will be fetched. + // - If there is no applying snapshot, the `apply_snap_ctx` should also be None. + // It's possible that the snapshot was canceled successfully before but + // `cancel_applying_snap` returns false. If so, at this time, `apply_snap_ctx` + // is Some and should be set to None. self.apply_snap_ctx = None; self.pending_remove = true; @@ -1216,7 +1432,7 @@ where pub fn destroy( &mut self, engines: &Engines, - perf_context: &mut EK::PerfContext, + perf_context: &mut ER::PerfContext, keep_data: bool, pending_create_peers: &Mutex>, ) -> Result<()> { @@ -1253,14 +1469,15 @@ where panic!("{} unexpected pending states {:?}", self.tag, status); } } else { - // The status is inserted when it's created. It will be removed in following cases: - // 1. By appy worker as it fails to split due to region state key. This is - // impossible to reach this code path because the delete write batch is not - // persisted yet. - // 2. By store fsm as it fails to create peer, which is also invalid obviously. - // 3. By peer fsm after persisting snapshot, then it should be initialized. - // 4. By peer fsm after split. - // 5. By peer fsm when destroy, which should go the above branch instead. + // The status is inserted when it's created. It will be removed in following + // cases: + // - By apply worker as it fails to split due to region state key. This is + // impossible to reach this code path because the delete write batch is not + // persisted yet. + // - By store fsm as it fails to create peer, which is also invalid obviously. + // - By peer fsm after persisting snapshot, then it should be initialized. + // - By peer fsm after split. 
+ // - By peer fsm when destroy, which should go the above branch instead. (None, false) } } else { @@ -1270,16 +1487,16 @@ where // Set Tombstone state explicitly let mut kv_wb = engines.kv.write_batch(); let mut raft_wb = engines.raft.log_batch(1024); - // Raft log gc should be flushed before being destroyed, so last_compacted_idx has to be - // the minimal index that may still have logs. + // Raft log gc should be flushed before being destroyed, so last_compacted_idx + // has to be the minimal index that may still have logs. let last_compacted_idx = self.last_compacted_idx; self.mut_store() .clear_meta(last_compacted_idx, &mut kv_wb, &mut raft_wb)?; - // StoreFsmDelegate::check_msg use both epoch and region peer list to check whether - // a message is targing a staled peer. But for an uninitialized peer, both epoch and - // peer list are empty, so a removed peer will be created again. Saving current peer - // into the peer list of region will fix this problem. + // StoreFsmDelegate::check_msg use both epoch and region peer list to check + // whether a message is targeting a staled peer. But for an uninitialized peer, + // both epoch and peer list are empty, so a removed peer will be created again. + // Saving current peer into the peer list of region will fix this problem. if !self.get_store().is_initialized() { region.mut_peers().push(self.peer.clone()); } @@ -1306,7 +1523,7 @@ where perf_context.start_observe(); engines.raft.consume(&mut raft_wb, true)?; - perf_context.report_metrics(); + perf_context.report_metrics(&[]); if self.get_store().is_initialized() && !keep_data { // If we meet panic when deleting data and raft log, the dirty data @@ -1367,8 +1584,8 @@ where let last_index = self.raft_group.raft.raft_log.last_index(); for (id, pr) in status.progress.unwrap().iter() { // Even a recent inactive node is also considered. If we put leader into sleep, - // followers or learners may not sync its logs for a long time and become unavailable. 
- // We choose availability instead of performance in this case. + // followers or learners may not sync its logs for a long time and become + // unavailable. We choose availability instead of performance in this case. if *id == self.peer.get_id() { continue; } @@ -1398,6 +1615,14 @@ where res.reason = "replication mode"; return res; } + if !self.disk_full_peers.is_empty() { + res.reason = "has disk full peers"; + return res; + } + if !self.wait_data_peers.is_empty() { + res.reason = "has wait data peers"; + return res; + } res.up_to_date = true; res } @@ -1423,6 +1648,8 @@ where && !self.has_unresolved_reads() // If it becomes leader, the stats is not valid anymore. && !self.is_leader() + // Keep ticking if it's waiting for snapshot. + && !self.wait_data } } @@ -1466,13 +1693,13 @@ where ) { if self.region().get_region_epoch().get_version() < region.get_region_epoch().get_version() { - // Epoch version changed, disable read on the localreader for this region. + // Epoch version changed, disable read on the local reader for this region. self.leader_lease.expire_remote_lease(); } self.mut_store().set_region(region.clone()); let progress = ReadProgress::region(region); - // Always update read delegate's region to avoid stale region info after a follower - // becoming a leader. + // Always update read delegate's region to avoid stale region info after a + // follower becoming a leader. self.maybe_update_read_progress(reader, progress); // Update leader info @@ -1509,6 +1736,11 @@ where self.raft_group.raft.state == StateRole::Leader } + #[inline] + pub fn is_witness(&self) -> bool { + self.peer.is_witness + } + #[inline] pub fn get_role(&self) -> StateRole { self.raft_group.raft.state @@ -1531,7 +1763,8 @@ where self.apply_snap_ctx.is_some() || self.get_store().is_applying_snapshot() } - /// Returns `true` if the raft group has replicated a snapshot but not committed it yet. 
+ /// Returns `true` if the raft group has replicated a snapshot but not + /// committed it yet. #[inline] pub fn has_pending_snapshot(&self) -> bool { self.get_pending_snapshot().is_some() @@ -1542,19 +1775,28 @@ where self.raft_group.snap() } - fn add_ready_metric(&self, ready: &Ready, metrics: &mut RaftReadyMetrics) { - metrics.message += ready.messages().len() as u64; - metrics.commit += ready.committed_entries().len() as u64; - metrics.append += ready.entries().len() as u64; + fn add_ready_metric(&self, ready: &Ready, metrics: &mut RaftMetrics) { + metrics.ready.message.inc_by(ready.messages().len() as u64); + metrics + .ready + .commit + .inc_by(ready.committed_entries().len() as u64); + metrics.ready.append.inc_by(ready.entries().len() as u64); if !ready.snapshot().is_empty() { - metrics.snapshot += 1; + metrics.ready.snapshot.inc(); } } - fn add_light_ready_metric(&self, light_ready: &LightReady, metrics: &mut RaftReadyMetrics) { - metrics.message += light_ready.messages().len() as u64; - metrics.commit += light_ready.committed_entries().len() as u64; + fn add_light_ready_metric(&self, light_ready: &LightReady, metrics: &mut RaftMetrics) { + metrics + .ready + .message + .inc_by(light_ready.messages().len() as u64); + metrics + .ready + .commit + .inc_by(light_ready.committed_entries().len() as u64); } #[inline] @@ -1570,8 +1812,16 @@ where ctx: &mut PollContext, msgs: Vec, ) { + let mut now = None; + let std_now = Instant::now(); for msg in msgs { let msg_type = msg.get_message().get_msg_type(); + if msg_type == MessageType::MsgSnapshot { + let snap_index = msg.get_message().get_snapshot().get_metadata().get_index(); + if snap_index > self.last_sent_snapshot_idx { + self.last_sent_snapshot_idx = snap_index; + } + } if msg_type == MessageType::MsgTimeoutNow && self.is_leader() { // After a leader transfer procedure is triggered, the lease for // the old leader may be expired earlier than usual, since a new leader @@ -1579,7 +1829,7 @@ where // network 
partition from the new leader. // For lease safety during leader transfer, transit `leader_lease` // to suspect. - self.leader_lease.suspect(monotonic_raw_now()); + self.leader_lease.suspect(*now.insert(monotonic_raw_now())); } let to_peer_id = msg.get_to_peer().get_id(); @@ -1595,6 +1845,28 @@ where "disk_usage" => ?msg.get_disk_usage(), ); + for (term, index) in msg + .get_message() + .get_entries() + .iter() + .map(|e| (e.get_term(), e.get_index())) + { + if let Ok(idx) = self + .proposals + .queue + .binary_search_by_key(&index, |p: &Proposal<_>| p.index) + { + let proposal = &self.proposals.queue[idx]; + if term == proposal.term { + for tracker in proposal.cb.write_trackers() { + tracker.observe(std_now, &ctx.raft_metrics.wf_send_proposal, |t| { + &mut t.metrics.wf_send_proposal_nanos + }); + } + } + } + } + if let Err(e) = ctx.trans.send(msg) { // We use metrics to observe failure on production. debug!( @@ -1718,22 +1990,19 @@ where if !metrics.waterfall_metrics || self.proposals.is_empty() { return; } - let mut now = None; + let now = Instant::now(); for index in pre_persist_index + 1..=self.raft_group.raft.raft_log.persisted { - if let Some((term, times)) = self.proposals.find_request_times(index) { + if let Some((term, trackers)) = self.proposals.find_trackers(index) { if self .get_store() .term(index) .map(|t| t == term) .unwrap_or(false) { - if now.is_none() { - now = Some(TiInstant::now()); - } - for t in times { - metrics - .wf_persist_log - .observe(duration_to_sec(now.unwrap().saturating_duration_since(*t))); + for tracker in trackers { + tracker.observe(now, &metrics.wf_persist_log, |t| { + &mut t.metrics.wf_persist_log_nanos + }); } } } @@ -1744,25 +2013,26 @@ where if !metrics.waterfall_metrics || self.proposals.is_empty() { return; } - let mut now = None; + let now = Instant::now(); for index in pre_commit_index + 1..=self.raft_group.raft.raft_log.committed { - if let Some((term, times)) = self.proposals.find_request_times(index) { + if let 
Some((term, trackers)) = self.proposals.find_trackers(index) { if self .get_store() .term(index) .map(|t| t == term) .unwrap_or(false) { - if now.is_none() { - now = Some(TiInstant::now()); - } - let hist = if index <= self.raft_group.raft.raft_log.persisted { + let commit_persisted = index <= self.raft_group.raft.raft_log.persisted; + let hist = if commit_persisted { &metrics.wf_commit_log } else { &metrics.wf_commit_not_persist_log }; - for t in times { - hist.observe(duration_to_sec(now.unwrap().saturating_duration_since(*t))); + for tracker in trackers { + tracker.observe(now, hist, |t| { + t.metrics.commit_not_persisted = !commit_persisted; + &mut t.metrics.wf_commit_log_nanos + }); } } } @@ -1774,6 +2044,7 @@ where if !self.is_leader() { self.peer_heartbeats.clear(); self.peers_start_pending_time.clear(); + self.wait_data_peers.clear(); return; } @@ -1799,7 +2070,6 @@ where if p.get_id() == self.peer.get_id() { continue; } - // TODO if let Some(instant) = self.peer_heartbeats.get(&p.get_id()) { let elapsed = instant.saturating_elapsed(); if elapsed >= max_duration { @@ -1824,6 +2094,12 @@ where let status = self.raft_group.status(); let truncated_idx = self.get_store().truncated_index(); + for peer_id in &self.wait_data_peers { + if let Some(p) = self.get_peer_from_cache(*peer_id) { + pending_peers.push(p); + } + } + if status.progress.is_none() { return pending_peers; } @@ -1843,11 +2119,13 @@ where // 1. Current leader hasn't communicated with this peer. // 2. This peer does not exist yet(maybe it is created but not initialized) // - // The correctness of region merge depends on the fact that all target peers must exist during merging. - // (PD rely on `pending_peers` to check whether all target peers exist) + // The correctness of region merge depends on the fact that all target peers + // must exist during merging. (PD rely on `pending_peers` to check whether all + // target peers exist) // // So if the `matched` is 0, it must be a pending peer. 
- // It can be ensured because `truncated_index` must be greater than `RAFT_INIT_LOG_INDEX`(5). + // It can be ensured because `truncated_index` must be greater than + // `RAFT_INIT_LOG_INDEX`(5). if progress.matched < truncated_idx { if let Some(p) = self.get_peer_from_cache(id) { pending_peers.push(p); @@ -1898,6 +2176,9 @@ where if self.peers_start_pending_time[i].0 != peer_id { continue; } + if self.wait_data_peers.contains(&peer_id) { + continue; + } let truncated_idx = self.raft_group.store().truncated_index(); if let Some(progress) = self.raft_group.raft.prs().get(peer_id) { if progress.matched >= truncated_idx { @@ -1967,8 +2248,8 @@ where // Updates the `leader_missing_time` according to the current state. // // If we are checking this it means we suspect the leader might be missing. - // Mark down the time when we are called, so we can check later if it's been longer than it - // should be. + // Mark down the time when we are called, so we can check later if it's been + // longer than it should be. match self.leader_missing_time { None => { self.leader_missing_time = Instant::now().into(); @@ -2026,12 +2307,15 @@ where // prewrites or commits will be just a waste. self.last_urgent_proposal_idx = self.raft_group.raft.raft_log.last_index(); self.raft_group.skip_bcast_commit(false); + self.last_sent_snapshot_idx = self.raft_group.raft.raft_log.last_index(); // A more recent read may happen on the old leader. So max ts should // be updated after a peer becomes leader. self.require_updating_max_ts(&ctx.pd_scheduler); // Init the in-memory pessimistic lock table when the peer becomes leader. self.activate_in_memory_pessimistic_locks(); + // Exit entry cache warmup state when the peer becomes leader. 
+ self.mut_store().clear_entry_cache_warmup_state(); if !ctx.store_disk_usages.is_empty() { self.refill_disk_full_peers(ctx); @@ -2047,6 +2331,10 @@ where self.mut_store().cancel_generating_snap(None); self.clear_disk_full_peers(ctx); self.clear_in_memory_pessimistic_locks(); + if self.peer.is_witness && self.delay_clean_data { + let _ = self.get_store().clear_data(); + self.delay_clean_data = false; + } } _ => {} } @@ -2058,6 +2346,8 @@ where leader_id: ss.leader_id, prev_lead_transferee: self.lead_transferee, vote: self.raft_group.raft.vote, + initialized: self.is_initialized(), + peer_id: self.peer.get_id(), }, ); self.cmd_epoch_checker.maybe_update_term(self.term()); @@ -2069,27 +2359,30 @@ where self.lead_transferee = self.raft_group.raft.lead_transferee.unwrap_or_default(); } - /// Correctness depends on the order between calling this function and notifying other peers - /// the new commit index. - /// It is due to the interaction between lease and split/merge.(details are decribed below) + /// Correctness depends on the order between calling this function and + /// notifying other peers the new commit index. + /// It is due to the interaction between lease and split/merge.(details are + /// described below) /// - /// Note that in addition to the hearbeat/append msg, the read index response also can notify - /// other peers the new commit index. There are three place where TiKV handles read index resquest. - /// The first place is in raft-rs, so it's like hearbeat/append msg, call this function and - /// then send the response. The second place is in `Step`, we should use the commit index - /// of `PeerStorage` which is the greatest commit index that can be observed outside. - /// The third place is in `read_index`, handle it like the second one. + /// Note that in addition to the heartbeat/append msg, the read index + /// response also can notify other peers the new commit index. There are + /// three place where TiKV handles read index request. 
The first place is in + /// raft-rs, so it's like heartbeat/append msg, call this function and then + /// send the response. The second place is in `Step`, we should use the + /// commit index of `PeerStorage` which is the greatest commit index that + /// can be observed outside. The third place is in `read_index`, handle it + /// like the second one. fn on_leader_commit_idx_changed(&mut self, pre_commit_index: u64, commit_index: u64) { if commit_index <= pre_commit_index || !self.is_leader() { return; } - // The admin cmds in `CmdEpochChecker` are proposed by the current leader so we can - // use it to get the split/prepare-merge cmds which was committed just now. + // The admin cmds in `CmdEpochChecker` are proposed by the current leader so we + // can use it to get the split/prepare-merge cmds which was committed just now. - // BatchSplit and Split cmd are mutually exclusive because they both change epoch's - // version so only one of them can be proposed and the other one will be rejected - // by `CmdEpochChecker`. + // BatchSplit and Split cmd are mutually exclusive because they both change + // epoch's version so only one of them can be proposed and the other one will be + // rejected by `CmdEpochChecker`. let last_split_idx = self .cmd_epoch_checker .last_cmd_index(AdminCmdType::BatchSplit) @@ -2146,9 +2439,14 @@ where // by apply worker. So we have to wait here. // Please note that commit_index can't be used here. When applying a snapshot, // a stale heartbeat can make the leader think follower has already applied - // the snapshot, and send remaining log entries, which may increase commit_index. + // the snapshot, and send remaining log entries, which may increase + // commit_index. + // + // If it's witness before, but a command changes it to non-witness, it will stop + // applying all following command, therefore, add the judgment of `wait_data` to + // avoid applying snapshot is also blocked. 
// TODO: add more test - self.last_applying_idx == self.get_store().applied_index() + (self.last_applying_idx == self.get_store().applied_index() || self.wait_data) // Requesting snapshots also triggers apply workers to write // apply states even if there is no pending committed entry. // TODO: Instead of sharing the counter, we should apply snapshots @@ -2160,9 +2458,9 @@ where fn ready_to_handle_read(&self) -> bool { // TODO: It may cause read index to wait a long time. - // There may be some values that are not applied by this leader yet but the old leader, - // if applied_index_term isn't equal to current term. - self.get_store().applied_index_term() == self.term() + // There may be some values that are not applied by this leader yet but the old + // leader, if applied_term isn't equal to current term. + self.get_store().applied_term() == self.term() // There may be stale read if the old leader splits really slow, // the new region may already elected a new leader while // the old leader still think it owns the split range. @@ -2176,9 +2474,9 @@ where fn ready_to_handle_unsafe_replica_read(&self, read_index: u64) -> bool { // Wait until the follower applies all values before the read. There is still a - // problem if the leader applies fewer values than the follower, the follower read - // could get a newer value, and after that, the leader may read a stale value, - // which violates linearizability. + // problem if the leader applies fewer values than the follower, the follower + // read could get a newer value, and after that, the leader may read a + // stale value, which violates linearizability. self.get_store().applied_index() >= read_index // If it is in pending merge state(i.e. applied PrepareMerge), the data may be stale. 
// TODO: Add a test to cover this case @@ -2189,12 +2487,12 @@ where } #[inline] - fn is_splitting(&self) -> bool { + pub fn is_splitting(&self) -> bool { self.last_committed_split_idx > self.get_store().applied_index() } #[inline] - fn is_merging(&self) -> bool { + pub fn is_merging(&self) -> bool { self.last_committed_prepare_merge_idx > self.get_store().applied_index() || self.pending_merge_state.is_some() } @@ -2238,17 +2536,19 @@ where /// Returns whether it's valid to handle raft ready. /// /// The snapshot process order would be: - /// 1. Get the snapshot from the ready - /// 2. Wait for the notify of persisting this ready through `Peer::on_persist_ready` - /// 3. Schedule the snapshot task to region worker through `schedule_applying_snapshot` - /// 4. Wait for applying snapshot to complete(`check_snap_status`) + /// - Get the snapshot from the ready + /// - Wait for the notify of persisting this ready through + /// `Peer::on_persist_ready` + /// - Schedule the snapshot task to region worker through + /// `schedule_applying_snapshot` + /// - Wait for applying snapshot to complete(`check_snap_status`) /// Then it's valid to handle the next ready. fn check_snap_status(&mut self, ctx: &mut PollContext) -> bool { if let Some(snap_ctx) = self.apply_snap_ctx.as_ref() { if !snap_ctx.scheduled { // There is a snapshot from ready but it is not scheduled because the ready has - // not been persisted yet. We should wait for the notification of persisting ready - // and do not get a new ready. + // not been persisted yet. We should wait for the notification of persisting + // ready and do not get a new ready. 
return false; } } @@ -2301,7 +2601,11 @@ where if self.unsafe_recovery_state.is_some() { debug!("unsafe recovery finishes applying a snapshot"); - self.unsafe_recovery_maybe_finish_wait_apply(/*force=*/ false); + self.unsafe_recovery_maybe_finish_wait_apply(/* force= */ false); + } + if self.snapshot_recovery_state.is_some() { + debug!("snapshot recovery finishes applying a snapshot"); + self.snapshot_recovery_maybe_finish_wait_apply(false); } } // If `apply_snap_ctx` is none, it means this snapshot does not @@ -2312,9 +2616,19 @@ where // i.e. call `RawNode::advance_apply_to`. self.post_pending_read_index_on_replica(ctx); // Resume `read_progress` + self.update_read_progress(ctx, ReadProgress::WaitData(false)); self.read_progress.resume(); // Update apply index to `last_applying_idx` - self.read_progress.update_applied(self.last_applying_idx); + self.read_progress + .update_applied(self.last_applying_idx, &ctx.coprocessor_host); + if self.wait_data { + self.notify_leader_the_peer_is_available(ctx); + ctx.apply_router + .schedule_task(self.region_id, ApplyTask::Recover(self.region_id)); + self.wait_data = false; + self.should_reject_msgappend = false; + return false; + } } CheckApplyingSnapStatus::Idle => { // FIXME: It's possible that the snapshot applying task is canceled. 
@@ -2331,6 +2645,26 @@ where true } + fn notify_leader_the_peer_is_available( + &mut self, + ctx: &mut PollContext, + ) { + fail_point!("ignore notify leader the peer is available", |_| {}); + let leader_id = self.leader_id(); + let leader = self.get_peer_from_cache(leader_id); + if let Some(leader) = leader { + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgAvailabilityResponse); + msg.wait_data = false; + self.send_extra_message(msg, &mut ctx.trans, &leader); + info!( + "notify leader the peer is available"; + "region id" => self.region().get_id(), + "peer id" => self.peer.id + ); + } + } + pub fn handle_raft_ready_append( &mut self, ctx: &mut PollContext, @@ -2369,9 +2703,9 @@ where } let meta = ctx.store_meta.lock().unwrap(); - // For merge process, the stale source peer is destroyed asynchronously when applying - // snapshot or creating new peer. So here checks whether there is any overlap, if so, - // wait and do not handle raft ready. + // For merge process, the stale source peer is destroyed asynchronously when + // applying snapshot or creating new peer. So here checks whether there is any + // overlap, if so, wait and do not handle raft ready. if let Some(wait_destroy_regions) = meta.atomic_snap_regions.get(&self.region_id) { for (source_region_id, is_ready) in wait_destroy_regions { if !is_ready { @@ -2426,12 +2760,12 @@ where let mut ready = self.raft_group.ready(); - self.add_ready_metric(&ready, &mut ctx.raft_metrics.ready); + self.add_ready_metric(&ready, &mut ctx.raft_metrics); // Update it after unstable entries pagination is introduced. 
debug_assert!(ready.entries().last().map_or_else( || true, - |entry| entry.index == self.raft_group.raft.raft_log.last_index() + |entry| entry.index == self.raft_group.raft.raft_log.last_index(), )); if self.memtrace_raft_entries != 0 { MEMTRACE_RAFT_ENTRIES.trace(TraceEvent::Sub(self.memtrace_raft_entries)); @@ -2483,20 +2817,17 @@ where let state_role = ready.ss().map(|ss| ss.raft_state); let has_new_entries = !ready.entries().is_empty(); - let mut request_times = vec![]; + let mut trackers = vec![]; if ctx.raft_metrics.waterfall_metrics { - let mut now = None; + let now = Instant::now(); for entry in ready.entries() { - if let Some((term, times)) = self.proposals.find_request_times(entry.get_index()) { + if let Some((term, times)) = self.proposals.find_trackers(entry.get_index()) { if entry.term == term { - request_times.extend_from_slice(times); - if now.is_none() { - now = Some(TiInstant::now()); - } - for t in times { - ctx.raft_metrics.wf_send_to_queue.observe(duration_to_sec( - now.unwrap().saturating_duration_since(*t), - )); + for tracker in times { + trackers.push(*tracker); + tracker.observe(now, &ctx.raft_metrics.wf_send_to_queue, |t| { + &mut t.metrics.wf_send_to_queue_nanos + }); } } } @@ -2518,13 +2849,13 @@ where let persisted_msgs = ready.take_persisted_messages(); let mut has_write_ready = false; match &res { - HandleReadyResult::SendIOTask | HandleReadyResult::Snapshot { .. } => { + HandleReadyResult::SendIoTask | HandleReadyResult::Snapshot { .. 
} => { if !persisted_msgs.is_empty() { task.messages = self.build_raft_messages(ctx, persisted_msgs); } - if !request_times.is_empty() { - task.request_times = request_times; + if !trackers.is_empty() { + task.trackers = trackers; } if let Some(write_worker) = &mut ctx.sync_write_worker { @@ -2549,7 +2880,7 @@ where self.raft_group.advance_append_async(ready); } } - HandleReadyResult::NoIOTask => { + HandleReadyResult::NoIoTask => { if let Some(last) = self.unpersisted_readies.back_mut() { // Attach to the last unpersisted ready so that it can be considered to be // persisted with the last ready at the same time. @@ -2566,8 +2897,9 @@ where last.raft_msgs.push(persisted_msgs); } } else { - // If this ready don't need to be persisted and there is no previous unpersisted ready, - // we can safely consider it is persisted so the persisted msgs can be sent immediately. + // If this ready don't need to be persisted and there is no previous unpersisted + // ready, we can safely consider it is persisted so the persisted msgs can be + // sent immediately. self.persisted_number = ready_number; if !persisted_msgs.is_empty() { @@ -2576,11 +2908,11 @@ where self.send_raft_messages(ctx, msgs); } - // The commit index and messages of light ready should be empty because no data needs - // to be persisted. + // The commit index and messages of light ready should be empty because no data + // needs to be persisted. 
let mut light_rd = self.raft_group.advance_append(ready); - self.add_light_ready_metric(&light_rd, &mut ctx.raft_metrics.ready); + self.add_light_ready_metric(&light_rd, &mut ctx.raft_metrics); if let Some(idx) = light_rd.commit_index() { panic!( @@ -2604,13 +2936,20 @@ where } } - if let HandleReadyResult::Snapshot { + if let HandleReadyResult::Snapshot(box HandleSnapshotResult { msgs, snap_region, destroy_regions, last_first_index, - } = res + for_witness, + }) = res { + if for_witness { + // inform next round to check apply status + ctx.router + .send_casual_msg(snap_region.get_id(), CasualMessage::SnapshotApplied) + .unwrap(); + } // When applying snapshot, there is no log applied and not compacted yet. self.raft_log_size_hint = 0; @@ -2622,6 +2961,7 @@ where prev_region: self.region().clone(), region: snap_region, destroy_regions, + for_witness, }), }); if self.last_compacted_idx == 0 && last_first_index >= RAFT_INIT_LOG_INDEX { @@ -2673,9 +3013,9 @@ where .find_propose_time(entry.get_term(), entry.get_index()); if let Some(propose_time) = propose_time { // We must renew current_time because this value may be created a long time ago. - // If we do not renew it, this time may be smaller than propose_time of a command, - // which was proposed in another thread while this thread receives its AppendEntriesResponse - // and is ready to calculate its commit-log-duration. + // If we do not renew it, this time may be smaller than propose_time of a + // command, which was proposed in another thread while this thread receives its + // AppendEntriesResponse and is ready to calculate its commit-log-duration. ctx.current_time.replace(monotonic_raw_now()); ctx.raft_metrics.commit_log.observe(duration_to_sec( (ctx.current_time.unwrap() - propose_time).to_std().unwrap(), @@ -2725,6 +3065,7 @@ where } else { vec![] }; + // Note that the `commit_index` and `commit_term` here may be used to // forward the commit index. So it must be less than or equal to persist // index. 
@@ -2733,6 +3074,7 @@ where self.raft_group.raft.raft_log.persisted, ); let commit_term = self.get_store().term(commit_index).unwrap(); + let mut apply = Apply::new( self.peer_id(), self.region_id, @@ -2748,7 +3090,7 @@ where .trace_cached_entries(apply.entries[0].clone()); if needs_evict_entry_cache(ctx.cfg.evict_cache_on_memory_ratio) { // Compact all cached entries instead of half evict. - self.mut_store().evict_cache(false); + self.mut_store().evict_entry_cache(false); } ctx.apply_router .schedule_task(self.region_id, ApplyTask::apply(apply)); @@ -2756,6 +3098,57 @@ where fail_point!("after_send_to_apply_1003", self.peer_id() == 1003, |_| {}); } + /// Check long uncommitted proposals and log some info to help find why. + pub fn check_long_uncommitted_proposals(&mut self, ctx: &mut PollContext) { + if self.has_long_uncommitted_proposals(ctx) { + let status = self.raft_group.status(); + let mut buffer: Vec<(u64, u64, u64)> = Vec::new(); + if let Some(prs) = status.progress { + for (id, p) in prs.iter() { + buffer.push((*id, p.commit_group_id, p.matched)); + } + } + warn!( + "found long uncommitted proposals"; + "region_id" => self.region_id, + "peer_id" => self.peer.get_id(), + "progress" => ?buffer, + "cache_first_index" => ?self.get_store().entry_cache_first_index(), + "next_turn_threshold" => ?self.long_uncommitted_threshold, + ); + } + } + + /// Check if there is long uncommitted proposal. + /// + /// This will increase the threshold when a long uncommitted proposal is + /// detected, and reset the threshold when there is no long uncommitted + /// proposal. + fn has_long_uncommitted_proposals(&mut self, ctx: &mut PollContext) -> bool { + let mut has_long_uncommitted = false; + let base_threshold = ctx.cfg.long_uncommitted_base_threshold.0; + if let Some(propose_time) = self.proposals.oldest().and_then(|p| p.propose_time) { + // When a proposal was proposed with this ctx before, the current_time can be + // some. 
+ let current_time = *ctx.current_time.get_or_insert_with(monotonic_raw_now); + let elapsed = match (current_time - propose_time).to_std() { + Ok(elapsed) => elapsed, + Err(_) => return false, + }; + // Increase the threshold for next turn when a long uncommitted proposal is + // detected. + if elapsed >= self.long_uncommitted_threshold { + has_long_uncommitted = true; + self.long_uncommitted_threshold += base_threshold; + } else if elapsed < base_threshold { + self.long_uncommitted_threshold = base_threshold; + } + } else { + self.long_uncommitted_threshold = base_threshold; + } + has_long_uncommitted + } + fn on_persist_snapshot( &mut self, ctx: &mut PollContext, @@ -2791,6 +3184,8 @@ where "after" => ?peer, ); self.peer = peer; + self.raft_group + .set_priority(if self.peer.is_witness { -1 } else { 0 }); }; self.activate(ctx); @@ -2850,7 +3245,8 @@ where self.mut_store().update_cache_persisted(persist_index); if let Some(ForceLeaderState::ForceLeader { .. }) = self.force_leader { - // forward commit index, the committed entries will be applied in the next raft base tick round + // forward commit index, the committed entries will be applied in the next raft + // base tick round self.maybe_force_forward_commit_index(); } } @@ -2892,12 +3288,13 @@ where let persist_index = self.raft_group.raft.raft_log.persisted; if let Some(ForceLeaderState::ForceLeader { .. 
}) = self.force_leader { - // forward commit index, the committed entries will be applied in the next raft base tick round + // forward commit index, the committed entries will be applied in the next raft + // base tick round self.maybe_force_forward_commit_index(); } self.mut_store().update_cache_persisted(persist_index); - self.add_light_ready_metric(&light_rd, &mut ctx.raft_metrics.ready); + self.add_light_ready_metric(&light_rd, &mut ctx.raft_metrics); if let Some(commit_index) = light_rd.commit_index() { let pre_commit_index = self.get_store().commit_index(); @@ -2936,7 +3333,7 @@ where fn response_read( &self, - read: &mut ReadIndexRequest, + read: &mut ReadIndexRequest>, ctx: &mut PollContext, replica_read: bool, ) { @@ -2947,7 +3344,14 @@ where "peer_id" => self.peer.get_id(), ); RAFT_READ_INDEX_PENDING_COUNT.sub(read.cmds().len() as i64); + let time = monotonic_raw_now(); for (req, cb, mut read_index) in read.take_cmds().drain(..) { + cb.read_tracker().map(|tracker| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { + t.metrics.read_index_confirm_wait_nanos = + (time - read.propose_time).to_std().unwrap().as_nanos() as u64; + }) + }); // leader reports key is locked if let Some(locked) = read.locked.take() { let mut response = raft_cmdpb::Response::default(); @@ -2992,7 +3396,8 @@ where } } - /// Responses to the ready read index request on the replica, the replica is not a leader. + /// Responses to the ready read index request on the replica, the replica is + /// not a leader. 
fn post_pending_read_index_on_replica(&mut self, ctx: &mut PollContext) { while let Some(mut read) = self.pending_reads.pop_front() { // The response of this read index request is lost, but we need it for @@ -3006,7 +3411,7 @@ where info!( "re-propose read index request because the response is lost"; "region_id" => self.region_id, - "peer_id" => self.peer.get_id(), + "peer_id" => self.peer_id(), ); RAFT_READ_INDEX_PENDING_COUNT.sub(1); self.send_read_command(ctx, read_cmd); @@ -3071,13 +3476,13 @@ where // update the `read_index` of read request that before this successful // `ready`. if !self.is_leader() { - // NOTE: there could still be some pending reads proposed by the peer when it was - // leader. They will be cleared in `clear_uncommitted_on_role_change` later in - // the function. + // NOTE: there could still be some pending reads proposed by the peer when it + // was leader. They will be cleared in `clear_uncommitted_on_role_change` later + // in the function. self.pending_reads.advance_replica_reads(states); self.post_pending_read_index_on_replica(ctx); } else { - self.pending_reads.advance_leader_reads(&self.tag, states); + self.pending_reads.advance_leader_reads(states); propose_time = self.pending_reads.last_ready().map(|r| r.propose_time); if self.ready_to_handle_read() { while let Some(mut read) = self.pending_reads.pop_front() { @@ -3106,7 +3511,7 @@ where &mut self, ctx: &mut PollContext, apply_state: RaftApplyState, - applied_index_term: u64, + applied_term: u64, apply_metrics: &ApplyMetrics, ) -> bool { let mut has_ready = false; @@ -3126,12 +3531,12 @@ where if !self.is_leader() { self.mut_store() - .compact_cache_to(apply_state.applied_index + 1); + .compact_entry_cache(apply_state.applied_index + 1); } - let progress_to_be_updated = self.mut_store().applied_index_term() != applied_index_term; - self.mut_store().set_applied_state(apply_state); - self.mut_store().set_applied_term(applied_index_term); + let progress_to_be_updated = 
self.mut_store().applied_term() != applied_term; + self.mut_store().set_apply_state(apply_state); + self.mut_store().set_applied_term(applied_term); self.peer_stat.written_keys += apply_metrics.written_keys; self.peer_stat.written_bytes += apply_metrics.written_bytes; @@ -3151,15 +3556,16 @@ where } self.pending_reads.gc(); - self.read_progress.update_applied(applied_index); + self.read_progress + .update_applied(applied_index, &ctx.coprocessor_host); - // Only leaders need to update applied_index_term. + // Only leaders need to update applied_term. if progress_to_be_updated && self.is_leader() { - if applied_index_term == self.term() { + if applied_term == self.term() { ctx.coprocessor_host .on_applied_current_term(StateRole::Leader, self.region()); } - let progress = ReadProgress::applied_index_term(applied_index_term); + let progress = ReadProgress::applied_term(applied_term); let mut meta = ctx.store_meta.lock().unwrap(); let reader = meta.readers.get_mut(&self.region_id).unwrap(); self.maybe_update_read_progress(reader, progress); @@ -3189,31 +3595,16 @@ where progress: Option, ) { // A nonleader peer should never has leader lease. - let read_progress = if !self.is_leader() { - None - } else if self.is_splitting() { - // A splitting leader should not renew its lease. - // Because we split regions asynchronous, the leader may read stale results - // if splitting runs slow on the leader. - debug!( - "prevents renew lease while splitting"; - "region_id" => self.region_id, - "peer_id" => self.peer.get_id(), - ); - None - } else if self.is_merging() { - // A merging leader should not renew its lease. - // Because we merge regions asynchronous, the leader may read stale results - // if commit merge runs slow on sibling peers. 
- debug!( - "prevents renew lease while merging"; - "region_id" => self.region_id, - "peer_id" => self.peer.get_id(), - ); + let read_progress = if !should_renew_lease( + self.is_leader(), + self.is_splitting(), + self.is_merging(), + self.force_leader.is_some(), + ) { None - } else if self.force_leader.is_some() { + } else if self.region().is_in_flashback { debug!( - "prevents renew lease while in force leader state"; + "prevents renew lease while in flashback state"; "region_id" => self.region_id, "peer_id" => self.peer.get_id(), ); @@ -3250,6 +3641,16 @@ where reader.update(progress); } + pub fn update_read_progress( + &self, + ctx: &mut PollContext, + progress: ReadProgress, + ) { + let mut meta = ctx.store_meta.lock().unwrap(); + let reader = meta.readers.get_mut(&self.region_id).unwrap(); + self.maybe_update_read_progress(reader, progress); + } + pub fn maybe_campaign(&mut self, parent_is_leader: bool) -> bool { if self.region().get_peers().len() <= 1 { // The peer campaigned when it was created, no need to do it again. @@ -3266,22 +3667,22 @@ where true } - /// Propose a request. + /// Proposes a request. /// - /// Return true means the request has been proposed successfully. + /// Return whether the request has been proposed successfully. pub fn propose( &mut self, ctx: &mut PollContext, mut cb: Callback, req: RaftCmdRequest, mut err_resp: RaftCmdResponse, - disk_full_opt: DiskFullOpt, + mut disk_full_opt: DiskFullOpt, ) -> bool { if self.pending_remove { return false; } - ctx.raft_metrics.propose.all += 1; + ctx.raft_metrics.propose.all.inc(); let req_admin_cmd_type = if !req.has_admin_request() { None @@ -3302,53 +3703,11 @@ where } Ok(RequestPolicy::ProposeNormal) => { // For admin cmds, only region split/merge comes here. 
- let mut stores = Vec::new(); - let mut opt = disk_full_opt; - let mut maybe_transfer_leader = false; if req.has_admin_request() { - opt = DiskFullOpt::AllowedOnAlmostFull; - } - if self.check_proposal_normal_with_disk_usage( - ctx, - opt, - &mut stores, - &mut maybe_transfer_leader, - ) { - self.propose_normal(ctx, req) - } else { - // If leader node is disk full, try to transfer leader to a node with disk usage normal to - // keep write availablity not downback. - // if majority node is disk full, to transfer leader or not is not necessary. - // Note: Need to exclude learner node. - if maybe_transfer_leader && !self.disk_full_peers.majority { - let target_peer = self - .get_store() - .region() - .get_peers() - .iter() - .find(|x| { - !self.disk_full_peers.has(x.get_id()) - && x.get_id() != self.peer.get_id() - && !self.down_peer_ids.contains(&x.get_id()) - && !matches!(x.get_role(), PeerRole::Learner) - }) - .cloned(); - if let Some(p) = target_peer { - debug!( - "try to transfer leader because of current leader disk full: region id = {}, peer id = {}; target peer id = {}", - self.region_id, - self.peer.get_id(), - p.get_id() - ); - self.pre_transfer_leader(&p); - } - } - let errmsg = format!( - "propose failed: tikv disk full, cmd diskFullOpt={:?}, leader diskUsage={:?}", - disk_full_opt, ctx.self_disk_usage - ); - Err(Error::DiskFull(stores, errmsg)) + disk_full_opt = DiskFullOpt::AllowedOnAlmostFull; } + self.check_normal_proposal_with_disk_full_opt(ctx, disk_full_opt) + .and_then(|_| self.propose_normal(ctx, req)) } Ok(RequestPolicy::ProposeConfChange) => self.propose_conf_change(ctx, &req), Err(e) => Err(e), @@ -3372,8 +3731,9 @@ where Ok(Either::Left(idx)) => { let has_applied_to_current_term = self.has_applied_to_current_term(); if has_applied_to_current_term { - // After this peer has applied to current term and passed above checking including `cmd_epoch_checker`, - // we can safely guarantee that this proposal will be committed if there is no abnormal 
leader transfer + // After this peer has applied to current term and passed above checking + // including `cmd_epoch_checker`, we can safely guarantee + // that this proposal will be committed if there is no abnormal leader transfer // in the near future. Thus proposed callback can be called. cb.invoke_proposed(); } @@ -3392,6 +3752,7 @@ where cb, propose_time: None, must_pass_epoch_check: has_applied_to_current_term, + sent: false, }; if let Some(cmd_type) = req_admin_cmd_type { self.cmd_epoch_checker @@ -3427,7 +3788,7 @@ where fn post_propose( &mut self, poll_ctx: &mut PollContext, - mut p: Proposal, + mut p: Proposal>, ) { // Try to renew leader lease on every consistent read/write request. if poll_ctx.current_time.is_none() { @@ -3438,136 +3799,6 @@ where self.proposals.push(p); } - // TODO: set higher election priority of voter/incoming voter than demoting voter - /// Validate the `ConfChange` requests and check whether it's safe to - /// propose these conf change requests. - /// It's safe iff at least the quorum of the Raft group is still healthy - /// right after all conf change is applied. - /// If 'allow_remove_leader' is false then the peer to be removed should - /// not be the leader. 
- fn check_conf_change( - &mut self, - ctx: &mut PollContext, - change_peers: &[ChangePeerRequest], - cc: &impl ConfChangeI, - ) -> Result<()> { - // Check whether current joint state can handle this request - let mut after_progress = self.check_joint_state(cc)?; - let current_progress = self.raft_group.status().progress.unwrap().clone(); - let kind = ConfChangeKind::confchange_kind(change_peers.len()); - - if kind == ConfChangeKind::LeaveJoint { - if self.peer.get_role() == PeerRole::DemotingVoter && !self.is_force_leader() { - return Err(box_err!( - "{} ignore leave joint command that demoting leader", - self.tag - )); - } - // Leaving joint state, skip check - return Ok(()); - } - - // Check whether this request is valid - let mut check_dup = HashSet::default(); - let mut only_learner_change = true; - let current_voter = current_progress.conf().voters().ids(); - for cp in change_peers.iter() { - let (change_type, peer) = (cp.get_change_type(), cp.get_peer()); - match (change_type, peer.get_role()) { - (ConfChangeType::RemoveNode, PeerRole::Voter) if kind != ConfChangeKind::Simple => { - return Err(box_err!( - "{} invalid conf change request: {:?}, can not remove voter directly", - self.tag, - cp - )); - } - (ConfChangeType::RemoveNode, _) - | (ConfChangeType::AddNode, PeerRole::Voter) - | (ConfChangeType::AddLearnerNode, PeerRole::Learner) => {} - _ => { - return Err(box_err!( - "{} invalid conf change request: {:?}", - self.tag, - cp - )); - } - } - - if !check_dup.insert(peer.get_id()) { - return Err(box_err!( - "{} invalid conf change request, have multiple commands for the same peer {}", - self.tag, - peer.get_id() - )); - } - - if peer.get_id() == self.peer_id() - && (change_type == ConfChangeType::RemoveNode - // In Joint confchange, the leader is allowed to be DemotingVoter - || (kind == ConfChangeKind::Simple - && change_type == ConfChangeType::AddLearnerNode)) - && !ctx.cfg.allow_remove_leader() - { - return Err(box_err!( - "{} ignore remove leader or 
demote leader", - self.tag - )); - } - - if current_voter.contains(peer.get_id()) || change_type == ConfChangeType::AddNode { - only_learner_change = false; - } - } - - // Multiple changes that only effect learner will not product `IncommingVoter` or `DemotingVoter` - // after apply, but raftstore layer and PD rely on these roles to detect joint state - if kind != ConfChangeKind::Simple && only_learner_change { - return Err(box_err!( - "{} invalid conf change request, multiple changes that only effect learner", - self.tag - )); - } - - let promoted_commit_index = after_progress.maximal_committed_index().0; - if current_progress.is_singleton() // It's always safe if there is only one node in the cluster. - || promoted_commit_index >= self.get_store().truncated_index() || self.force_leader.is_some() - { - return Ok(()); - } - - PEER_ADMIN_CMD_COUNTER_VEC - .with_label_values(&["conf_change", "reject_unsafe"]) - .inc(); - - // Waking it up to replicate logs to candidate. - self.should_wake_up = true; - Err(box_err!( - "{} unsafe to perform conf change {:?}, before: {:?}, after: {:?}, truncated index {}, promoted commit index {}", - self.tag, - change_peers, - current_progress.conf().to_conf_state(), - after_progress.conf().to_conf_state(), - self.get_store().truncated_index(), - promoted_commit_index - )) - } - - /// Check if current joint state can handle this confchange - fn check_joint_state(&mut self, cc: &impl ConfChangeI) -> Result { - let cc = &cc.as_v2(); - let mut prs = self.raft_group.status().progress.unwrap().clone(); - let mut changer = Changer::new(&prs); - let (cfg, changes) = if cc.leave_joint() { - changer.leave_joint()? - } else if let Some(auto_leave) = cc.enter_joint() { - changer.enter_joint(auto_leave, &cc.changes)? - } else { - changer.simple(&cc.changes)? 
- }; - prs.apply_conf(cfg, changes, self.raft_group.raft.raft_log.last_index()); - Ok(prs) - } - pub fn transfer_leader(&mut self, peer: &metapb::Peer) { info!( "transfer leader"; @@ -3595,13 +3826,15 @@ where // Broadcast heartbeat to make sure followers commit the entries immediately. // It's only necessary to ping the target peer, but ping all for simplicity. self.raft_group.ping(); + let mut msg = eraftpb::Message::new(); msg.set_to(peer.get_id()); msg.set_msg_type(eraftpb::MessageType::MsgTransferLeader); msg.set_from(self.peer_id()); + msg.set_index(self.get_store().entry_cache_first_index().unwrap_or(0)); // log term here represents the term of last log. For leader, the term of last - // log is always its current term. Not just set term because raft library forbids - // setting it for MsgTransferLeader messages. + // log is always its current term. Not just set term because raft library + // forbids setting it for MsgTransferLeader messages. msg.set_log_term(self.term()); self.raft_group.raft.msgs.push(msg); true @@ -3653,7 +3886,7 @@ where req: RaftCmdRequest, cb: Callback, ) { - ctx.raft_metrics.propose.local_read += 1; + ctx.raft_metrics.propose.local_read.inc(); cb.invoke_read(self.handle_read(ctx, req, false, Some(self.get_store().commit_index()))) } @@ -3690,8 +3923,9 @@ where self.pending_reads.has_unresolved() } - /// `ReadIndex` requests could be lost in network, so on followers commands could queue in - /// `pending_reads` forever. Sending a new `ReadIndex` periodically can resolve this. + /// `ReadIndex` requests could be lost in network, so on followers commands + /// could queue in `pending_reads` forever. Sending a new `ReadIndex` + /// periodically can resolve this. 
pub fn retry_pending_reads(&mut self, cfg: &Config) { if self.is_leader() || !self.pending_reads.check_needs_retry(cfg) @@ -3716,7 +3950,11 @@ where ); } - pub fn push_pending_read(&mut self, read: ReadIndexRequest, is_leader: bool) { + pub fn push_pending_read( + &mut self, + read: ReadIndexRequest>, + is_leader: bool, + ) { self.pending_reads.push_back(read, is_leader); } @@ -3739,58 +3977,42 @@ where "peer_id" => self.peer.get_id(), "err" => ?e, ); - poll_ctx.raft_metrics.propose.unsafe_read_index += 1; + poll_ctx.raft_metrics.propose.unsafe_read_index.inc(); cmd_resp::bind_error(&mut err_resp, e); - cb.invoke_with_response(err_resp); + cb.report_error(err_resp); self.should_wake_up = true; return false; } let now = monotonic_raw_now(); if self.is_leader() { - match self.inspect_lease() { - // Here combine the new read request with the previous one even if the lease expired is - // ok because in this case, the previous read index must be sent out with a valid - // lease instead of a suspect lease. So there must no pending transfer-leader proposals - // before or after the previous read index, and the lease can be renewed when get - // heartbeat responses. - LeaseState::Valid | LeaseState::Expired => { - // Must use the commit index of `PeerStorage` instead of the commit index - // in raft-rs which may be greater than the former one. - // For more details, see the annotations above `on_leader_commit_idx_changed`. 
- let commit_index = self.get_store().commit_index(); - if let Some(read) = self.pending_reads.back_mut() { - let max_lease = poll_ctx.cfg.raft_store_max_leader_lease(); - let is_read_index_request = req - .get_requests() - .get(0) - .map(|req| req.has_read_index()) - .unwrap_or_default(); - // A read index request or a read with addition request always needs the response of - // checking memory lock for async commit, so we cannot apply the optimization here - if !is_read_index_request - && read.addition_request.is_none() - && read.propose_time + max_lease > now - { - // A read request proposed in the current lease is found; combine the new - // read request to that previous one, so that no proposing needed. - read.push_command(req, cb, commit_index); - return false; - } - } + let lease_state = self.inspect_lease(); + if can_amend_read::>( + self.pending_reads.back(), + &req, + lease_state, + poll_ctx.cfg.raft_store_max_leader_lease(), + now, + ) { + // Must use the commit index of `PeerStorage` instead of the commit index + // in raft-rs which may be greater than the former one. + // For more details, see the annotations above `on_leader_commit_idx_changed`. + let commit_index = self.get_store().commit_index(); + if let Some(read) = self.pending_reads.back_mut() { + // A read request proposed in the current lease is found; combine the new + // read request to that previous one, so that no proposing needed. + read.push_command(req, cb, commit_index); + return false; } - // If the current lease is suspect, new read requests can't be appended into - // `pending_reads` because if the leader is transferred, the latest read could - // be dirty. - _ => {} } } - // When a replica cannot detect any leader, `MsgReadIndex` will be dropped, which would - // cause a long time waiting for a read response. Then we should return an error directly - // in this situation. 
if !self.is_leader() && self.leader_id() == INVALID_ID { - poll_ctx.raft_metrics.invalid_proposal.read_index_no_leader += 1; + poll_ctx + .raft_metrics + .invalid_proposal + .read_index_no_leader + .inc(); // The leader may be hibernated, send a message for trying to awaken the leader. if self.bcast_wake_up_time.is_none() || self @@ -3817,11 +4039,11 @@ where } self.should_wake_up = true; cmd_resp::bind_error(&mut err_resp, Error::NotLeader(self.region_id, None)); - cb.invoke_with_response(err_resp); + cb.report_error(err_resp); return false; } - poll_ctx.raft_metrics.propose.read_index += 1; + poll_ctx.raft_metrics.propose.read_index.inc(); self.bcast_wake_up_time = None; let request = req @@ -3833,7 +4055,7 @@ where if dropped && self.is_leader() { // The message gets dropped silently, can't be handled anymore. apply::notify_stale_req(self.term(), cb); - poll_ctx.raft_metrics.propose.dropped_read_index += 1; + poll_ctx.raft_metrics.propose.dropped_read_index.inc(); return false; } @@ -3862,6 +4084,7 @@ where cb: Callback::None, propose_time: Some(now), must_pass_epoch_check: false, + sent: false, }; self.post_propose(poll_ctx, p); } @@ -3877,20 +4100,7 @@ where request: Option<&raft_cmdpb::ReadIndexRequest>, locked: Option<&LockInfo>, ) -> (Uuid, bool) { - let last_pending_read_count = self.raft_group.raft.pending_read_count(); - let last_ready_read_count = self.raft_group.raft.ready_read_count(); - - let id = Uuid::new_v4(); - self.raft_group - .read_index(ReadIndexContext::fields_to_bytes(id, request, locked)); - - let pending_read_count = self.raft_group.raft.pending_read_count(); - let ready_read_count = self.raft_group.raft.ready_read_count(); - ( - id, - pending_read_count == last_pending_read_count - && ready_read_count == last_ready_read_count, - ) + propose_read_index(&mut self.raft_group, request, locked) } /// Returns (minimal matched, minimal committed_index) @@ -3929,8 +4139,9 @@ where "min_matched" => min_m, "min_committed" => min_c, ); - // Reset 
`min_matched` to `min_committed`, since the raft log at `min_committed` is - // known to be committed in all peers, all of the peers should also have replicated it + // Reset `min_matched` to `min_committed`, since the raft log at `min_committed` + // is known to be committed in all peers, all of the peers should also have + // replicated it min_m = min_c; } Ok((min_m, min_c)) @@ -3946,7 +4157,8 @@ where if self.prepare_merge_fence > 0 { let applied_index = self.get_store().applied_index(); if applied_index >= self.prepare_merge_fence { - // Check passed, clear fence and start proposing pessimistic locks and PrepareMerge. + // Check passed, clear fence and start proposing pessimistic locks and + // PrepareMerge. self.prepare_merge_fence = 0; self.pending_prepare_merge = None; passed_merge_fence = true; @@ -3968,12 +4180,14 @@ where || min_committed == 0 || last_index - min_matched > ctx.cfg.merge_max_log_gap || last_index - min_committed > ctx.cfg.merge_max_log_gap * 2 + || min_matched < self.last_sent_snapshot_idx { return Err(box_err!( - "log gap from matched: {} or committed: {} to last index: {} is too large, skip merge", + "log gap too large, skip merge: matched: {}, committed: {}, last index: {}, last_snapshot: {}", min_matched, min_committed, - last_index + last_index, + self.last_sent_snapshot_idx )); } let mut entry_size = 0; @@ -4023,10 +4237,10 @@ where )); }; - // Record current proposed index. If there are some in-memory pessimistic locks, we should - // wait until applying to the proposed index before proposing pessimistic locks and - // PrepareMerge. Otherwise, if an already proposed command will remove a pessimistic lock, - // we will make some deleted locks appear again. + // Record current proposed index. If there are some in-memory pessimistic locks, + // we should wait until applying to the proposed index before proposing + // pessimistic locks and PrepareMerge. 
Otherwise, if an already proposed command + // will remove a pessimistic lock, we will make some deleted locks appear again. if !passed_merge_fence { let pessimistic_locks = self.txn_ext.pessimistic_locks.read(); if !pessimistic_locks.is_empty() { @@ -4072,9 +4286,10 @@ where pessimistic_locks.status = LocksStatus::MergingRegion; return Ok(()); } - // The proposed pessimistic locks here will also be carried in CommitMerge. Check the size - // to avoid CommitMerge exceeding the size limit of a raft entry. This check is a inaccurate - // check. We will check the size again accurately later using the protobuf encoding. + // The proposed pessimistic locks here will also be carried in CommitMerge. + // Check the size to avoid CommitMerge exceeding the size limit of a raft entry. + // This check is a inaccurate check. We will check the size again accurately + // later using the protobuf encoding. if pessimistic_locks.memory_size > size_limit { return Err(box_err!( "pessimistic locks size {} exceed size limit {}, skip merging.", @@ -4148,9 +4363,11 @@ where /// Propose normal request to raft /// - /// Returns Ok(Either::Left(index)) means the proposal is proposed successfully and is located on `index` position. - /// Ok(Either::Right(index)) means the proposal is rejected by `CmdEpochChecker` and the `index` is the position of - /// the last conflict admin cmd. + /// Returns Ok(Either::Left(index)) means the proposal is proposed + /// successfully and is located on `index` position. + /// Ok(Either::Right(index)) means the proposal is rejected by + /// `CmdEpochChecker` and the `index` is the position of the last + /// conflict admin cmd. fn propose_normal( &mut self, poll_ctx: &mut PollContext, @@ -4160,6 +4377,7 @@ where // In `pre_propose_raft_command`, it rejects all the requests expect conf-change // if in force leader state. 
if self.force_leader.is_some() { + poll_ctx.raft_metrics.invalid_proposal.force_leader.inc(); panic!( "{} propose normal in force leader state {:?}", self.tag, self.force_leader @@ -4174,11 +4392,11 @@ where return Err(Error::ProposalInMergingMode(self.region_id)); } - poll_ctx.raft_metrics.propose.normal += 1; + poll_ctx.raft_metrics.propose.normal.inc(); if self.has_applied_to_current_term() { - // Only when applied index's term is equal to current leader's term, the information - // in epoch checker is up to date and can be used to check epoch. + // Only when applied index's term is equal to current leader's term, the + // information in epoch checker is up to date and can be used to check epoch. if let Some(index) = self .cmd_epoch_checker .propose_check_epoch(&req, self.term()) @@ -4186,12 +4404,13 @@ where return Ok(Either::Right(index)); } } else if req.has_admin_request() { - // The admin request is rejected because it may need to update epoch checker which - // introduces an uncertainty and may breaks the correctness of epoch checker. + // The admin request is rejected because it may need to update epoch checker + // which introduces an uncertainty and may breaks the correctness of epoch + // checker. return Err(box_err!( "{} peer has not applied to current term, applied_term {}, current_term {}", self.tag, - self.get_store().applied_index_term(), + self.get_store().applied_term(), self.term() )); } @@ -4200,7 +4419,8 @@ where let ctx = match self.pre_propose(poll_ctx, &mut req) { Ok(ctx) => ctx, Err(e) => { - // Skipping PrepareMerge is logged when the PendingPrepareMerge error is generated. + // Skipping PrepareMerge is logged when the PendingPrepareMerge error is + // generated. 
if !matches!(e, Error::PendingPrepareMerge) { warn!( "skip proposal"; @@ -4215,9 +4435,10 @@ where }; let data = req.write_to_bytes()?; - - // TODO: use local histogram metrics - PEER_PROPOSE_LOG_SIZE_HISTOGRAM.observe(data.len() as f64); + poll_ctx + .raft_metrics + .propose_log_size + .observe(data.len() as f64); if data.len() as u64 > poll_ctx.cfg.raft_entry_max_size.0 { error!( @@ -4232,7 +4453,7 @@ where }); } - self.maybe_inject_propose_error(&req)?; + fail_point!("raft_propose", |_| Ok(Either::Right(0))); let propose_index = self.next_proposal_index(); self.raft_group.propose(ctx.to_vec(), data)?; if self.next_proposal_index() == propose_index { @@ -4272,33 +4493,99 @@ where Ok(Either::Left(propose_index)) } - pub fn execute_transfer_leader( + pub fn maybe_reject_transfer_leader_msg( &mut self, ctx: &mut PollContext, - from: u64, + msg: &eraftpb::Message, peer_disk_usage: DiskUsage, - reply_cmd: bool, // whether it is a reply to a TransferLeader command - ) { + ) -> bool { let pending_snapshot = self.is_handling_snapshot() || self.has_pending_snapshot(); - if pending_snapshot - || from != self.leader_id() + // shouldn't transfer leader to witness peer or non-witness waiting data + if self.is_witness() || self.wait_data + || pending_snapshot + || msg.get_from() != self.leader_id() // Transfer leader to node with disk full will lead to write availablity downback. // But if the current leader is disk full, and send such request, we should allow it, // because it may be a read leader balance request. 
|| (!matches!(ctx.self_disk_usage, DiskUsage::Normal) && - matches!(peer_disk_usage,DiskUsage::Normal)) + matches!(peer_disk_usage, DiskUsage::Normal)) { info!( "reject transferring leader"; "region_id" => self.region_id, "peer_id" => self.peer.get_id(), - "from" => from, + "from" => msg.get_from(), "pending_snapshot" => pending_snapshot, "disk_usage" => ?ctx.self_disk_usage, + "is_witness" => self.is_witness(), + "wait_data" => self.wait_data, ); - return; + return true; } + false + } + /// Before ack the transfer leader message sent by the leader. + /// Currently, it only warms up the entry cache in this stage. + /// + /// This return whether the msg should be acked. When cache is warmed up + /// or the warmup operation is timeout, it is true. + pub fn pre_ack_transfer_leader_msg( + &mut self, + ctx: &mut PollContext, + msg: &eraftpb::Message, + ) -> bool { + if !ctx.cfg.warmup_entry_cache_enabled() { + return true; + } + + // The start index of warmup range. It is leader's entry_cache_first_index, + // which in general is equal to the lowest matched index. + let mut low = msg.get_index(); + let last_index = self.get_store().last_index(); + let mut should_ack_now = false; + + // Need not to warm up when the index is 0. + // There are two cases where index can be 0: + // 1. During rolling upgrade, old instances may not support warmup. + // 2. The leader's entry cache is empty. + if low == 0 || low > last_index { + // There is little possibility that the warmup_range_start + // is larger than the last index. Check the test case + // `test_when_warmup_range_start_is_larger_than_last_index` + // for details. + should_ack_now = true; + } else { + if low < self.last_compacted_idx { + low = self.last_compacted_idx + }; + // Check if the entry cache is already warmed up. 
+ if let Some(first_index) = self.get_store().entry_cache_first_index() { + if low >= first_index { + fail_point!("entry_cache_already_warmed_up"); + should_ack_now = true; + } + } + } + + if should_ack_now { + return true; + } + + // Check if the warmup operation is timeout if warmup is already started. + if let Some(state) = self.mut_store().entry_cache_warmup_state_mut() { + // If it is timeout, this peer should ack the message so that + // the leadership transfer process can continue. + state.check_task_timeout(ctx.cfg.max_entry_cache_warmup_duration.0) + } else { + self.mut_store().async_warm_up_entry_cache(low).is_none() + } + } + + pub fn ack_transfer_leader_msg( + &mut self, + reply_cmd: bool, // whether it is a reply to a TransferLeader command + ) { let mut msg = eraftpb::Message::new(); msg.set_from(self.peer_id()); msg.set_to(self.leader_id()); @@ -4311,7 +4598,7 @@ where self.raft_group.raft.msgs.push(msg); } - /// Return true to if the transfer leader request is accepted. + /// Return true if the transfer leader request is accepted. /// /// When transferring leadership begins, leader sends a pre-transfer /// to target follower first to ensures it's ready to become leader. @@ -4319,10 +4606,23 @@ where /// /// 1. pre_transfer_leader on leader: /// Leader will send a MsgTransferLeader to follower. - /// 2. execute_transfer_leader on follower - /// If follower passes all necessary checks, it will reply an - /// ACK with type MsgTransferLeader and its promised persistent index. - /// 3. ready_to_transfer_leader on leader: + /// 2. pre_ack_transfer_leader_msg on follower: + /// If follower passes all necessary checks, it will try to warmup + /// the entry cache. + /// 3. ack_transfer_leader_msg on follower: + /// When the entry cache has been warmed up or the operator is timeout, + /// the follower reply an ACK with type MsgTransferLeader and + /// its promised persistent index. 
+ /// + /// Additional steps when there are remaining pessimistic + /// locks to propose (detected in function on_transfer_leader_msg). + /// 1. Leader firstly proposes pessimistic locks and then proposes a + /// TransferLeader command. + /// 2. ack_transfer_leader_msg on follower again: + /// The follower applies the TransferLeader command and replies an + /// ACK with special context TRANSFER_LEADER_COMMAND_REPLY_CTX. + /// + /// 4. ready_to_transfer_leader on leader: /// Leader checks if it's appropriate to transfer leadership. If it /// does, it calls raft transfer_leader API to do the remaining work. /// @@ -4333,7 +4633,7 @@ where req: RaftCmdRequest, cb: Callback, ) -> bool { - ctx.raft_metrics.propose.transfer_leader += 1; + ctx.raft_metrics.propose.transfer_leader.inc(); let transfer_leader = get_transfer_leader_cmd(&req).unwrap(); let prs = self.raft_group.raft.prs(); @@ -4369,7 +4669,8 @@ where }; // transfer leader command doesn't need to replicate log and apply, so we - // return immediately. Note that this command may fail, we can view it just as an advice + // return immediately. Note that this command may fail, we can view it just as + // an advice cb.invoke_with_response(make_transfer_leader_response()); transferred @@ -4380,9 +4681,10 @@ where // 2. Removing the leader is not allowed in the configuration; // 3. The conf change makes the raft group not healthy; // 4. The conf change is dropped by raft group internally. - /// Returns Ok(Either::Left(index)) means the proposal is proposed successfully and is located on `index` position. - /// Ok(Either::Right(index)) means the proposal is rejected by `CmdEpochChecker` and the `index` is the position of - /// the last conflict admin cmd. + /// Returns Ok(Either::Left(index)) means the proposal is proposed + /// successfully and is located on `index` position. 
Ok(Either:: + /// Right(index)) means the proposal is rejected by `CmdEpochChecker` and + /// the `index` is the position of the last conflict admin cmd. fn propose_conf_change( &mut self, ctx: &mut PollContext, @@ -4402,14 +4704,15 @@ where self.tag )); } - // Actually, according to the implementation of conf change in raft-rs, this check must be - // passed if the previous check that `pending_conf_index` should be less than or equal to - // `self.get_store().applied_index()` is passed. - if self.get_store().applied_index_term() != self.term() { + // Actually, according to the implementation of conf change in raft-rs, this + // check must be passed if the previous check that `pending_conf_index` + // should be less than or equal to `self.get_store().applied_index()` is + // passed. + if self.get_store().applied_term() != self.term() { return Err(box_err!( "{} peer has not applied to current term, applied_term {}, current_term {}", self.tag, - self.get_store().applied_index_term(), + self.get_store().applied_term(), self.term() )); } @@ -4447,11 +4750,20 @@ where let cc = change_peer.to_confchange(data); let changes = change_peer.get_change_peers(); - self.check_conf_change(ctx, changes.as_ref(), &cc)?; + // Because the group is always woken up when there is log gap, so no need + // to wake it up again when command is aborted by log gap. 
+ util::check_conf_change( + &ctx.cfg, + &self.raft_group, + self.region(), + &self.peer, + changes.as_ref(), + &cc, + self.is_force_leader(), + )?; - ctx.raft_metrics.propose.conf_change += 1; - // TODO: use local histogram metrics - PEER_PROPOSE_LOG_SIZE_HISTOGRAM.observe(data_size as f64); + ctx.raft_metrics.propose.conf_change.inc(); + ctx.raft_metrics.propose_log_size.observe(data_size as f64); info!( "propose conf change peer"; "region_id" => self.region_id, @@ -4472,16 +4784,16 @@ where Ok(propose_index) } - fn handle_read( + fn handle_read>( &self, - ctx: &mut PollContext, + reader: &mut E, req: RaftCmdRequest, check_epoch: bool, read_index: Option, ) -> ReadResponse { let region = self.region().clone(); if check_epoch { - if let Err(e) = check_region_epoch(&req, ®ion, true) { + if let Err(e) = check_req_region_epoch(&req, ®ion, true) { debug!("epoch not match"; "region_id" => region.get_id(), "err" => ?e); let mut response = cmd_resp::new_error(e); cmd_resp::bind_term(&mut response, self.term()); @@ -4497,11 +4809,16 @@ where let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); let safe_ts = self.read_progress.safe_ts(); if safe_ts < read_ts { + // Advancing resolved ts may be expensive, only notify if read_ts - safe_ts > + // 200ms. 
+ if TimeStamp::from(read_ts).physical() > TimeStamp::from(safe_ts).physical() + 200 { + self.read_progress.notify_advance_resolved_ts(); + } warn!( "read rejected by safe timestamp"; "safe ts" => safe_ts, "read ts" => read_ts, - "tag" => &self.tag + "tag" => &self.tag, ); let mut response = cmd_resp::new_error(Error::DataIsNotReady { region_id: region.get_id(), @@ -4517,7 +4834,7 @@ where } } - let mut resp = ctx.execute(&req, &Arc::new(region), read_index, None); + let mut resp = reader.execute(&req, &Arc::new(region), read_index, None); if let Some(snap) = resp.snapshot.as_mut() { snap.txn_ext = Some(self.txn_ext.clone()); snap.bucket_meta = self.region_buckets.as_ref().map(|b| b.meta.clone()); @@ -4545,7 +4862,7 @@ where return; } if let Some(ref state) = self.pending_merge_state { - if state.get_commit() == extra_msg.get_premerge_commit() { + if state.get_commit() == extra_msg.get_index() { self.add_want_rollback_merge_peer(peer_id); } } @@ -4586,7 +4903,8 @@ where normal_peers.insert(peer_id); } if let Some(pr) = self.raft_group.raft.prs().get(peer_id) { - // status 3-normal, 2-almostfull, 1-alreadyfull, only for simplying the sort func belowing. + // status 3-normal, 2-almostfull, 1-alreadyfull, only for simplying the sort + // func belowing. let mut status = 3; if let Some(usg) = usage { status = match usg { @@ -4621,7 +4939,8 @@ where return; } - // Reverse sort peers based on `next_idx`, `usage` and `store healthy status`, then try to get a potential quorum. + // Reverse sort peers based on `next_idx`, `usage` and `store healthy status`, + // then try to get a potential quorum. next_idxs.sort_by(|x, y| { if x.3 == y.3 { y.1.cmp(&x.1) @@ -4677,8 +4996,8 @@ where self.dangerous_majority_set = has_dangurous_set; - // For the Peer with AlreadFull in potential quorum set, we still need to send logs to it. - // To support incoming configure change. + // For the Peer with AlreadFull in potential quorum set, we still need to send + // logs to it. 
To support incoming configure change. if quorum_ok { for peer in potential_quorum { if let Some(x) = self.disk_full_peers.peers.get_mut(&peer) { @@ -4699,54 +5018,74 @@ where // Check disk usages for the peer itself and other peers in the raft group. // The return value indicates whether the proposal is allowed or not. - fn check_proposal_normal_with_disk_usage( + fn check_normal_proposal_with_disk_full_opt( &mut self, ctx: &mut PollContext, disk_full_opt: DiskFullOpt, - disk_full_stores: &mut Vec, - maybe_transfer_leader: &mut bool, - ) -> bool { - // check self disk status. - let allowed = match ctx.self_disk_usage { + ) -> Result<()> { + let leader_allowed = match ctx.self_disk_usage { DiskUsage::Normal => true, DiskUsage::AlmostFull => !matches!(disk_full_opt, DiskFullOpt::NotAllowedOnFull), DiskUsage::AlreadyFull => false, }; - - if !allowed { + let mut disk_full_stores = Vec::new(); + if !leader_allowed { disk_full_stores.push(ctx.store.id); - *maybe_transfer_leader = true; - return false; - } - - // If all followers diskusage normal, then allowed. - if self.disk_full_peers.is_empty() { - return true; - } - - for peer in self.get_store().region().get_peers() { - let (peer_id, store_id) = (peer.get_id(), peer.get_store_id()); - if self.disk_full_peers.peers.get(&peer_id).is_some() { - disk_full_stores.push(store_id); + // Try to transfer leader to a node with disk usage normal to maintain write + // availability. If majority node is disk full, to transfer leader or not is not + // necessary. Note: Need to exclude learner node. 
+ if !self.disk_full_peers.majority { + let target_peer = self + .get_store() + .region() + .get_peers() + .iter() + .find(|x| { + !self.disk_full_peers.has(x.get_id()) + && x.get_id() != self.peer.get_id() + && !self.down_peer_ids.contains(&x.get_id()) + && !matches!(x.get_role(), PeerRole::Learner) + }) + .cloned(); + if let Some(p) = target_peer { + debug!( + "try to transfer leader because of current leader disk full"; + "region_id" => self.region_id, + "peer_id" => self.peer.get_id(), + "target_peer_id" => p.get_id(), + ); + self.pre_transfer_leader(&p); + } + } + } else { + // Check followers. + if self.disk_full_peers.is_empty() { + return Ok(()); + } + if !self.dangerous_majority_set { + if !self.disk_full_peers.majority { + return Ok(()); + } + // Majority peers are in disk full status but the request carries a special + // flag. + if matches!(disk_full_opt, DiskFullOpt::AllowedOnAlmostFull) + && self.disk_full_peers.peers.values().any(|x| x.1) + { + return Ok(()); + } + } + for peer in self.get_store().region().get_peers() { + let (peer_id, store_id) = (peer.get_id(), peer.get_store_id()); + if self.disk_full_peers.peers.get(&peer_id).is_some() { + disk_full_stores.push(store_id); + } } } - - // if there are some peers with disk already full status in the majority set, should not allowed. - if self.dangerous_majority_set { - return false; - } - - if !self.disk_full_peers.majority { - return true; - } - - if matches!(disk_full_opt, DiskFullOpt::AllowedOnAlmostFull) - && self.disk_full_peers.peers.values().any(|x| x.1) - { - // Majority peers are in disk full status but the request carries a special flag. - return true; - } - false + let errmsg = format!( + "propose failed: tikv disk full, cmd diskFullOpt={:?}, leader diskUsage={:?}", + disk_full_opt, ctx.self_disk_usage + ); + Err(Error::DiskFull(disk_full_stores, errmsg)) } /// Check if the command will be likely to pass all the check and propose. 
@@ -4764,7 +5103,8 @@ where } pub fn maybe_gen_approximate_buckets(&self, ctx: &PollContext) { - if ctx.coprocessor_host.cfg.enable_region_bucket && !self.region().get_peers().is_empty() { + if ctx.coprocessor_host.cfg.enable_region_bucket() && !self.region().get_peers().is_empty() + { if let Err(e) = ctx .split_check_scheduler .schedule(SplitCheckTask::ApproximateBuckets(self.region().clone())) @@ -4806,6 +5146,30 @@ where } } } + + pub fn snapshot_recovery_maybe_finish_wait_apply(&mut self, force: bool) { + if let Some(SnapshotRecoveryState::WaitLogApplyToLast { target_index, .. }) = + &self.snapshot_recovery_state + { + if self.raft_group.raft.term != self.raft_group.raft.raft_log.last_term() { + return; + } + + if self.raft_group.raft.raft_log.applied >= *target_index + || force + || self.pending_remove + { + info!("snapshot recovery wait apply finished"; + "region_id" => self.region().get_id(), + "peer_id" => self.peer_id(), + "target_index" => target_index, + "applied" => self.raft_group.raft.raft_log.applied, + "force" => force, + ); + self.snapshot_recovery_state = None; + } + } + } } #[derive(Default, Debug)] @@ -4876,7 +5240,7 @@ where let res = self.raft_group.raft.check_group_commit_consistent(); if Some(true) != res { let mut buffer: SmallVec<[(u64, u64, u64); 5]> = SmallVec::new(); - if self.get_store().applied_index_term() >= self.term() { + if self.get_store().applied_term() >= self.term() { let progress = self.raft_group.raft.prs(); for (id, p) in progress.iter() { if !progress.conf().voters().contains(*id) { @@ -4922,6 +5286,7 @@ where approximate_size: self.approximate_size, approximate_keys: self.approximate_keys, replication_status: self.region_replication_status(), + wait_data_peers: self.wait_data_peers.clone(), }); if let Err(e) = ctx.pd_scheduler.schedule(task) { error!( @@ -5007,13 +5372,14 @@ where } // There could be two cases: - // 1. Target peer already exists but has not established communication with leader yet - // 2. 
Target peer is added newly due to member change or region split, but it's not - // created yet - // For both cases the region start key and end key are attached in RequestVote and - // Heartbeat message for the store of that peer to check whether to create a new peer - // when receiving these messages, or just to wait for a pending region split to perform - // later. + // - Target peer already exists but has not established communication with + // leader yet + // - Target peer is added newly due to member change or region split, but it's + // not created yet + // For both cases the region start key and end key are attached in RequestVote + // and Heartbeat message for the store of that peer to check whether to create a + // new peer when receiving these messages, or just to wait for a pending region + // split to perform later. if self.get_store().is_initialized() && is_initial_msg(&msg) { let region = self.region(); send_msg.set_start_key(region.get_start_key().to_vec()); @@ -5092,7 +5458,7 @@ where }; let mut extra_msg = ExtraMessage::default(); extra_msg.set_type(ExtraMessageType::MsgWantRollbackMerge); - extra_msg.set_premerge_commit(premerge_commit); + extra_msg.set_index(premerge_commit); self.send_extra_message(extra_msg, &mut ctx.trans, &to_peer); } @@ -5181,41 +5547,8 @@ where self.raft_group.raft.r.max_msg_size = ctx.cfg.raft_max_size_per_msg.0; } - fn maybe_inject_propose_error( - &self, - #[allow(unused_variables)] req: &RaftCmdRequest, - ) -> Result<()> { - // The return value format is {req_type}:{store_id} - // Request matching the format will fail to be proposed. - // Empty `req_type` means matching all kinds of requests. - // ":{store_id}" can be omitted, meaning matching all stores. 
- fail_point!("raft_propose", |r| { - r.map_or(Ok(()), |s| { - let mut parts = s.splitn(2, ':'); - let cmd_type = parts.next().unwrap(); - let store_id = parts.next().map(|s| s.parse::().unwrap()); - if let Some(store_id) = store_id { - if store_id != self.peer.get_store_id() { - return Ok(()); - } - } - let admin_type = req.get_admin_request().get_cmd_type(); - let match_type = cmd_type.is_empty() - || (cmd_type == "prepare_merge" && admin_type == AdminCmdType::PrepareMerge) - || (cmd_type == "transfer_leader" - && admin_type == AdminCmdType::TransferLeader); - // More matching rules can be added here. - if match_type { - Err(box_err!("injected error")) - } else { - Ok(()) - } - }) - }); - Ok(()) - } - - /// Update states of the peer which can be changed in the previous raft tick. + /// Update states of the peer which can be changed in the previous raft + /// tick. pub fn post_raft_group_tick(&mut self) { self.lead_transferee = self.raft_group.raft.lead_transferee.unwrap_or_default(); } @@ -5282,6 +5615,10 @@ pub trait RequestInspector { return Ok(RequestPolicy::ProposeNormal); } + fail_point!("perform_read_index", |_| Ok(RequestPolicy::ReadIndex)); + + fail_point!("perform_read_local", |_| Ok(RequestPolicy::ReadLocal)); + let flags = WriteBatchFlags::from_bits_check(req.get_header().get_flags()); if flags.contains(WriteBatchFlags::STALE_READ) { return Ok(RequestPolicy::StaleRead); @@ -5291,8 +5628,8 @@ pub trait RequestInspector { return Ok(RequestPolicy::ReadIndex); } - // If applied index's term is differ from current raft's term, leader transfer - // must happened, if read locally, we may read old value. + // If applied index's term differs from current raft's term, leader + // transfer must happened, if read locally, we may read old value. 
if !self.has_applied_to_current_term() { return Ok(RequestPolicy::ReadIndex); } @@ -5315,7 +5652,7 @@ where ER: RaftEngine, { fn has_applied_to_current_term(&mut self) -> bool { - self.get_store().applied_index_term() == self.term() + self.get_store().applied_term() == self.term() } fn inspect_lease(&mut self) -> LeaseState { @@ -5338,16 +5675,18 @@ where } } -impl ReadExecutor for PollContext +impl ReadExecutor for PollContext where EK: KvEngine, ER: RaftEngine, { - fn get_engine(&self) -> &EK { + type Tablet = EK; + + fn get_tablet(&mut self) -> &EK { &self.engines.kv } - fn get_snapshot(&mut self, _: Option) -> Arc { + fn get_snapshot(&mut self, _: &Option>) -> Arc { Arc::new(self.engines.kv.snapshot()) } } @@ -5364,7 +5703,7 @@ fn get_transfer_leader_cmd(msg: &RaftCmdRequest) -> Option<&TransferLeaderReques Some(req.get_transfer_leader()) } -fn get_sync_log_from_request(msg: &RaftCmdRequest) -> bool { +pub fn get_sync_log_from_request(msg: &RaftCmdRequest) -> bool { if msg.has_admin_request() { let req = msg.get_admin_request(); return matches!( @@ -5376,6 +5715,8 @@ fn get_sync_log_from_request(msg: &RaftCmdRequest) -> bool { | AdminCmdType::PrepareMerge | AdminCmdType::CommitMerge | AdminCmdType::RollbackMerge + | AdminCmdType::PrepareFlashback + | AdminCmdType::FinishFlashback ); } @@ -5402,10 +5743,11 @@ fn is_request_urgent(req: &RaftCmdRequest) -> bool { | AdminCmdType::PrepareMerge | AdminCmdType::CommitMerge | AdminCmdType::RollbackMerge + | AdminCmdType::BatchSwitchWitness ) } -fn make_transfer_leader_response() -> RaftCmdResponse { +pub fn make_transfer_leader_response() -> RaftCmdResponse { let mut response = AdminResponse::default(); response.set_cmd_type(AdminCmdType::TransferLeader); response.set_transfer_leader(TransferLeaderResponse::default()); @@ -5414,20 +5756,10 @@ fn make_transfer_leader_response() -> RaftCmdResponse { resp } -// The Raft message context for a MsgTransferLeader if it is a reply of a TransferLeader command. 
+// The Raft message context for a MsgTransferLeader if it is a reply of a +// TransferLeader command. pub const TRANSFER_LEADER_COMMAND_REPLY_CTX: &[u8] = &[1]; -/// A poor version of `Peer` to avoid port generic variables everywhere. -pub trait AbstractPeer { - fn meta_peer(&self) -> &metapb::Peer; - fn group_state(&self) -> GroupState; - fn region(&self) -> &metapb::Region; - fn apply_state(&self) -> &RaftApplyState; - fn raft_status(&self) -> raft::Status<'_>; - fn raft_commit_index(&self) -> u64; - fn pending_merge_state(&self) -> Option<&MergeState>; -} - mod memtrace { use std::mem; @@ -5483,6 +5815,8 @@ mod tests { AdminCmdType::TransferLeader, AdminCmdType::ComputeHash, AdminCmdType::VerifyHash, + AdminCmdType::BatchSwitchWitness, + AdminCmdType::UpdateGcPeer, ]; for tp in AdminCmdType::values() { let mut msg = RaftCmdRequest::default(); @@ -5508,6 +5842,7 @@ mod tests { AdminCmdType::PrepareMerge, AdminCmdType::CommitMerge, AdminCmdType::RollbackMerge, + AdminCmdType::BatchSwitchWitness, ]; for tp in AdminCmdType::values() { let mut req = RaftCmdRequest::default(); @@ -5665,14 +6000,13 @@ mod tests { applied_to_index_term: true, lease_state: LeaseState::Valid, }; - assert!(inspector.inspect(&req).is_err()); + inspector.inspect(&req).unwrap_err(); } } #[test] fn test_propose_queue_find_proposal() { - let mut pq: ProposalQueue = - ProposalQueue::new("tag".to_owned()); + let mut pq: ProposalQueue> = ProposalQueue::new(1, 2); let gen_term = |index: u64| (index / 10) + 1; let push_proposal = |pq: &mut ProposalQueue<_>, index: u64| { pq.push(Proposal { @@ -5682,6 +6016,7 @@ mod tests { cb: Callback::write(Box::new(|_| {})), propose_time: Some(u64_to_timespec(index)), must_pass_epoch_check: false, + sent: false, }); }; for index in 1..=100 { @@ -5734,8 +6069,7 @@ mod tests { fn must_not_call() -> ExtCallback { Box::new(move || unreachable!()) } - let mut pq: ProposalQueue = - ProposalQueue::new("tag".to_owned()); + let mut pq: ProposalQueue> = 
ProposalQueue::new(1, 2); // (1, 4) and (1, 5) is not committed let entries = vec![(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (2, 6), (2, 7)]; @@ -5756,6 +6090,7 @@ mod tests { is_conf_change: false, propose_time: None, must_pass_epoch_check: false, + sent: false, }); } for (index, term) in entries { diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index a6208b09f9e..d89eafc3a46 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -2,23 +2,18 @@ // #[PerformanceCriticalPath] use std::{ - cell::{Cell, RefCell}, - cmp, - collections::VecDeque, - error, mem, - ops::Range, + cell::RefCell, + error, + ops::{Deref, DerefMut}, sync::{ atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}, mpsc::{self, Receiver, TryRecvError}, - Arc, Mutex, + Arc, }, u64, }; -use collections::HashMap; -use engine_traits::{ - Engines, KvEngine, Mutable, Peekable, RaftEngine, RaftLogBatch, CF_RAFT, RAFT_LOG_MULTI_GET_CNT, -}; +use engine_traits::{Engines, KvEngine, Mutable, Peekable, RaftEngine, RaftLogBatch, CF_RAFT}; use fail::fail_point; use into_other::into_other; use keys::{self, enc_end_key, enc_start_key}; @@ -32,20 +27,24 @@ use protobuf::Message; use raft::{ self, eraftpb::{self, ConfState, Entry, HardState, Snapshot}, - util::limit_size, Error as RaftError, GetEntriesContext, RaftState, Ready, Storage, StorageError, }; -use tikv_alloc::trace::TraceEvent; use tikv_util::{ - box_err, box_try, debug, defer, error, info, time::Instant, warn, worker::Scheduler, + box_err, box_try, debug, defer, error, info, + store::find_peer_by_id, + time::{Instant, UnixSecs}, + warn, + worker::Scheduler, }; -use super::{metrics::*, worker::RegionTask, SnapEntry, SnapKey, SnapManager, SnapshotStatistics}; +use super::{metrics::*, worker::RegionTask, SnapEntry, SnapKey, SnapManager}; use crate::{ - bytes_capacity, store::{ - async_io::write::WriteTask, fsm::GenSnapTask, memory::*, 
peer::PersistSnapshotResult, util, - worker::RaftlogFetchTask, + async_io::{read::ReadTask, write::WriteTask}, + entry_storage::EntryStorage, + fsm::GenSnapTask, + peer::PersistSnapshotResult, + util, }, Error, Result, }; @@ -55,17 +54,12 @@ use crate::{ pub const RAFT_INIT_LOG_TERM: u64 = 5; pub const RAFT_INIT_LOG_INDEX: u64 = 5; const MAX_SNAP_TRY_CNT: usize = 5; -const MAX_ASYNC_FETCH_TRY_CNT: usize = 3; - -pub const MAX_INIT_ENTRY_COUNT: usize = 1024; /// The initial region epoch version. pub const INIT_EPOCH_VER: u64 = 1; /// The initial region epoch conf_version. pub const INIT_EPOCH_CONF_VER: u64 = 1; -const SHRINK_CACHE_CAPACITY: usize = 64; - pub const JOB_STATUS_PENDING: usize = 0; pub const JOB_STATUS_RUNNING: usize = 1; pub const JOB_STATUS_CANCELLING: usize = 2; @@ -73,8 +67,6 @@ pub const JOB_STATUS_CANCELLED: usize = 3; pub const JOB_STATUS_FINISHED: usize = 4; pub const JOB_STATUS_FAILED: usize = 5; -const ENTRY_MEM_SIZE: usize = mem::size_of::(); - /// Possible status returned by `check_applying_snap`. #[derive(Debug, Clone, Copy, PartialEq)] pub enum CheckApplyingSnapStatus { @@ -112,312 +104,7 @@ impl PartialEq for SnapState { } } -#[inline] -pub fn first_index(state: &RaftApplyState) -> u64 { - state.get_truncated_state().get_index() + 1 -} - -#[inline] -pub fn last_index(state: &RaftLocalState) -> u64 { - state.get_last_index() -} - -struct EntryCache { - // The last index of persisted entry. - // It should be equal to `RaftLog::persisted`. 
- persisted: u64, - cache: VecDeque, - trace: VecDeque, - hit: Cell, - miss: Cell, - #[cfg(test)] - size_change_cb: Option>, -} - -impl EntryCache { - fn first_index(&self) -> Option { - self.cache.front().map(|e| e.get_index()) - } - - fn fetch_entries_to( - &self, - begin: u64, - end: u64, - mut fetched_size: u64, - max_size: u64, - ents: &mut Vec, - ) { - if begin >= end { - return; - } - assert!(!self.cache.is_empty()); - let cache_low = self.cache.front().unwrap().get_index(); - let start_idx = begin.checked_sub(cache_low).unwrap() as usize; - let limit_idx = end.checked_sub(cache_low).unwrap() as usize; - - let mut end_idx = start_idx; - self.cache - .iter() - .skip(start_idx) - .take_while(|e| { - let cur_idx = end_idx as u64 + cache_low; - assert_eq!(e.get_index(), cur_idx); - let m = u64::from(e.compute_size()); - fetched_size += m; - if fetched_size == m { - end_idx += 1; - fetched_size <= max_size && end_idx < limit_idx - } else if fetched_size <= max_size { - end_idx += 1; - end_idx < limit_idx - } else { - false - } - }) - .count(); - // Cache either is empty or contains latest log. Hence we don't need to fetch log - // from rocksdb anymore. 
- assert!(end_idx == limit_idx || fetched_size > max_size); - let (first, second) = tikv_util::slices_in_range(&self.cache, start_idx, end_idx); - ents.extend_from_slice(first); - ents.extend_from_slice(second); - } - - fn append(&mut self, tag: &str, entries: &[Entry]) { - if !entries.is_empty() { - let mut mem_size_change = 0; - let old_capacity = self.cache.capacity(); - mem_size_change += self.append_impl(tag, entries); - let new_capacity = self.cache.capacity(); - mem_size_change += Self::get_cache_vec_mem_size_change(new_capacity, old_capacity); - mem_size_change += self.shrink_if_necessary(); - self.flush_mem_size_change(mem_size_change); - } - } - - fn append_impl(&mut self, tag: &str, entries: &[Entry]) -> i64 { - let mut mem_size_change = 0; - - if let Some(cache_last_index) = self.cache.back().map(|e| e.get_index()) { - let first_index = entries[0].get_index(); - if cache_last_index >= first_index { - let cache_len = self.cache.len(); - let truncate_to = cache_len - .checked_sub((cache_last_index - first_index + 1) as usize) - .unwrap_or_default(); - let trunc_to_idx = self.cache[truncate_to].index; - for e in self.cache.drain(truncate_to..) { - mem_size_change -= - (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64; - } - if let Some(cached) = self.trace.back() { - // Only committed entries can be traced, and only uncommitted entries - // can be truncated. So there won't be any overlaps. - let cached_last = cached.range.end - 1; - assert!(cached_last < trunc_to_idx); - } - } else if cache_last_index + 1 < first_index { - panic!( - "{} unexpected hole: {} < {}", - tag, cache_last_index, first_index - ); - } - } - - for e in entries { - self.cache.push_back(e.to_owned()); - mem_size_change += (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64; - } - // In the past, the entry cache will be truncated if its size exceeds a certain number. 
- // However, after introducing async write io, the entry must stay in cache if it's not - // persisted to raft db because the raft-rs may need to read entries.(e.g. leader sends - // MsgAppend to followers) - - mem_size_change - } - - pub fn entry(&self, idx: u64) -> Option<&Entry> { - let cache_low = self.cache.front()?.get_index(); - if idx >= cache_low { - Some(&self.cache[(idx - cache_low) as usize]) - } else { - None - } - } - - /// Compact all entries whose indexes are less than `idx`. - pub fn compact_to(&mut self, mut idx: u64) -> u64 { - if idx > self.persisted + 1 { - // Only the persisted entries can be compacted - idx = self.persisted + 1; - } - - let mut mem_size_change = 0; - - // Clean cached entries which have been already sent to apply threads. For example, - // if entries [1, 10), [10, 20), [20, 30) are sent to apply threads and `compact_to(15)` - // is called, only [20, 30) will still be kept in cache. - let old_trace_cap = self.trace.capacity(); - while let Some(cached_entries) = self.trace.pop_front() { - if cached_entries.range.start >= idx { - self.trace.push_front(cached_entries); - let trace_len = self.trace.len(); - let trace_cap = self.trace.capacity(); - if trace_len < SHRINK_CACHE_CAPACITY && trace_cap > SHRINK_CACHE_CAPACITY { - self.trace.shrink_to(SHRINK_CACHE_CAPACITY); - } - break; - } - let (_, dangle_size) = cached_entries.take_entries(); - mem_size_change -= dangle_size as i64; - idx = cmp::max(cached_entries.range.end, idx); - } - let new_trace_cap = self.trace.capacity(); - mem_size_change += Self::get_trace_vec_mem_size_change(new_trace_cap, old_trace_cap); - - let cache_first_idx = self.first_index().unwrap_or(u64::MAX); - if cache_first_idx >= idx { - self.flush_mem_size_change(mem_size_change); - assert!(mem_size_change <= 0); - return -mem_size_change as u64; - } - - let cache_last_idx = self.cache.back().unwrap().get_index(); - // Use `cache_last_idx + 1` to make sure cache can be cleared completely if necessary. 
- let compact_to = (cmp::min(cache_last_idx + 1, idx) - cache_first_idx) as usize; - for e in self.cache.drain(..compact_to) { - mem_size_change -= (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64 - } - - mem_size_change += self.shrink_if_necessary(); - self.flush_mem_size_change(mem_size_change); - assert!(mem_size_change <= 0); - -mem_size_change as u64 - } - - fn get_total_mem_size(&self) -> i64 { - let data_size: i64 = self - .cache - .iter() - .map(|e| (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64) - .sum(); - let cache_vec_size = Self::get_cache_vec_mem_size_change(self.cache.capacity(), 0); - let trace_vec_size = Self::get_trace_vec_mem_size_change(self.trace.capacity(), 0); - data_size + cache_vec_size + trace_vec_size - } - - fn get_cache_vec_mem_size_change(new_capacity: usize, old_capacity: usize) -> i64 { - ENTRY_MEM_SIZE as i64 * (new_capacity as i64 - old_capacity as i64) - } - - fn get_trace_vec_mem_size_change(new_capacity: usize, old_capacity: usize) -> i64 { - mem::size_of::() as i64 * (new_capacity as i64 - old_capacity as i64) - } - - fn flush_mem_size_change(&self, mem_size_change: i64) { - #[cfg(test)] - if let Some(size_change_cb) = self.size_change_cb.as_ref() { - size_change_cb(mem_size_change); - } - let event = if mem_size_change > 0 { - TraceEvent::Add(mem_size_change as usize) - } else { - TraceEvent::Sub(-mem_size_change as usize) - }; - MEMTRACE_ENTRY_CACHE.trace(event); - RAFT_ENTRIES_CACHES_GAUGE.add(mem_size_change); - } - - fn flush_stats(&self) { - let hit = self.hit.replace(0); - RAFT_ENTRY_FETCHES.hit.inc_by(hit); - let miss = self.miss.replace(0); - RAFT_ENTRY_FETCHES.miss.inc_by(miss); - } - - #[inline] - fn is_empty(&self) -> bool { - self.cache.is_empty() - } - - fn trace_cached_entries(&mut self, entries: CachedEntries) { - let dangle_size = { - let mut guard = entries.entries.lock().unwrap(); - - let last_idx = guard.0.last().map(|e| e.index).unwrap(); - let cache_front = match 
self.cache.front().map(|e| e.index) { - Some(i) => i, - None => u64::MAX, - }; - - let dangle_range = if last_idx < cache_front { - // All entries are not in entry cache. - 0..guard.0.len() - } else if let Ok(i) = guard.0.binary_search_by(|e| e.index.cmp(&cache_front)) { - // Some entries are in entry cache. - 0..i - } else { - // All entries are in entry cache. - 0..0 - }; - - let mut size = 0; - for e in &guard.0[dangle_range] { - size += bytes_capacity(&e.data) + bytes_capacity(&e.context); - } - guard.1 = size; - size - }; - - let old_capacity = self.trace.capacity(); - self.trace.push_back(entries); - let new_capacity = self.trace.capacity(); - let diff = Self::get_trace_vec_mem_size_change(new_capacity, old_capacity); - - self.flush_mem_size_change(diff + dangle_size as i64); - } - - fn shrink_if_necessary(&mut self) -> i64 { - if self.cache.len() < SHRINK_CACHE_CAPACITY && self.cache.capacity() > SHRINK_CACHE_CAPACITY - { - let old_capacity = self.cache.capacity(); - self.cache.shrink_to_fit(); - let new_capacity = self.cache.capacity(); - return Self::get_cache_vec_mem_size_change(new_capacity, old_capacity); - } - 0 - } - - fn update_persisted(&mut self, persisted: u64) { - self.persisted = persisted; - } -} - -impl Default for EntryCache { - fn default() -> Self { - let entry_cache = EntryCache { - persisted: 0, - cache: Default::default(), - trace: Default::default(), - hit: Cell::new(0), - miss: Cell::new(0), - #[cfg(test)] - size_change_cb: None, - }; - entry_cache.flush_mem_size_change(entry_cache.get_total_mem_size()); - entry_cache - } -} - -impl Drop for EntryCache { - fn drop(&mut self) { - let mem_size_change = self.get_total_mem_size(); - self.flush_mem_size_change(-mem_size_change); - self.flush_stats(); - } -} - -fn storage_error(error: E) -> raft::Error +pub fn storage_error(error: E) -> raft::Error where E: Into>, { @@ -430,18 +117,22 @@ impl From for RaftError { } } +#[derive(PartialEq, Debug)] +pub struct HandleSnapshotResult { + pub msgs: 
Vec, + pub snap_region: metapb::Region, + /// The regions whose range are overlapped with this region + pub destroy_regions: Vec, + /// The first index before applying the snapshot. + pub last_first_index: u64, + pub for_witness: bool, +} + #[derive(PartialEq, Debug)] pub enum HandleReadyResult { - SendIOTask, - Snapshot { - msgs: Vec, - snap_region: metapb::Region, - /// The regions whose range are overlapped with this region - destroy_regions: Vec, - /// The first index before applying the snapshot. - last_first_index: u64, - }, - NoIOTask, + SendIoTask, + Snapshot(Box), // use boxing to reduce total size of the enum + NoIoTask, } pub fn recover_from_applying_state( @@ -464,48 +155,23 @@ pub fn recover_from_applying_state( let raft_state = box_try!(engines.raft.get_raft_state(region_id)).unwrap_or_default(); - // if we recv append log when applying snapshot, last_index in raft_local_state will - // larger than snapshot_index. since raft_local_state is written to raft engine, and - // raft write_batch is written after kv write_batch, raft_local_state may wrong if - // restart happen between the two write. so we copy raft_local_state to kv engine - // (snapshot_raft_state), and set snapshot_raft_state.last_index = snapshot_index. - // after restart, we need check last_index. - if last_index(&snapshot_raft_state) > last_index(&raft_state) { + // since raft_local_state is written to raft engine, and + // raft write_batch is written after kv write_batch. raft_local_state may wrong + // if restart happen between the two write. so we copy raft_local_state to + // kv engine (snapshot_raft_state), and set + // snapshot_raft_state.hard_state.commit = snapshot_index. after restart, we + // need check commit. + if snapshot_raft_state.get_hard_state().get_commit() > raft_state.get_hard_state().get_commit() + { // There is a gap between existing raft logs and snapshot. Clean them up. 
engines .raft - .clean(region_id, 0 /*first_index*/, &raft_state, raft_wb)?; + .clean(region_id, 0 /* first_index */, &raft_state, raft_wb)?; raft_wb.put_raft_state(region_id, &snapshot_raft_state)?; } Ok(()) } -fn init_applied_index_term( - engines: &Engines, - region: &Region, - apply_state: &RaftApplyState, -) -> Result { - if apply_state.applied_index == RAFT_INIT_LOG_INDEX { - return Ok(RAFT_INIT_LOG_TERM); - } - let truncated_state = apply_state.get_truncated_state(); - if apply_state.applied_index == truncated_state.get_index() { - return Ok(truncated_state.get_term()); - } - - match engines - .raft - .get_entry(region.get_id(), apply_state.applied_index)? - { - Some(e) => Ok(e.term), - None => Err(box_err!( - "[region {}] entry at apply index {} doesn't exist, may lose data.", - region.get_id(), - apply_state.applied_index - )), - } -} - fn init_raft_state( engines: &Engines, region: &Region, @@ -520,7 +186,9 @@ fn init_raft_state( raft_state.last_index = RAFT_INIT_LOG_INDEX; raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); raft_state.mut_hard_state().set_commit(RAFT_INIT_LOG_INDEX); - engines.raft.put_raft_state(region.get_id(), &raft_state)?; + let mut lb = engines.raft.log_batch(0); + lb.put_raft_state(region.get_id(), &raft_state)?; + engines.raft.consume(&mut lb, true)?; } Ok(raft_state) } @@ -549,91 +217,6 @@ fn init_apply_state( ) } -fn init_last_term( - engines: &Engines, - region: &Region, - raft_state: &RaftLocalState, - apply_state: &RaftApplyState, -) -> Result { - let last_idx = raft_state.get_last_index(); - if last_idx == 0 { - return Ok(0); - } else if last_idx == RAFT_INIT_LOG_INDEX { - return Ok(RAFT_INIT_LOG_TERM); - } else if last_idx == apply_state.get_truncated_state().get_index() { - return Ok(apply_state.get_truncated_state().get_term()); - } else { - assert!(last_idx > RAFT_INIT_LOG_INDEX); - } - let entry = engines.raft.get_entry(region.get_id(), last_idx)?; - match entry { - None => Err(box_err!( - "[region {}] entry at 
{} doesn't exist, may lose data.", - region.get_id(), - last_idx - )), - Some(e) => Ok(e.get_term()), - } -} - -fn validate_states( - region_id: u64, - engines: &Engines, - raft_state: &mut RaftLocalState, - apply_state: &RaftApplyState, -) -> Result<()> { - let last_index = raft_state.get_last_index(); - let mut commit_index = raft_state.get_hard_state().get_commit(); - let recorded_commit_index = apply_state.get_commit_index(); - let state_str = || -> String { - format!( - "region {}, raft state {:?}, apply state {:?}", - region_id, raft_state, apply_state - ) - }; - // The commit index of raft state may be less than the recorded commit index. - // If so, forward the commit index. - if commit_index < recorded_commit_index { - let entry = engines.raft.get_entry(region_id, recorded_commit_index)?; - if entry.map_or(true, |e| e.get_term() != apply_state.get_commit_term()) { - return Err(box_err!( - "log at recorded commit index [{}] {} doesn't exist, may lose data, {}", - apply_state.get_commit_term(), - recorded_commit_index, - state_str() - )); - } - info!("updating commit index"; "region_id" => region_id, "old" => commit_index, "new" => recorded_commit_index); - commit_index = recorded_commit_index; - } - // Invariant: applied index <= max(commit index, recorded commit index) - if apply_state.get_applied_index() > commit_index { - return Err(box_err!( - "applied index > max(commit index, recorded commit index), {}", - state_str() - )); - } - // Invariant: max(commit index, recorded commit index) <= last index - if commit_index > last_index { - return Err(box_err!( - "max(commit index, recorded commit index) > last index, {}", - state_str() - )); - } - // Since the entries must be persisted before applying, the term of raft state should also - // be persisted. So it should be greater than the commit term of apply state. 
- if raft_state.get_hard_state().get_term() < apply_state.get_commit_term() { - return Err(box_err!( - "term of raft state < commit term of apply state, {}", - state_str() - )); - } - - raft_state.mut_hard_state().set_commit(commit_index); - - Ok(()) -} - pub struct PeerStorage where EK: KvEngine, @@ -641,73 +224,32 @@ where pub engines: Engines, peer_id: u64, + peer: Option, // when uninitialized the peer info is unknown. region: metapb::Region, - raft_state: RaftLocalState, - apply_state: RaftApplyState, - applied_index_term: u64, - last_term: u64, snap_state: RefCell, gen_snap_task: RefCell>, region_scheduler: Scheduler>, snap_tried_cnt: RefCell, - cache: EntryCache, - - raftlog_fetch_scheduler: Scheduler, - raftlog_fetch_stats: AsyncFetchStats, - async_fetch_results: RefCell>, + entry_storage: EntryStorage, pub tag: String, } -#[derive(Debug, PartialEq)] -pub enum RaftlogFetchState { - Fetching, - Fetched(Box), -} - -#[derive(Debug, PartialEq)] -pub struct RaftlogFetchResult { - pub ents: raft::Result>, - // because entries may be empty, so store the original low index that the task issued - pub low: u64, - // the original max size that the task issued - pub max_size: u64, - // if the ents hit max_size - pub hit_size_limit: bool, - // the times that async fetch have already tried - pub tried_cnt: usize, - // the term when the task issued - pub term: u64, -} +impl Deref for PeerStorage { + type Target = EntryStorage; -#[derive(Default)] -struct AsyncFetchStats { - async_fetch: Cell, - sync_fetch: Cell, - fallback_fetch: Cell, - fetch_invalid: Cell, - fetch_unused: Cell, + #[inline] + fn deref(&self) -> &Self::Target { + &self.entry_storage + } } -impl AsyncFetchStats { - fn flush_stats(&mut self) { - RAFT_ENTRY_FETCHES - .async_fetch - .inc_by(self.async_fetch.replace(0)); - RAFT_ENTRY_FETCHES - .sync_fetch - .inc_by(self.sync_fetch.replace(0)); - RAFT_ENTRY_FETCHES - .fallback_fetch - .inc_by(self.fallback_fetch.replace(0)); - RAFT_ENTRY_FETCHES - 
.fetch_invalid - .inc_by(self.fetch_invalid.replace(0)); - RAFT_ENTRY_FETCHES - .fetch_unused - .inc_by(self.fetch_unused.replace(0)); +impl DerefMut for PeerStorage { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.entry_storage } } @@ -728,19 +270,20 @@ where context: GetEntriesContext, ) -> raft::Result> { let max_size = max_size.into(); - self.entries(low, high, max_size.unwrap_or(u64::MAX), context) + self.entry_storage + .entries(low, high, max_size.unwrap_or(u64::MAX), context) } fn term(&self, idx: u64) -> raft::Result { - self.term(idx) + self.entry_storage.term(idx) } fn first_index(&self) -> raft::Result { - Ok(self.first_index()) + Ok(self.entry_storage.first_index()) } fn last_index(&self) -> raft::Result { - Ok(self.last_index()) + Ok(self.entry_storage.last_index()) } fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { @@ -757,7 +300,7 @@ where engines: Engines, region: &metapb::Region, region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, peer_id: u64, tag: String, ) -> Result> { @@ -767,31 +310,29 @@ where "peer_id" => peer_id, "path" => ?engines.kv.path(), ); - let mut raft_state = init_raft_state(&engines, region)?; + let raft_state = init_raft_state(&engines, region)?; let apply_state = init_apply_state(&engines, region)?; - if let Err(e) = validate_states(region.get_id(), &engines, &mut raft_state, &apply_state) { - return Err(box_err!("{} validate state fail: {:?}", tag, e)); - } - let last_term = init_last_term(&engines, region, &raft_state, &apply_state)?; - let applied_index_term = init_applied_index_term(&engines, region, &apply_state)?; + + let entry_storage = EntryStorage::new( + peer_id, + engines.raft.clone(), + raft_state, + apply_state, + region, + raftlog_fetch_scheduler, + )?; Ok(PeerStorage { engines, peer_id, + peer: find_peer_by_id(region, peer_id).cloned(), region: region.clone(), - raft_state, - apply_state, snap_state: 
RefCell::new(SnapState::Relax), gen_snap_task: RefCell::new(None), region_scheduler, - raftlog_fetch_scheduler, snap_tried_cnt: RefCell::new(0), tag, - applied_index_term, - last_term, - cache: EntryCache::default(), - async_fetch_results: RefCell::new(HashMap::default()), - raftlog_fetch_stats: AsyncFetchStats::default(), + entry_storage, }) } @@ -800,14 +341,14 @@ where } pub fn initial_state(&self) -> raft::Result { - let hard_state = self.raft_state.get_hard_state().clone(); + let hard_state = self.raft_state().get_hard_state().clone(); if hard_state == HardState::default() { assert!( !self.is_initialized(), "peer for region {:?} is initialized but local state {:?} has empty hard \ state", self.region, - self.raft_state + self.raft_state() ); return Ok(RaftState::new(hard_state, ConfState::default())); @@ -818,348 +359,6 @@ where )) } - fn check_range(&self, low: u64, high: u64) -> raft::Result<()> { - if low > high { - return Err(storage_error(format!( - "low: {} is greater that high: {}", - low, high - ))); - } else if low <= self.truncated_index() { - return Err(RaftError::Store(StorageError::Compacted)); - } else if high > self.last_index() + 1 { - return Err(storage_error(format!( - "entries' high {} is out of bound lastindex {}", - high, - self.last_index() - ))); - } - Ok(()) - } - - pub fn clean_async_fetch_res(&mut self, low: u64) { - self.async_fetch_results.borrow_mut().remove(&low); - } - - // Update the async fetch result. - // None indicates cleanning the fetched result. - pub fn update_async_fetch_res(&mut self, low: u64, res: Option>) { - // If it's in fetching, don't clean the async fetch result. 
- if self.async_fetch_results.borrow().get(&low) == Some(&RaftlogFetchState::Fetching) - && res.is_none() - { - return; - } - - match res { - Some(res) => { - if let Some(RaftlogFetchState::Fetched(prev)) = self - .async_fetch_results - .borrow_mut() - .insert(low, RaftlogFetchState::Fetched(res)) - { - info!( - "unconsumed async fetch res"; - "region_id" => self.region.get_id(), - "peer_id" => self.peer_id, - "res" => ?prev, - "low" => low, - ); - } - } - None => { - let prev = self.async_fetch_results.borrow_mut().remove(&low); - if prev.is_some() { - self.raftlog_fetch_stats.fetch_unused.update(|m| m + 1); - } - } - } - } - - fn async_fetch( - &self, - region_id: u64, - low: u64, - high: u64, - max_size: u64, - context: GetEntriesContext, - buf: &mut Vec, - ) -> raft::Result { - if let Some(RaftlogFetchState::Fetching) = self.async_fetch_results.borrow().get(&low) { - // already an async fetch in flight - return Err(raft::Error::Store( - raft::StorageError::LogTemporarilyUnavailable, - )); - } - - let tried_cnt = if let Some(RaftlogFetchState::Fetched(res)) = - self.async_fetch_results.borrow_mut().remove(&low) - { - assert_eq!(res.low, low); - let mut ents = res.ents?; - let first = ents.first().map(|e| e.index).unwrap(); - assert_eq!(first, res.low); - let last = ents.last().map(|e| e.index).unwrap(); - - if last + 1 >= high { - // async fetch res covers [low, high) - ents.truncate((high - first) as usize); - assert_eq!(ents.last().map(|e| e.index).unwrap(), high - 1); - if max_size < res.max_size { - limit_size(&mut ents, Some(max_size)); - } - let count = ents.len(); - buf.append(&mut ents); - fail_point!("on_async_fetch_return"); - return Ok(count); - } else if res.hit_size_limit && max_size <= res.max_size { - // async fetch res doesn't cover [low, high) due to hit size limit - if max_size < res.max_size { - limit_size(&mut ents, Some(max_size)); - }; - let count = ents.len(); - buf.append(&mut ents); - return Ok(count); - } else if last + 
RAFT_LOG_MULTI_GET_CNT > high - 1 - && res.tried_cnt + 1 == MAX_ASYNC_FETCH_TRY_CNT - { - let mut fetched_size = ents.iter().fold(0, |acc, e| acc + e.compute_size() as u64); - if max_size <= fetched_size { - limit_size(&mut ents, Some(max_size)); - let count = ents.len(); - buf.append(&mut ents); - return Ok(count); - } - - // the count of left entries isn't too large, fetch the remaining entries synchronously one by one - for idx in last + 1..high { - let ent = self.engines.raft.get_entry(region_id, idx)?; - match ent { - None => { - return Err(raft::Error::Store(raft::StorageError::Unavailable)); - } - Some(ent) => { - let size = ent.compute_size() as u64; - if fetched_size + size > max_size { - break; - } else { - fetched_size += size; - ents.push(ent); - } - } - } - } - let count = ents.len(); - buf.append(&mut ents); - return Ok(count); - } - info!( - "async fetch invalid"; - "region_id" => self.region.get_id(), - "peer_id" => self.peer_id, - "first" => first, - "last" => last, - "low" => low, - "high" => high, - "max_size" => max_size, - "res_max_size" => res.max_size, - ); - // low index or max size is changed, the result is not fit for the current range, so refetch again. 
- self.raftlog_fetch_stats.fetch_invalid.update(|m| m + 1); - res.tried_cnt + 1 - } else { - 1 - }; - - // the first/second try: get [low, high) asynchronously - // the third try: - // - if term and low are matched: use result of [low, persisted) and get [persisted, high) synchronously - // - else: get [low, high) synchronously - if tried_cnt >= MAX_ASYNC_FETCH_TRY_CNT { - // even the larger range is invalid again, fallback to fetch in sync way - self.raftlog_fetch_stats.fallback_fetch.update(|m| m + 1); - let count = self.engines.raft.fetch_entries_to( - region_id, - low, - high, - Some(max_size as usize), - buf, - )?; - return Ok(count); - } - - self.raftlog_fetch_stats.async_fetch.update(|m| m + 1); - self.async_fetch_results - .borrow_mut() - .insert(low, RaftlogFetchState::Fetching); - self.raftlog_fetch_scheduler - .schedule(RaftlogFetchTask::PeerStorage { - region_id, - context, - low, - high, - max_size: (max_size as usize), - tried_cnt, - term: self.hard_state().get_term(), - }) - .unwrap(); - Err(raft::Error::Store( - raft::StorageError::LogTemporarilyUnavailable, - )) - } - - pub fn entries( - &self, - low: u64, - high: u64, - max_size: u64, - context: GetEntriesContext, - ) -> raft::Result> { - self.check_range(low, high)?; - let mut ents = - Vec::with_capacity(std::cmp::min((high - low) as usize, MAX_INIT_ENTRY_COUNT)); - if low == high { - return Ok(ents); - } - let region_id = self.get_region_id(); - let cache_low = self.cache.first_index().unwrap_or(u64::MAX); - if high <= cache_low { - self.cache.miss.update(|m| m + 1); - return if context.can_async() { - self.async_fetch(region_id, low, high, max_size, context, &mut ents)?; - Ok(ents) - } else { - self.raftlog_fetch_stats.sync_fetch.update(|m| m + 1); - self.engines.raft.fetch_entries_to( - region_id, - low, - high, - Some(max_size as usize), - &mut ents, - )?; - Ok(ents) - }; - } - let begin_idx = if low < cache_low { - self.cache.miss.update(|m| m + 1); - let fetched_count = if 
context.can_async() { - self.async_fetch(region_id, low, cache_low, max_size, context, &mut ents)? - } else { - self.raftlog_fetch_stats.sync_fetch.update(|m| m + 1); - self.engines.raft.fetch_entries_to( - region_id, - low, - cache_low, - Some(max_size as usize), - &mut ents, - )? - }; - if fetched_count < (cache_low - low) as usize { - // Less entries are fetched than expected. - return Ok(ents); - } - cache_low - } else { - low - }; - self.cache.hit.update(|h| h + 1); - let fetched_size = ents.iter().fold(0, |acc, e| acc + e.compute_size()); - self.cache - .fetch_entries_to(begin_idx, high, fetched_size as u64, max_size, &mut ents); - Ok(ents) - } - - pub fn term(&self, idx: u64) -> raft::Result { - if idx == self.truncated_index() { - return Ok(self.truncated_term()); - } - self.check_range(idx, idx + 1)?; - if self.truncated_term() == self.last_term || idx == self.last_index() { - return Ok(self.last_term); - } - if let Some(e) = self.cache.entry(idx) { - Ok(e.get_term()) - } else { - Ok(self - .engines - .raft - .get_entry(self.get_region_id(), idx) - .unwrap() - .unwrap() - .get_term()) - } - } - - #[inline] - pub fn first_index(&self) -> u64 { - first_index(&self.apply_state) - } - - #[inline] - pub fn last_index(&self) -> u64 { - last_index(&self.raft_state) - } - - #[inline] - pub fn last_term(&self) -> u64 { - self.last_term - } - - #[inline] - pub fn raft_state(&self) -> &RaftLocalState { - &self.raft_state - } - - #[inline] - pub fn applied_index(&self) -> u64 { - self.apply_state.get_applied_index() - } - - #[inline] - pub fn set_applied_state(&mut self, apply_state: RaftApplyState) { - self.apply_state = apply_state; - } - - #[inline] - pub fn set_applied_term(&mut self, applied_index_term: u64) { - self.applied_index_term = applied_index_term; - } - - #[inline] - pub fn apply_state(&self) -> &RaftApplyState { - &self.apply_state - } - - #[inline] - pub fn applied_index_term(&self) -> u64 { - self.applied_index_term - } - - #[inline] - pub fn 
commit_index(&self) -> u64 { - self.raft_state.get_hard_state().get_commit() - } - - #[inline] - pub fn set_commit_index(&mut self, commit: u64) { - assert!(commit >= self.commit_index()); - self.raft_state.mut_hard_state().set_commit(commit); - } - - #[inline] - pub fn hard_state(&self) -> &HardState { - self.raft_state.get_hard_state() - } - - #[inline] - pub fn truncated_index(&self) -> u64 { - self.apply_state.get_truncated_state().get_index() - } - - #[inline] - pub fn truncated_term(&self) -> u64 { - self.apply_state.get_truncated_state().get_term() - } - #[inline] pub fn region(&self) -> &metapb::Region { &self.region @@ -1167,6 +366,7 @@ where #[inline] pub fn set_region(&mut self, region: metapb::Region) { + self.peer = find_peer_by_id(&region, self.peer_id).cloned(); self.region = region; } @@ -1181,7 +381,7 @@ where snapshot_index: u64, kv_wb: &mut impl Mutable, ) -> Result<()> { - let mut snapshot_raft_state = self.raft_state.clone(); + let mut snapshot_raft_state = self.raft_state().clone(); snapshot_raft_state .mut_hard_state() .set_commit(snapshot_index); @@ -1200,7 +400,7 @@ where kv_wb.put_msg_cf( CF_RAFT, &keys::apply_state_key(self.region.get_id()), - &self.apply_state, + self.apply_state(), )?; Ok(()) } @@ -1249,41 +449,69 @@ where true } - /// Gets a snapshot. Returns `SnapshotTemporarilyUnavailable` if there is no unavailable - /// snapshot. + /// Gets a snapshot. Returns `SnapshotTemporarilyUnavailable` if there is no + /// available snapshot.
pub fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { + fail_point!("ignore generate snapshot", self.peer_id == 1, |_| { + Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )) + }); + if self.peer.as_ref().unwrap().is_witness { + // witness could be the leader for a while, do not generate snapshot now + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); + } + + if find_peer_by_id(&self.region, to).map_or(false, |p| p.is_witness) { + // Although we always send the snapshot task behind apply task to get latest + // snapshot, we can't use `last_applying_idx` here, as below the judgment + // condition will generate a witness snapshot directly, the new non-witness + // will ignore this mismatch snapshot and can't request snapshot successfully + // again. + if self.applied_index() < request_index { + // It may be a request from non-witness. In order to avoid generating mismatch + // snapshots, wait for apply non-witness to complete + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); + } + // generate an empty snapshot for witness directly + return Ok(util::new_empty_snapshot( + self.region.clone(), + self.applied_index(), + self.applied_term(), + true, // for witness + )); + } + let mut snap_state = self.snap_state.borrow_mut(); let mut tried_cnt = self.snap_tried_cnt.borrow_mut(); - let (mut tried, mut last_canceled, mut snap) = (false, false, None); + let mut tried = false; + let mut last_canceled = false; if let SnapState::Generating { - ref canceled, - ref receiver, - .. - } = *snap_state + canceled, receiver, ..
+ } = &*snap_state { tried = true; last_canceled = canceled.load(Ordering::SeqCst); match receiver.try_recv() { Err(TryRecvError::Empty) => { - let e = raft::StorageError::SnapshotTemporarilyUnavailable; - return Err(raft::Error::Store(e)); + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); } - Ok(s) if !last_canceled => snap = Some(s), - Err(TryRecvError::Disconnected) | Ok(_) => {} - } - } - - if tried { - *snap_state = SnapState::Relax; - match snap { - Some(s) => { + Ok(s) if !last_canceled => { + *snap_state = SnapState::Relax; *tried_cnt = 0; if self.validate_snap(&s, request_index) { return Ok(s); } } - None => { + Err(TryRecvError::Disconnected) | Ok(_) => { + *snap_state = SnapState::Relax; warn!( "failed to try generating snapshot"; "region_id" => self.region.get_id(), @@ -1299,7 +527,12 @@ where panic!("{} unexpected state: {:?}", self.tag, *snap_state); } - if *tried_cnt >= MAX_SNAP_TRY_CNT { + let max_snap_try_cnt = (|| { + fail_point!("ignore_snap_try_cnt", |_| usize::MAX); + MAX_SNAP_TRY_CNT + })(); + + if *tried_cnt >= max_snap_try_cnt { let cnt = *tried_cnt; *tried_cnt = 0; return Err(raft::Error::Store(box_err!( @@ -1307,6 +540,9 @@ where cnt ))); } + if !tried || !last_canceled { + *tried_cnt += 1; + } info!( "requesting snapshot"; @@ -1316,10 +552,6 @@ where "request_peer" => to, ); - if !tried || !last_canceled { - *tried_cnt += 1; - } - let (sender, receiver) = mpsc::sync_channel(1); let canceled = Arc::new(AtomicBool::new(false)); let index = Arc::new(AtomicU64::new(0)); @@ -1328,130 +560,39 @@ where index: index.clone(), receiver, }; - let mut to_store_id = 0; - if let Some(peer) = self.region().get_peers().iter().find(|p| p.id == to) { - to_store_id = peer.store_id; - } - let task = GenSnapTask::new(self.region.get_id(), index, canceled, sender, to_store_id); + + let store_id = self + .region() + .get_peers() + .iter() + .find(|p| p.id == to) + .map(|p| p.store_id) + .unwrap_or(0); + let task = 
GenSnapTask::new(self.region.get_id(), index, canceled, sender, store_id); let mut gen_snap_task = self.gen_snap_task.borrow_mut(); assert!(gen_snap_task.is_none()); *gen_snap_task = Some(task); - Err(raft::Error::Store( - raft::StorageError::SnapshotTemporarilyUnavailable, - )) - } - - pub fn has_gen_snap_task(&self) -> bool { - self.gen_snap_task.borrow().is_some() - } - - pub fn mut_gen_snap_task(&mut self) -> &mut Option { - self.gen_snap_task.get_mut() - } - - pub fn take_gen_snap_task(&mut self) -> Option { - self.gen_snap_task.get_mut().take() - } - - // Append the given entries to the raft log using previous last index or self.last_index. - pub fn append(&mut self, entries: Vec, task: &mut WriteTask) { - if entries.is_empty() { - return; - } - let region_id = self.get_region_id(); - debug!( - "append entries"; - "region_id" => region_id, - "peer_id" => self.peer_id, - "count" => entries.len(), - ); - let prev_last_index = self.raft_state.get_last_index(); - - let (last_index, last_term) = { - let e = entries.last().unwrap(); - (e.get_index(), e.get_term()) - }; - - self.cache.append(&self.tag, &entries); - - task.entries = entries; - // Delete any previously appended log entries which never committed. 
- task.cut_logs = Some((last_index + 1, prev_last_index + 1)); - - self.raft_state.set_last_index(last_index); - self.last_term = last_term; - } - - pub fn compact_to(&mut self, idx: u64) { - self.compact_cache_to(idx); - - self.cancel_generating_snap(Some(idx)); - } - - pub fn compact_cache_to(&mut self, idx: u64) { - self.cache.compact_to(idx); - let rid = self.get_region_id(); - if self.engines.raft.has_builtin_entry_cache() { - self.engines.raft.gc_entry_cache(rid, idx); - } - } - - #[inline] - pub fn is_cache_empty(&self) -> bool { - self.cache.is_empty() + Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )) } - pub fn maybe_gc_cache(&mut self, replicated_idx: u64, apply_idx: u64) { - if self.engines.raft.has_builtin_entry_cache() { - let rid = self.get_region_id(); - self.engines.raft.gc_entry_cache(rid, apply_idx + 1); - } - if replicated_idx == apply_idx { - // The region is inactive, clear the cache immediately. - self.cache.compact_to(apply_idx + 1); - return; - } - let cache_first_idx = match self.cache.first_index() { - None => return, - Some(idx) => idx, - }; - if cache_first_idx > replicated_idx + 1 { - // Catching up log requires accessing fs already, let's optimize for - // the common case. - // Maybe gc to second least replicated_idx is better. - self.cache.compact_to(apply_idx + 1); - } + pub fn has_gen_snap_task(&self) -> bool { + self.gen_snap_task.borrow().is_some() } - /// Evict entries from the cache. 
- pub fn evict_cache(&mut self, half: bool) { - if !self.cache.cache.is_empty() { - let cache = &mut self.cache; - let cache_len = cache.cache.len(); - let drain_to = if half { cache_len / 2 } else { cache_len - 1 }; - let idx = cache.cache[drain_to].index; - let mem_size_change = cache.compact_to(idx + 1); - RAFT_ENTRIES_EVICT_BYTES.inc_by(mem_size_change); - } + pub fn mut_gen_snap_task(&mut self) -> &mut Option { + self.gen_snap_task.get_mut() } - pub fn cache_is_empty(&self) -> bool { - self.cache.cache.is_empty() + pub fn take_gen_snap_task(&mut self) -> Option { + self.gen_snap_task.get_mut().take() } - #[inline] - pub fn flush_cache_metrics(&mut self) { - // NOTE: memory usage of entry cache is flushed realtime. - self.cache.flush_stats(); - self.raftlog_fetch_stats.flush_stats(); - if self.engines.raft.has_builtin_entry_cache() { - if let Some(stats) = self.engines.raft.flush_stats() { - RAFT_ENTRIES_CACHES_GAUGE.set(stats.cache_size as i64); - RAFT_ENTRY_FETCHES.hit.inc_by(stats.hit as u64); - RAFT_ENTRY_FETCHES.miss.inc_by(stats.miss as u64); - } - } + pub fn on_compact_raftlog(&mut self, idx: u64) { + self.entry_storage.compact_entry_cache(idx); + self.cancel_generating_snap(Some(idx)); } // Apply the peer with given snapshot. 
@@ -1460,7 +601,7 @@ where snap: &Snapshot, task: &mut WriteTask, destroy_regions: &[metapb::Region], - ) -> Result { + ) -> Result<(metapb::Region, bool)> { info!( "begin to apply snapshot"; "region_id" => self.region.get_id(), @@ -1470,8 +611,9 @@ where let mut snap_data = RaftSnapshotData::default(); snap_data.merge_from_bytes(snap.get_data())?; - let region_id = self.get_region_id(); + let for_witness = snap_data.get_meta().get_for_witness(); + let region_id = self.get_region_id(); let region = snap_data.take_region(); if region.get_id() != region_id { return Err(box_err!( @@ -1484,55 +626,63 @@ where if task.raft_wb.is_none() { task.raft_wb = Some(self.engines.raft.log_batch(64)); } - if task.kv_wb.is_none() { - task.kv_wb = Some(self.engines.kv.write_batch()); - } let raft_wb = task.raft_wb.as_mut().unwrap(); - let kv_wb = task.kv_wb.as_mut().unwrap(); + let kv_wb = task.extra_write.ensure_v1(|| self.engines.kv.write_batch()); if self.is_initialized() { // we can only delete the old data when the peer is initialized. - let first_index = self.first_index(); + let first_index = self.entry_storage.first_index(); // It's possible that logs between `last_compacted_idx` and `first_index` are // being deleted in raftlog_gc worker. But it's OK as: - // 1. If the peer accepts a new snapshot, it must start with an index larger than - // this `first_index`; - // 2. If the peer accepts new entries after this snapshot or new snapshot, it must - // start with the new applied index, which is larger than `first_index`. + // - If the peer accepts a new snapshot, it must start with an index larger than + // this `first_index`; + // - If the peer accepts new entries after this snapshot or new snapshot, it + // must start with the new applied index, which is larger than `first_index`. // So new logs won't be deleted by on going raftlog_gc task accidentally. // It's possible that there will be some logs between `last_compacted_idx` and - // `first_index` are not deleted. 
So a cleanup task for the range should be triggered - // after applying the snapshot. + // `first_index` are not deleted. So a cleanup task for the range should be + // triggered after applying the snapshot. self.clear_meta(first_index, kv_wb, raft_wb)?; } // Write its source peers' `RegionLocalState` together with itself for atomicity for r in destroy_regions { write_peer_state(kv_wb, r, PeerState::Tombstone, None)?; } - write_peer_state(kv_wb, &region, PeerState::Applying, None)?; - let last_index = snap.get_metadata().get_index(); + // Witness snapshot is applied atomically as no async applying operation to + // region worker, so no need to set the peer state to `Applying` + let state = if for_witness { + PeerState::Normal + } else { + PeerState::Applying + }; + write_peer_state(kv_wb, &region, state, None)?; + + let snap_index = snap.get_metadata().get_index(); + let snap_term = snap.get_metadata().get_term(); - self.raft_state.set_last_index(last_index); - self.last_term = snap.get_metadata().get_term(); - self.apply_state.set_applied_index(last_index); - self.applied_index_term = self.last_term; + self.raft_state_mut().set_last_index(snap_index); + self.set_last_term(snap_term); + self.apply_state_mut().set_applied_index(snap_index); + self.set_applied_term(snap_term); // The snapshot only contains log which index > applied index, so // here the truncate state's (index, term) is in snapshot metadata. - self.apply_state.mut_truncated_state().set_index(last_index); - self.apply_state + self.apply_state_mut() + .mut_truncated_state() + .set_index(snap_index); + self.apply_state_mut() .mut_truncated_state() - .set_term(snap.get_metadata().get_term()); + .set_term(snap_term); // `region` will be updated after persisting. // Although there is an interval that other metadata are updated while `region` // is not after handing snapshot from ready, at the time of writing, it's no // problem for now.
- // The reason why the update of `region` is delayed is that we expect `region` stays - // consistent with the one in `StoreMeta::regions` which should be updated after - // persisting due to atomic snapshot and peer create process. So if we can fix - // these issues in future(maybe not?), the `region` and `StoreMeta::regions` + // The reason why the update of `region` is delayed is that we expect `region` + // stays consistent with the one in `StoreMeta::regions` which should be updated + // after persisting due to atomic snapshot and peer create process. So if we can + // fix these issues in future(maybe not?), the `region` and `StoreMeta::regions` // can updated here immediately. info!( @@ -1540,10 +690,11 @@ where "region_id" => self.region.get_id(), "peer_id" => self.peer_id, "region" => ?region, - "state" => ?self.apply_state, + "state" => ?self.apply_state(), + "for_witness" => for_witness, ); - Ok(region) + Ok((region, for_witness)) } /// Delete all meta belong to the region. Results are stored in `wb`. @@ -1560,9 +711,9 @@ where raft_wb, region_id, first_index, - &self.raft_state, + self.raft_state(), )?; - self.cache = EntryCache::default(); + self.entry_storage.clear(); Ok(()) } @@ -1613,8 +764,8 @@ where Ok(()) } - pub fn get_raft_engine(&self) -> ER { - self.engines.raft.clone() + pub fn raft_engine(&self) -> &ER { + self.entry_storage.raft_engine() } /// Check whether the storage has finished applying snapshot. @@ -1654,7 +805,8 @@ where res } - /// Cancel applying snapshot, return true if the job can be considered not be run again. + /// Cancel applying snapshot, return true if the job can be considered not + /// be run again. pub fn cancel_applying_snap(&mut self) -> bool { let is_canceled = match *self.snap_state.borrow() { SnapState::Applying(ref status) => { @@ -1734,6 +886,7 @@ where let task = RegionTask::Apply { region_id: self.get_region_id(), status, + peer_id: self.peer_id, }; // Don't schedule the snapshot to region worker. 
@@ -1759,24 +912,27 @@ where destroy_regions: Vec, ) -> Result<(HandleReadyResult, WriteTask)> { let region_id = self.get_region_id(); - let prev_raft_state = self.raft_state.clone(); + let prev_raft_state = self.raft_state().clone(); let mut write_task = WriteTask::new(region_id, self.peer_id, ready.number()); - let mut res = HandleReadyResult::SendIOTask; - if !ready.snapshot().is_empty() { + let mut res = if ready.snapshot().is_empty() { + HandleReadyResult::SendIoTask + } else { fail_point!("raft_before_apply_snap"); - let last_first_index = self.first_index(); - let snap_region = + let last_first_index = self.first_index().unwrap(); + let (snap_region, for_witness) = self.apply_snapshot(ready.snapshot(), &mut write_task, &destroy_regions)?; - res = HandleReadyResult::Snapshot { + let res = HandleReadyResult::Snapshot(Box::new(HandleSnapshotResult { msgs: ready.take_persisted_messages(), snap_region, destroy_regions, last_first_index, - }; + for_witness, + })); fail_point!("raft_after_apply_snap"); + res }; if !ready.entries().is_empty() { @@ -1785,15 +941,15 @@ where // Last index is 0 means the peer is created from raft message // and has not applied snapshot yet, so skip persistent hard state. - if self.raft_state.get_last_index() > 0 { + if self.raft_state().get_last_index() > 0 { if let Some(hs) = ready.hs() { - self.raft_state.set_hard_state(hs.clone()); + self.raft_state_mut().set_hard_state(hs.clone()); } } // Save raft state if it has changed or there is a snapshot. - if prev_raft_state != self.raft_state || !ready.snapshot().is_empty() { - write_task.raft_state = Some(self.raft_state.clone()); + if prev_raft_state != *self.raft_state() || !ready.snapshot().is_empty() { + write_task.raft_state = Some(self.raft_state().clone()); } if !ready.snapshot().is_empty() { @@ -1803,22 +959,18 @@ where // in case of recv raft log after snapshot. 
self.save_snapshot_raft_state_to( ready.snapshot().get_metadata().get_index(), - write_task.kv_wb.as_mut().unwrap(), + write_task.extra_write.v1_mut().unwrap(), )?; - self.save_apply_state_to(write_task.kv_wb.as_mut().unwrap())?; + self.save_apply_state_to(write_task.extra_write.v1_mut().unwrap())?; } if !write_task.has_data() { - res = HandleReadyResult::NoIOTask; + res = HandleReadyResult::NoIoTask; } Ok((res, write_task)) } - pub fn update_cache_persisted(&mut self, persisted: u64) { - self.cache.update_persisted(persisted); - } - pub fn persist_snapshot(&mut self, res: &PersistSnapshotResult) { // cleanup data before scheduling apply task if self.is_initialized() { @@ -1835,14 +987,15 @@ where } } - // Note that the correctness depends on the fact that these source regions MUST NOT - // serve read request otherwise a corrupt data may be returned. + // Note that the correctness depends on the fact that these source regions MUST + // NOT serve read request otherwise a corrupt data may be returned. // For now, it is ensured by - // 1. After `PrepareMerge` log is committed, the source region leader's lease will be - // suspected immediately which makes the local reader not serve read request. - // 2. No read request can be responsed in peer fsm during merging. - // These conditions are used to prevent reading **stale** data in the past. - // At present, they are also used to prevent reading **corrupt** data. + // - After `PrepareMerge` log is committed, the source region leader's lease + // will be suspected immediately which makes the local reader not serve read + // request. + // - No read request can be responded in peer fsm during merging. These + // conditions are used to prevent reading **stale** data in the past. At + // present, they are also used to prevent reading **corrupt** data. 
for r in &res.destroy_regions { if let Err(e) = self.clear_extra_data(r, &res.region) { error!(?e; @@ -1852,17 +1005,20 @@ where } } - self.schedule_applying_snapshot(); + if !res.for_witness { + self.schedule_applying_snapshot(); + } else { + // Bypass apply snapshot process for witness as the snapshot is empty, so mark + // status as finished directly here + let status = Arc::new(AtomicUsize::new(JOB_STATUS_FINISHED)); + self.set_snap_state(SnapState::Applying(Arc::clone(&status))); + } - // The `region` is updated after persisting in order to stay consistent with the one - // in `StoreMeta::regions` (will be updated soon). + // The `region` is updated after persisting in order to stay consistent with the + // one in `StoreMeta::regions` (will be updated soon). // See comments in `apply_snapshot` for more details. self.set_region(res.region.clone()); } - - pub fn trace_cached_entries(&mut self, entries: CachedEntries) { - self.cache.trace_cached_entries(entries); - } } /// Delete all meta belong to the region. Results are stored in `wb`. 
@@ -1903,10 +1059,11 @@ pub fn do_snapshot( engine: &E, kv_snap: E::Snapshot, region_id: u64, - last_applied_index_term: u64, + last_applied_term: u64, last_applied_state: RaftApplyState, for_balance: bool, allow_multi_files_snapshot: bool, + start: UnixSecs, ) -> raft::Result where E: KvEngine, @@ -1916,38 +1073,33 @@ where "region_id" => region_id, ); - let msg = kv_snap + let apply_state: RaftApplyState = kv_snap .get_msg_cf(CF_RAFT, &keys::apply_state_key(region_id)) - .map_err(into_other::<_, raft::Error>)?; - let apply_state: RaftApplyState = match msg { - None => { - return Err(storage_error(format!( - "could not load raft state of region {}", - region_id - ))); - } - Some(state) => state, - }; + .map_err(into_other::<_, raft::Error>) + .and_then(|v| { + v.ok_or_else(|| { + storage_error(format!("could not load raft state of region {}", region_id)) + }) + })?; assert_eq!(apply_state, last_applied_state); let key = SnapKey::new( region_id, - last_applied_index_term, + last_applied_term, apply_state.get_applied_index(), ); - mgr.register(key.clone(), SnapEntry::Generating); defer!(mgr.deregister(&key, &SnapEntry::Generating)); - let state: RegionLocalState = kv_snap + let region_state: RegionLocalState = kv_snap .get_msg_cf(CF_RAFT, &keys::region_state_key(key.region_id)) - .and_then(|res| match res { - None => Err(box_err!("region {} could not find region info", region_id)), - Some(state) => Ok(state), - }) - .map_err(into_other::<_, raft::Error>)?; - - if state.get_state() != PeerState::Normal { + .map_err(into_other::<_, raft::Error>) + .and_then(|v| { + v.ok_or_else(|| { + storage_error(format!("region {} could not find region info", region_id)) + }) + })?; + if region_state.get_state() != PeerState::Normal { return Err(storage_error(format!( "snap job for {} seems stale, skip.", region_id @@ -1955,38 +1107,29 @@ where } let mut snapshot = Snapshot::default(); - // Set snapshot metadata. 
snapshot.mut_metadata().set_index(key.idx); snapshot.mut_metadata().set_term(key.term); - - let conf_state = util::conf_state_from_region(state.get_region()); - snapshot.mut_metadata().set_conf_state(conf_state); - - let mut s = mgr.get_snapshot_for_building(&key)?; + snapshot + .mut_metadata() + .set_conf_state(util::conf_state_from_region(region_state.get_region())); // Set snapshot data. - let mut snap_data = RaftSnapshotData::default(); - snap_data.set_region(state.get_region().clone()); - let mut stat = SnapshotStatistics::new(); - s.build( + let mut s = mgr.get_snapshot_for_building(&key)?; + let snap_data = s.build( engine, &kv_snap, - state.get_region(), - &mut snap_data, - &mut stat, + region_state.get_region(), allow_multi_files_snapshot, + for_balance, + start, )?; - snap_data.mut_meta().set_for_balance(for_balance); - let v = snap_data.write_to_bytes()?; - snapshot.set_data(v.into()); - - SNAPSHOT_KV_COUNT_HISTOGRAM.observe(stat.kv_count as f64); - SNAPSHOT_SIZE_HISTOGRAM.observe(stat.size as f64); + snapshot.set_data(snap_data.write_to_bytes()?.into()); Ok(snapshot) } -// When we bootstrap the region we must call this to initialize region local state first. +// When we bootstrap the region we must call this to initialize region local +// state first. pub fn write_initial_raft_state(raft_wb: &mut W, region_id: u64) -> Result<()> { let mut raft_state = RaftLocalState { last_index: RAFT_INIT_LOG_INDEX, @@ -2037,34 +1180,8 @@ pub fn write_peer_state( Ok(()) } -/// Committed entries sent to apply threads. -#[derive(Clone)] -pub struct CachedEntries { - pub range: Range, - // Entries and dangle size for them. `dangle` means not in entry cache. 
- entries: Arc, usize)>>, -} - -impl CachedEntries { - pub fn new(entries: Vec) -> Self { - assert!(!entries.is_empty()); - let start = entries.first().map(|x| x.index).unwrap(); - let end = entries.last().map(|x| x.index).unwrap() + 1; - let range = Range { start, end }; - CachedEntries { - entries: Arc::new(Mutex::new((entries, 0))), - range, - } - } - - /// Take cached entries and dangle size for them. `dangle` means not in entry cache. - pub fn take_entries(&self) -> (Vec, usize) { - mem::take(&mut *self.entries.lock().unwrap()) - } -} - #[cfg(test)] -mod tests { +pub mod tests { use std::{ cell::RefCell, path::Path, @@ -2088,42 +1205,31 @@ mod tests { Error as RaftError, GetEntriesContext, StorageError, }; use tempfile::{Builder, TempDir}; - use tikv_util::worker::{dummy_scheduler, LazyWorker, Scheduler, Worker}; + use tikv_util::{ + store::{new_peer, new_witness_peer}, + worker::{dummy_scheduler, LazyWorker, Scheduler, Worker}, + }; use super::*; use crate::{ coprocessor::CoprocessorHost, store::{ - async_io::write::write_to_db_for_test, + async_io::{read::ReadRunner, write::write_to_db_for_test}, bootstrap_store, + entry_storage::tests::validate_cache, fsm::apply::compact_raft_log, initial_region, prepare_bootstrap_cluster, - worker::{RaftlogFetchRunner, RegionRunner, RegionTask}, + worker::{make_region_worker_raftstore_cfg, RegionRunner, RegionTask}, + AsyncReadNotifier, FetchedLogs, GenSnapRes, }, }; - impl EntryCache { - fn new_with_cb(cb: impl Fn(i64) + Send + 'static) -> Self { - let entry_cache = EntryCache { - persisted: 0, - cache: Default::default(), - trace: Default::default(), - hit: Cell::new(0), - miss: Cell::new(0), - size_change_cb: Some(Box::new(cb) as Box), - }; - entry_cache.flush_mem_size_change(entry_cache.get_total_mem_size()); - entry_cache - } - } - fn new_storage( region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, path: &TempDir, ) -> PeerStorage { - let kv_db = 
engine_test::kv::new_engine(path.path().to_str().unwrap(), None, ALL_CFS, None) - .unwrap(); + let kv_db = engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); let raft_path = path.path().join(Path::new("raft")); let raft_db = engine_test::raft::new_engine(raft_path.to_str().unwrap(), None).unwrap(); let engines = Engines::new(kv_db, raft_db); @@ -2151,62 +1257,48 @@ mod tests { .unwrap() } - fn new_storage_from_ents( + pub fn new_storage_from_ents( region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, path: &TempDir, ents: &[Entry], ) -> PeerStorage { let mut store = new_storage(region_scheduler, raftlog_fetch_scheduler, path); - let mut write_task = WriteTask::new(store.get_region_id(), store.peer_id, 1); + let mut write_task: WriteTask = + WriteTask::new(store.get_region_id(), store.peer_id, 1); store.append(ents[1..].to_vec(), &mut write_task); store.update_cache_persisted(ents.last().unwrap().get_index()); store - .apply_state + .apply_state_mut() .mut_truncated_state() .set_index(ents[0].get_index()); store - .apply_state + .apply_state_mut() .mut_truncated_state() .set_term(ents[0].get_term()); store - .apply_state + .apply_state_mut() .set_applied_index(ents.last().unwrap().get_index()); - if write_task.kv_wb.is_none() { - write_task.kv_wb = Some(store.engines.kv.write_batch()); - } - store - .save_apply_state_to(write_task.kv_wb.as_mut().unwrap()) - .unwrap(); - write_task.raft_state = Some(store.raft_state.clone()); + let kv_wb = write_task + .extra_write + .ensure_v1(|| store.engines.kv.write_batch()); + store.save_apply_state_to(kv_wb).unwrap(); + write_task.raft_state = Some(store.raft_state().clone()); write_to_db_for_test(&store.engines, write_task); store } - fn append_ents(store: &mut PeerStorage, ents: &[Entry]) { + pub fn append_ents(store: &mut PeerStorage, ents: &[Entry]) { if ents.is_empty() { return; } let mut write_task = WriteTask::new(store.get_region_id(), 
store.peer_id, 1); store.append(ents.to_vec(), &mut write_task); - write_task.raft_state = Some(store.raft_state.clone()); + write_task.raft_state = Some(store.raft_state().clone()); write_to_db_for_test(&store.engines, write_task); } - fn validate_cache(store: &PeerStorage, exp_ents: &[Entry]) { - assert_eq!(store.cache.cache, exp_ents); - for e in exp_ents { - let entry = store - .engines - .raft - .get_entry(store.get_region_id(), e.get_index()) - .unwrap() - .unwrap(); - assert_eq!(entry, *e); - } - } - - fn new_entry(index: u64, term: u64) -> Entry { + pub fn new_entry(index: u64, term: u64) -> Entry { let mut e = Entry::default(); e.set_index(index); e.set_term(term); @@ -2276,7 +1368,7 @@ mod tests { store .engines .kv - .scan_cf(CF_RAFT, &meta_start, &meta_end, false, |_, _| { + .scan(CF_RAFT, &meta_start, &meta_end, false, |_, _| { count += 1; Ok(true) }) @@ -2289,7 +1381,7 @@ mod tests { store .engines .kv - .scan_cf(CF_RAFT, &raft_start, &raft_end, false, |_, _| { + .scan(CF_RAFT, &raft_start, &raft_end, false, |_, _| { count += 1; Ok(true) }) @@ -2344,41 +1436,31 @@ mod tests { store .engines .raft - .consume(&mut raft_wb, false /*sync*/) + .consume(&mut raft_wb, false /* sync */) .unwrap(); assert_eq!(left, get_meta_key_count(&store)); } } - use crate::{ - store::{SignificantMsg, SignificantRouter}, - Result as RaftStoreResult, - }; - - pub struct TestRouter { - ch: SyncSender>, + pub struct TestRouter { + ch: SyncSender, } - impl TestRouter { - pub fn new() -> (Self, Receiver>) { + impl TestRouter { + pub fn new() -> (Self, Receiver) { let (tx, rx) = sync_channel(1); (Self { ch: tx }, rx) } } - impl SignificantRouter for TestRouter - where - EK: KvEngine, - { - /// Sends a significant message. We should guarantee that the message can't be dropped. 
- fn significant_send( - &self, - _: u64, - msg: SignificantMsg, - ) -> RaftStoreResult<()> { - self.ch.send(msg).unwrap(); - Ok(()) + impl AsyncReadNotifier for TestRouter { + fn notify_logs_fetched(&self, _region_id: u64, fetched_logs: FetchedLogs) { + self.ch.send(fetched_logs).unwrap(); + } + + fn notify_snapshot_generated(&self, _region_id: u64, _res: GenSnapRes) { + unreachable!(); } } @@ -2453,24 +1535,16 @@ mod tests { let raftlog_fetch_scheduler = raftlog_fetch_worker.scheduler(); let mut store = new_storage_from_ents(region_scheduler, raftlog_fetch_scheduler, &td, &ents); - raftlog_fetch_worker.start(RaftlogFetchRunner::::new( - router, - store.engines.raft.clone(), - )); - store.compact_cache_to(5); + raftlog_fetch_worker.start(ReadRunner::new(router, store.engines.raft.clone())); + store.compact_entry_cache(5); let mut e = store.entries(lo, hi, maxsize, GetEntriesContext::empty(true)); if e == Err(raft::Error::Store( raft::StorageError::LogTemporarilyUnavailable, )) { let res = rx.recv().unwrap(); - match res { - SignificantMsg::RaftlogFetched { res, context } => { - store.update_async_fetch_res(lo, Some(res)); - count += 1; - e = store.entries(lo, hi, maxsize, context); - } - _ => unreachable!(), - }; + store.update_async_fetch_res(lo, Some(res.logs)); + count += 1; + e = store.entries(lo, hi, maxsize, res.context); } if e != wentries { panic!("#{}: expect entries {:?}, got {:?}", i, wentries, e); @@ -2480,257 +1554,6 @@ mod tests { assert_ne!(count, 0); } - #[test] - fn test_async_fetch() { - let ents = vec![ - new_entry(2, 2), - new_entry(3, 3), - new_entry(4, 4), - new_entry(5, 5), - new_entry(6, 6), - ]; - - let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); - let region_worker = Worker::new("snap-manager").lazy_build("snap-manager"); - let region_scheduler = region_worker.scheduler(); - let (dummy_scheduler, _rx) = dummy_scheduler(); - let mut store = new_storage_from_ents(region_scheduler, dummy_scheduler, &td, &ents); - - let 
max_u64 = u64::max_value(); - let mut tests = vec![ - // already compacted - ( - 3, - 7, - max_u64, - 1, - RaftlogFetchResult { - ents: Err(RaftError::Store(StorageError::Compacted)), - low: 3, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: 1, - term: 1, - }, - Err(RaftError::Store(StorageError::Compacted)), - vec![], - ), - // fetch partial entries due to max size limit - ( - 3, - 7, - 30, - 1, - RaftlogFetchResult { - ents: Ok(ents[1..4].to_vec()), - low: 3, - max_size: 30, - hit_size_limit: true, - tried_cnt: 1, - term: 1, - }, - Ok(3), - ents[1..4].to_vec(), - ), - // fetch all entries - ( - 2, - 7, - max_u64, - 1, - RaftlogFetchResult { - ents: Ok(ents.clone()), - low: 2, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: 1, - term: 1, - }, - Ok(5), - ents.clone(), - ), - // high is smaller than before - ( - 3, - 5, - max_u64, - 1, - RaftlogFetchResult { - ents: Ok(ents[1..].to_vec()), - low: 3, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: 1, - term: 1, - }, - Ok(2), - ents[1..3].to_vec(), - ), - // high is larger than before, second try - ( - 3, - 7, - max_u64, - 1, - RaftlogFetchResult { - ents: Ok(ents[1..4].to_vec()), - low: 3, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: 1, - term: 1, - }, - Err(RaftError::Store(StorageError::LogTemporarilyUnavailable)), - vec![], - ), - // high is larger than before, thrid try - ( - 3, - 7, - max_u64, - 1, - RaftlogFetchResult { - ents: Ok(ents[1..4].to_vec()), - low: 3, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: 2, - term: 1, - }, - Ok(4), - ents[1..].to_vec(), - ), - // max size is smaller than before - ( - 2, - 7, - 10, - 1, - RaftlogFetchResult { - ents: Ok(ents.clone()), - low: 2, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: 1, - term: 1, - }, - Ok(2), - ents[..2].to_vec(), - ), - // max size is larger than before but with lower high - ( - 2, - 5, - 40, - 1, - RaftlogFetchResult { - ents: Ok(ents.clone()), - low: 2, - max_size: 30, - 
hit_size_limit: false, - tried_cnt: 1, - term: 1, - }, - Ok(3), - ents[..3].to_vec(), - ), - // low index is smaller than before - ( - 2, - 7, - max_u64, - 1, - RaftlogFetchResult { - ents: Err(RaftError::Store(StorageError::Compacted)), - low: 3, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: 1, - term: 1, - }, - Err(RaftError::Store(StorageError::LogTemporarilyUnavailable)), - vec![], - ), - // low index is larger than before - ( - 4, - 7, - max_u64, - 1, - RaftlogFetchResult { - ents: Ok(vec![]), - low: 3, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: 1, - term: 1, - }, - Err(RaftError::Store(StorageError::LogTemporarilyUnavailable)), - vec![], - ), - // hit tried several lmit - ( - 3, - 7, - max_u64, - 1, - RaftlogFetchResult { - ents: Ok(ents[1..4].to_vec()), - low: 3, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: MAX_ASYNC_FETCH_TRY_CNT, - term: 1, - }, - Ok(4), - ents[1..5].to_vec(), - ), - // term is changed - ( - 3, - 7, - max_u64, - 2, - RaftlogFetchResult { - ents: Ok(ents[1..4].to_vec()), - low: 3, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: MAX_ASYNC_FETCH_TRY_CNT, - term: 1, - }, - Ok(4), - ents[1..5].to_vec(), - ), - ]; - - for (i, (lo, hi, maxsize, term, async_res, expected_res, expected_ents)) in - tests.drain(..).enumerate() - { - if async_res.low != lo { - store.clean_async_fetch_res(lo); - } else { - store.update_async_fetch_res(lo, Some(Box::new(async_res))); - } - let mut ents = vec![]; - store.raft_state.mut_hard_state().set_term(term); - let res = store.async_fetch( - store.get_region_id(), - lo, - hi, - maxsize, - GetEntriesContext::empty(true), - &mut ents, - ); - if res != expected_res { - panic!("#{}: expect result {:?}, got {:?}", i, expected_res, res); - } - if ents != expected_ents { - panic!("#{}: expect ents {:?}, got {:?}", i, expected_ents, ents); - } - } - } - // last_index and first_index are not mutated by PeerStorage on its own, // so we don't test them here. 
@@ -2749,10 +1572,9 @@ mod tests { let sched = worker.scheduler(); let (dummy_scheduler, _) = dummy_scheduler(); let mut store = new_storage_from_ents(sched, dummy_scheduler, &td, &ents); - let res = store - .term(idx) - .map_err(From::from) - .and_then(|term| compact_raft_log(&store.tag, &mut store.apply_state, idx, term)); + let res = store.term(idx).map_err(From::from).and_then(|term| { + compact_raft_log(&store.tag, store.entry_storage.apply_state_mut(), idx, term) + }); // TODO check exact error type after refactoring error. if res.is_err() ^ werr.is_err() { panic!("#{}: want {:?}, got {:?}", i, werr, res); @@ -2807,23 +1629,23 @@ mod tests { let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); let snap_dir = Builder::new().prefix("snap_dir").tempdir().unwrap(); let mgr = SnapManager::new(snap_dir.path().to_str().unwrap()); + mgr.init().unwrap(); let mut worker = Worker::new("region-worker").lazy_build("region-worker"); let sched = worker.scheduler(); let (dummy_scheduler, _) = dummy_scheduler(); let mut s = new_storage_from_ents(sched.clone(), dummy_scheduler, &td, &ents); let (router, _) = mpsc::sync_channel(100); + let cfg = make_region_worker_raftstore_cfg(true); let runner = RegionRunner::new( s.engines.kv.clone(), mgr, - 0, - true, - 2, + cfg, CoprocessorHost::::default(), router, Option::>::None, ); worker.start_with_timer(runner); - let snap = s.snapshot(0, 0); + let snap = s.snapshot(0, 1); let unavailable = RaftError::Store(StorageError::SnapshotTemporarilyUnavailable); assert_eq!(snap.unwrap_err(), unavailable); assert_eq!(*s.snap_tried_cnt.borrow(), 1); @@ -2847,11 +1669,11 @@ mod tests { let (tx, rx) = channel(); s.set_snap_state(gen_snap_for_test(rx)); // Empty channel should cause snapshot call to wait. 
- assert_eq!(s.snapshot(0, 0).unwrap_err(), unavailable); + assert_eq!(s.snapshot(0, 1).unwrap_err(), unavailable); assert_eq!(*s.snap_tried_cnt.borrow(), 1); tx.send(snap.clone()).unwrap(); - assert_eq!(s.snapshot(0, 0), Ok(snap.clone())); + assert_eq!(s.snapshot(0, 1), Ok(snap.clone())); assert_eq!(*s.snap_tried_cnt.borrow(), 0); let (tx, rx) = channel(); @@ -2872,18 +1694,17 @@ mod tests { let mut hs = HardState::default(); hs.set_commit(7); hs.set_term(5); - s.raft_state.set_hard_state(hs); - s.raft_state.set_last_index(7); - s.apply_state.set_applied_index(7); - write_task.raft_state = Some(s.raft_state.clone()); - if write_task.kv_wb.is_none() { - write_task.kv_wb = Some(s.engines.kv.write_batch()); - } - s.save_apply_state_to(write_task.kv_wb.as_mut().unwrap()) - .unwrap(); + s.raft_state_mut().set_hard_state(hs); + s.raft_state_mut().set_last_index(7); + s.apply_state_mut().set_applied_index(7); + write_task.raft_state = Some(s.raft_state().clone()); + let kv_wb = write_task + .extra_write + .ensure_v1(|| s.engines.kv.write_batch()); + s.save_apply_state_to(kv_wb).unwrap(); write_to_db_for_test(&s.engines, write_task); let term = s.term(7).unwrap(); - compact_raft_log(&s.tag, &mut s.apply_state, 7, term).unwrap(); + compact_raft_log(&s.tag, s.entry_storage.apply_state_mut(), 7, term).unwrap(); let mut kv_wb = s.engines.kv.write_batch(); s.save_apply_state_to(&mut kv_wb).unwrap(); kv_wb.write().unwrap(); @@ -2893,7 +1714,7 @@ mod tests { s.set_snap_state(gen_snap_for_test(rx)); *s.snap_tried_cnt.borrow_mut() = 1; // stale snapshot should be abandoned, snapshot index < truncated index. - assert_eq!(s.snapshot(0, 0).unwrap_err(), unavailable); + assert_eq!(s.snapshot(0, 1).unwrap_err(), unavailable); assert_eq!(*s.snap_tried_cnt.borrow(), 1); let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); @@ -2910,7 +1731,7 @@ mod tests { ref s => panic!("unexpected state {:?}", s), } // Disconnected channel should trigger another try. 
- assert_eq!(s.snapshot(0, 0).unwrap_err(), unavailable); + assert_eq!(s.snapshot(0, 1).unwrap_err(), unavailable); let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); generate_and_schedule_snapshot(gen_task, &s.engines, &sched).unwrap_err(); assert_eq!(*s.snap_tried_cnt.borrow(), 2); @@ -2925,13 +1746,13 @@ mod tests { } // Scheduled job failed should trigger . - assert_eq!(s.snapshot(0, 0).unwrap_err(), unavailable); + assert_eq!(s.snapshot(0, 1).unwrap_err(), unavailable); let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); generate_and_schedule_snapshot(gen_task, &s.engines, &sched).unwrap_err(); } // When retry too many times, it should report a different error. - match s.snapshot(0, 0) { + match s.snapshot(0, 1) { Err(RaftError::Store(StorageError::Other(_))) => {} res => panic!("unexpected res: {:?}", res), } @@ -2945,6 +1766,7 @@ mod tests { let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); let snap_dir = Builder::new().prefix("snap_dir").tempdir().unwrap(); let mut mgr = SnapManager::new(snap_dir.path().to_str().unwrap()); + mgr.init().unwrap(); mgr.set_enable_multi_snapshot_files(true); mgr.set_max_per_file_size(500); let mut worker = Worker::new("region-worker").lazy_build("region-worker"); @@ -2961,12 +1783,11 @@ mod tests { let store = new_store(1, labels); pd_client.add_store(store); let pd_mock = Arc::new(pd_client); + let cfg = make_region_worker_raftstore_cfg(true); let runner = RegionRunner::new( s.engines.kv.clone(), mgr, - 0, - true, - 2, + cfg, CoprocessorHost::::default(), router, Some(pd_mock), @@ -3009,243 +1830,78 @@ mod tests { } #[test] - fn test_storage_append() { + fn test_storage_create_snapshot_for_witness() { let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; - let mut tests = vec![ - ( - vec![new_entry(4, 6), new_entry(5, 6)], - vec![new_entry(4, 6), new_entry(5, 6)], - ), - ( - vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 5)], - vec![new_entry(4, 4), new_entry(5, 5), 
new_entry(6, 5)], - ), - // truncate the existing entries and append - (vec![new_entry(4, 5)], vec![new_entry(4, 5)]), - // direct append - ( - vec![new_entry(6, 5)], - vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 5)], - ), - ]; - for (i, (entries, wentries)) in tests.drain(..).enumerate() { - let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); - let worker = LazyWorker::new("snap-manager"); - let sched = worker.scheduler(); - let (dummy_scheduler, _) = dummy_scheduler(); - let mut store = new_storage_from_ents(sched, dummy_scheduler, &td, &ents); - append_ents(&mut store, &entries); - let li = store.last_index(); - let actual_entries = store - .entries(4, li + 1, u64::max_value(), GetEntriesContext::empty(false)) - .unwrap(); - if actual_entries != wentries { - panic!("#{}: want {:?}, got {:?}", i, wentries, actual_entries); - } - } - } + let mut cs = ConfState::default(); + cs.set_voters(vec![1, 2, 3]); - #[test] - fn test_storage_cache_fetch() { - let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); - let worker = LazyWorker::new("snap-manager"); + let snap_dir = Builder::new().prefix("snap_dir").tempdir().unwrap(); + let mgr = SnapManager::new(snap_dir.path().to_str().unwrap()); + mgr.init().unwrap(); + let mut worker = Worker::new("region-worker").lazy_build("region-worker"); let sched = worker.scheduler(); let (dummy_scheduler, _) = dummy_scheduler(); - let mut store = new_storage_from_ents(sched, dummy_scheduler, &td, &ents); - store.cache.cache.clear(); - // empty cache should fetch data from rocksdb directly. 
- let mut res = store - .entries(4, 6, u64::max_value(), GetEntriesContext::empty(false)) - .unwrap(); - assert_eq!(*res, ents[1..]); - - let entries = vec![new_entry(6, 5), new_entry(7, 5)]; - append_ents(&mut store, &entries); - validate_cache(&store, &entries); + let mut s = new_storage_from_ents(sched.clone(), dummy_scheduler, &td, &ents); + let cfg = make_region_worker_raftstore_cfg(true); + let (router, _) = mpsc::sync_channel(100); + let runner = RegionRunner::new( + s.engines.kv.clone(), + mgr, + cfg, + CoprocessorHost::::default(), + router, + Option::>::None, + ); + worker.start_with_timer(runner); - // direct cache access - res = store - .entries(6, 8, u64::max_value(), GetEntriesContext::empty(false)) - .unwrap(); - assert_eq!(res, entries); + let mut r = s.region().clone(); + r.mut_peers().push(new_peer(2, 2)); + r.mut_peers().push(new_witness_peer(3, 3)); - // size limit should be supported correctly. - res = store - .entries(4, 8, 0, GetEntriesContext::empty(false)) - .unwrap(); - assert_eq!(res, vec![new_entry(4, 4)]); - let mut size = ents[1..].iter().map(|e| u64::from(e.compute_size())).sum(); - res = store - .entries(4, 8, size, GetEntriesContext::empty(false)) - .unwrap(); - let mut exp_res = ents[1..].to_vec(); - assert_eq!(res, exp_res); - for e in &entries { - size += u64::from(e.compute_size()); - exp_res.push(e.clone()); - res = store - .entries(4, 8, size, GetEntriesContext::empty(false)) - .unwrap(); - assert_eq!(res, exp_res); - } + let mut kv_wb = s.engines.kv.write_batch(); + write_peer_state(&mut kv_wb, &r, PeerState::Normal, None).unwrap(); + kv_wb.write().unwrap(); + s.set_region(r); - // range limit should be supported correctly. 
- for low in 4..9 { - for high in low..9 { - let res = store - .entries(low, high, u64::max_value(), GetEntriesContext::empty(false)) - .unwrap(); - assert_eq!(*res, exp_res[low as usize - 4..high as usize - 4]); + let wait_snapshot = |snap: raft::Result| -> Snapshot { + if let Ok(s) = snap { + return s; } - } - } - - #[test] - fn test_storage_cache_update() { - let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; - let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); - let worker = LazyWorker::new("snap-manager"); - let sched = worker.scheduler(); - let (dummy_scheduler, _) = dummy_scheduler(); - let mut store = new_storage_from_ents(sched, dummy_scheduler, &td, &ents); - store.cache.cache.clear(); - - // initial cache - let mut entries = vec![new_entry(6, 5), new_entry(7, 5)]; - append_ents(&mut store, &entries); - validate_cache(&store, &entries); - - // rewrite - entries = vec![new_entry(6, 6), new_entry(7, 6)]; - append_ents(&mut store, &entries); - validate_cache(&store, &entries); - - // rewrite old entry - entries = vec![new_entry(5, 6), new_entry(6, 6)]; - append_ents(&mut store, &entries); - validate_cache(&store, &entries); - - // partial rewrite - entries = vec![new_entry(6, 7), new_entry(7, 7)]; - append_ents(&mut store, &entries); - let mut exp_res = vec![new_entry(5, 6), new_entry(6, 7), new_entry(7, 7)]; - validate_cache(&store, &exp_res); - - // direct append - entries = vec![new_entry(8, 7), new_entry(9, 7)]; - append_ents(&mut store, &entries); - exp_res.extend_from_slice(&entries); - validate_cache(&store, &exp_res); - - // rewrite middle - entries = vec![new_entry(7, 8)]; - append_ents(&mut store, &entries); - exp_res.truncate(2); - exp_res.push(new_entry(7, 8)); - validate_cache(&store, &exp_res); - - // compact to min(5 + 1, 7) - store.cache.persisted = 5; - store.compact_to(7); - exp_res = vec![new_entry(6, 7), new_entry(7, 8)]; - validate_cache(&store, &exp_res); - - // compact to min(7 + 1, 7) - 
store.cache.persisted = 7; - store.compact_to(7); - exp_res = vec![new_entry(7, 8)]; - validate_cache(&store, &exp_res); - // compact all - store.compact_to(8); - validate_cache(&store, &[]); - // invalid compaction should be ignored. - store.compact_to(6); - } - - #[test] - fn test_storage_cache_size_change() { - let new_padded_entry = |index: u64, term: u64, pad_len: usize| { - let mut e = new_entry(index, term); - e.data = vec![b'x'; pad_len].into(); - e + let unavailable = RaftError::Store(StorageError::SnapshotTemporarilyUnavailable); + assert_eq!(snap.unwrap_err(), unavailable); + assert_eq!(*s.snap_tried_cnt.borrow(), 1); + let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); + generate_and_schedule_snapshot(gen_task, &s.engines, &sched).unwrap(); + let snap = match *s.snap_state.borrow() { + SnapState::Generating { ref receiver, .. } => { + receiver.recv_timeout(Duration::from_secs(3)).unwrap() + } + ref s => panic!("unexpected state: {:?}", s), + }; + snap }; - // Test the initial data structure size. - let (tx, rx) = mpsc::sync_channel(8); - let mut cache = EntryCache::new_with_cb(move |c: i64| tx.send(c).unwrap()); - assert_eq!(rx.try_recv().unwrap(), 896); - - cache.append( - "", - &[new_padded_entry(101, 1, 1), new_padded_entry(102, 1, 2)], - ); - assert_eq!(rx.try_recv().unwrap(), 3); - - // Test size change for one overlapped entry. - cache.append("", &[new_padded_entry(102, 2, 3)]); - assert_eq!(rx.try_recv().unwrap(), 1); - - // Test size change for all overlapped entries. - cache.append( - "", - &[new_padded_entry(101, 3, 4), new_padded_entry(102, 3, 5)], - ); - assert_eq!(rx.try_recv().unwrap(), 5); - - cache.append("", &[new_padded_entry(103, 3, 6)]); - assert_eq!(rx.try_recv().unwrap(), 6); - - // Test trace a dangle entry. - let cached_entries = CachedEntries::new(vec![new_padded_entry(100, 1, 1)]); - cache.trace_cached_entries(cached_entries); - assert_eq!(rx.try_recv().unwrap(), 1); - - // Test trace an entry which is still in cache. 
- let cached_entries = CachedEntries::new(vec![new_padded_entry(102, 3, 5)]); - cache.trace_cached_entries(cached_entries); - assert_eq!(rx.try_recv().unwrap(), 0); - - // Test compare `cached_last` with `trunc_to_idx` in `EntryCache::append_impl`. - cache.append("", &[new_padded_entry(103, 4, 7)]); - assert_eq!(rx.try_recv().unwrap(), 1); - - // Test compact one traced dangle entry and one entry in cache. - cache.persisted = 101; - cache.compact_to(102); - assert_eq!(rx.try_recv().unwrap(), -5); - - // Test compact the last traced dangle entry. - cache.persisted = 102; - cache.compact_to(103); - assert_eq!(rx.try_recv().unwrap(), -5); - - // Test compact all entries. - cache.persisted = 103; - cache.compact_to(104); - assert_eq!(rx.try_recv().unwrap(), -7); + // generate snapshot for peer + let snap = wait_snapshot(s.snapshot(0, 2)); + assert_eq!(snap.get_metadata().get_index(), 5); + assert_eq!(snap.get_metadata().get_term(), 5); + assert!(!snap.get_data().is_empty()); - drop(cache); - assert_eq!(rx.try_recv().unwrap(), -896); - } + // generate snapshot for witness peer + let snap = wait_snapshot(s.snapshot(0, 3)); + assert_eq!(snap.get_metadata().get_index(), 5); + assert_eq!(snap.get_metadata().get_term(), 5); + assert!(!snap.get_data().is_empty()); - #[test] - fn test_storage_cache_entry() { - let mut cache = EntryCache::default(); - let ents = vec![ - new_entry(3, 3), - new_entry(4, 4), - new_entry(5, 4), - new_entry(6, 6), - ]; - cache.append("", &ents); - assert!(cache.entry(1).is_none()); - assert!(cache.entry(2).is_none()); - for e in &ents { - assert_eq!(e, cache.entry(e.get_index()).unwrap()); + let mut data = RaftSnapshotData::default(); + protobuf::Message::merge_from_bytes(&mut data, snap.get_data()).unwrap(); + assert_eq!(data.get_region().get_id(), 1); + assert_eq!(data.get_region().get_peers().len(), 3); + let files = data.get_meta().get_cf_files(); + for file in files { + assert_eq!(file.get_size(), 0); } - let res = panic_hook::recover_safe(|| 
cache.entry(7)); - assert!(res.is_err()); } #[test] @@ -3262,23 +1918,23 @@ mod tests { let td1 = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); let snap_dir = Builder::new().prefix("snap").tempdir().unwrap(); let mgr = SnapManager::new(snap_dir.path().to_str().unwrap()); + mgr.init().unwrap(); let mut worker = LazyWorker::new("snap-manager"); let sched = worker.scheduler(); let (dummy_scheduler, _) = dummy_scheduler(); let s1 = new_storage_from_ents(sched.clone(), dummy_scheduler.clone(), &td1, &ents); let (router, _) = mpsc::sync_channel(100); + let cfg = make_region_worker_raftstore_cfg(true); let runner = RegionRunner::new( s1.engines.kv.clone(), mgr, - 0, - true, - 2, + cfg, CoprocessorHost::::default(), router, Option::>::None, ); worker.start(runner); - assert!(s1.snapshot(0, 0).is_err()); + s1.snapshot(0, 1).unwrap_err(); let gen_task = s1.gen_snap_task.borrow_mut().take().unwrap(); generate_and_schedule_snapshot(gen_task, &s1.engines, &sched).unwrap(); @@ -3294,18 +1950,18 @@ mod tests { let td2 = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); let mut s2 = new_storage(sched.clone(), dummy_scheduler.clone(), &td2); - assert_eq!(s2.first_index(), s2.applied_index() + 1); + assert_eq!(s2.first_index(), Ok(s2.applied_index() + 1)); let mut write_task = WriteTask::new(s2.get_region_id(), s2.peer_id, 1); - let snap_region = s2.apply_snapshot(&snap1, &mut write_task, &[]).unwrap(); + let (snap_region, _) = s2.apply_snapshot(&snap1, &mut write_task, &[]).unwrap(); let mut snap_data = RaftSnapshotData::default(); snap_data.merge_from_bytes(snap1.get_data()).unwrap(); assert_eq!(snap_region, snap_data.take_region(),); - assert_eq!(s2.last_term, snap1.get_metadata().get_term()); - assert_eq!(s2.apply_state.get_applied_index(), 6); - assert_eq!(s2.raft_state.get_last_index(), 6); - assert_eq!(s2.apply_state.get_truncated_state().get_index(), 6); - assert_eq!(s2.apply_state.get_truncated_state().get_term(), 6); - assert_eq!(s2.first_index(), 
s2.applied_index() + 1); + assert_eq!(s2.last_term(), snap1.get_metadata().get_term()); + assert_eq!(s2.apply_state().get_applied_index(), 6); + assert_eq!(s2.raft_state().get_last_index(), 6); + assert_eq!(s2.apply_state().get_truncated_state().get_index(), 6); + assert_eq!(s2.apply_state().get_truncated_state().get_term(), 6); + assert_eq!(s2.first_index(), Ok(s2.applied_index() + 1)); validate_cache(&s2, &[]); let td3 = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); @@ -3313,15 +1969,15 @@ mod tests { let mut s3 = new_storage_from_ents(sched, dummy_scheduler, &td3, ents); validate_cache(&s3, &ents[1..]); let mut write_task = WriteTask::new(s3.get_region_id(), s3.peer_id, 1); - let snap_region = s3.apply_snapshot(&snap1, &mut write_task, &[]).unwrap(); + let (snap_region, _) = s3.apply_snapshot(&snap1, &mut write_task, &[]).unwrap(); let mut snap_data = RaftSnapshotData::default(); snap_data.merge_from_bytes(snap1.get_data()).unwrap(); assert_eq!(snap_region, snap_data.take_region(),); - assert_eq!(s3.last_term, snap1.get_metadata().get_term()); - assert_eq!(s3.apply_state.get_applied_index(), 6); - assert_eq!(s3.raft_state.get_last_index(), 6); - assert_eq!(s3.apply_state.get_truncated_state().get_index(), 6); - assert_eq!(s3.apply_state.get_truncated_state().get_term(), 6); + assert_eq!(s3.last_term(), snap1.get_metadata().get_term()); + assert_eq!(s3.apply_state().get_applied_index(), 6); + assert_eq!(s3.raft_state().get_last_index(), 6); + assert_eq!(s3.apply_state().get_truncated_state().get_index(), 6); + assert_eq!(s3.apply_state().get_truncated_state().get_term(), 6); validate_cache(&s3, &[]); } @@ -3369,7 +2025,7 @@ mod tests { JOB_STATUS_FAILED, )))); let res = panic_hook::recover_safe(|| s.cancel_applying_snap()); - assert!(res.is_err()); + res.unwrap_err(); } #[test] @@ -3419,7 +2075,7 @@ mod tests { JOB_STATUS_FAILED, )))); let res = panic_hook::recover_safe(|| s.check_applying_snap()); - assert!(res.is_err()); + res.unwrap_err(); } 
#[test] @@ -3429,8 +2085,7 @@ mod tests { let region_sched = region_worker.scheduler(); let raftlog_fetch_worker = LazyWorker::new("raftlog-fetch-worker"); let raftlog_fetch_sched = raftlog_fetch_worker.scheduler(); - let kv_db = - engine_test::kv::new_engine(td.path().to_str().unwrap(), None, ALL_CFS, None).unwrap(); + let kv_db = engine_test::kv::new_engine(td.path().to_str().unwrap(), ALL_CFS).unwrap(); let raft_path = td.path().join(Path::new("raft")); let raft_db = engine_test::raft::new_engine(raft_path.to_str().unwrap(), None).unwrap(); let engines = Engines::new(kv_db, raft_db); @@ -3456,32 +2111,35 @@ mod tests { let initial_state = s.initial_state().unwrap(); assert_eq!(initial_state.hard_state, *raft_state.get_hard_state()); + let mut lb = engines.raft.log_batch(4096); // last_index < commit_index is invalid. raft_state.set_last_index(11); - engines - .raft - .append(1, vec![new_entry(11, RAFT_INIT_LOG_TERM)]) + lb.append(1, None, vec![new_entry(11, RAFT_INIT_LOG_TERM)]) .unwrap(); raft_state.mut_hard_state().set_commit(12); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); raft_state.set_last_index(20); let entries = (12..=20) .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); - engines.raft.append(1, entries).unwrap(); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.append(1, None, entries).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); s = build_storage().unwrap(); let initial_state = s.initial_state().unwrap(); assert_eq!(initial_state.hard_state, *raft_state.get_hard_state()); // Missing last log is invalid. 
raft_state.set_last_index(21); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); raft_state.set_last_index(20); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); // applied_index > commit_index is invalid. let mut apply_state = RaftApplyState::default(); @@ -3498,7 +2156,8 @@ mod tests { assert!(build_storage().is_err()); // It should not recover if corresponding log doesn't exist. - engines.raft.gc(1, 14, 15).unwrap(); + engines.raft.gc(1, 14, 15, &mut lb).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); apply_state.set_commit_index(14); apply_state.set_commit_term(RAFT_INIT_LOG_TERM); engines @@ -3510,8 +2169,9 @@ mod tests { let entries = (14..=20) .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); - engines.raft.gc(1, 0, 21).unwrap(); - engines.raft.append(1, entries).unwrap(); + engines.raft.gc(1, 0, 21, &mut lb).unwrap(); + lb.append(1, None, entries).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); raft_state.mut_hard_state().set_commit(14); s = build_storage().unwrap(); let initial_state = s.initial_state().unwrap(); @@ -3522,27 +2182,28 @@ mod tests { .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); entries[0].set_term(RAFT_INIT_LOG_TERM - 1); - engines.raft.append(1, entries).unwrap(); + lb.append(1, None, entries).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); // hard state term miss match is invalid. 
let entries = (14..=20) .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); - engines.raft.append(1, entries).unwrap(); + lb.append(1, None, entries).unwrap(); raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM - 1); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); // last index < recorded_commit_index is invalid. - engines.raft.gc(1, 0, 21).unwrap(); + engines.raft.gc(1, 0, 21, &mut lb).unwrap(); raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); raft_state.set_last_index(13); - engines - .raft - .append(1, vec![new_entry(13, RAFT_INIT_LOG_TERM)]) + lb.append(1, None, vec![new_entry(13, RAFT_INIT_LOG_TERM)]) .unwrap(); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); } diff --git a/components/raftstore/src/store/read_queue.rs b/components/raftstore/src/store/read_queue.rs index 9e6c9cf69f0..7ab0ca0cd93 100644 --- a/components/raftstore/src/store/read_queue.rs +++ b/components/raftstore/src/store/read_queue.rs @@ -4,7 +4,6 @@ use std::{cmp, collections::VecDeque, mem, u64, usize}; use collections::HashMap; -use engine_traits::Snapshot; use kvproto::{ kvrpcpb::LockInfo, raft_cmdpb::{self, RaftCmdRequest}, @@ -21,19 +20,17 @@ use tikv_util::{ use time::Timespec; use uuid::Uuid; +use super::msg::ErrorCallback; use crate::{ - store::{fsm::apply, metrics::*, Callback, Config}, + store::{fsm::apply, metrics::*, Config}, Result, }; const READ_QUEUE_SHRINK_SIZE: usize = 64; -pub struct ReadIndexRequest -where - S: Snapshot, -{ +pub struct ReadIndexRequest { pub id: Uuid, - cmds: MustConsumeVec<(RaftCmdRequest, Callback, Option)>, + cmds: MustConsumeVec<(RaftCmdRequest, C, Option)>, pub propose_time: Timespec, pub read_index: Option, pub addition_request: Option>, @@ -44,24 +41,16 
@@ where cmds_heap_size: usize, } -impl ReadIndexRequest -where - S: Snapshot, -{ - const CMD_SIZE: usize = mem::size_of::<(RaftCmdRequest, Callback, Option)>(); +impl ReadIndexRequest { + const CMD_SIZE: usize = mem::size_of::<(RaftCmdRequest, C, Option)>(); - pub fn push_command(&mut self, req: RaftCmdRequest, cb: Callback, read_index: u64) { + pub fn push_command(&mut self, req: RaftCmdRequest, cb: C, read_index: u64) { RAFT_READ_INDEX_PENDING_COUNT.inc(); self.cmds_heap_size += req.heap_size(); self.cmds.push((req, cb, Some(read_index))); } - pub fn with_command( - id: Uuid, - req: RaftCmdRequest, - cb: Callback, - propose_time: Timespec, - ) -> Self { + pub fn with_command(id: Uuid, req: RaftCmdRequest, cb: C, propose_time: Timespec) -> Self { RAFT_READ_INDEX_PENDING_COUNT.inc(); // Ignore heap allocations for `Callback`. @@ -81,31 +70,25 @@ where } } - pub fn cmds(&self) -> &[(RaftCmdRequest, Callback, Option)] { - &*self.cmds + pub fn cmds(&self) -> &[(RaftCmdRequest, C, Option)] { + &self.cmds } - pub fn take_cmds(&mut self) -> MustConsumeVec<(RaftCmdRequest, Callback, Option)> { + pub fn take_cmds(&mut self) -> MustConsumeVec<(RaftCmdRequest, C, Option)> { self.cmds_heap_size = 0; self.cmds.take() } } -impl Drop for ReadIndexRequest -where - S: Snapshot, -{ +impl Drop for ReadIndexRequest { fn drop(&mut self) { let dur = (monotonic_raw_now() - self.propose_time).to_std().unwrap(); RAFT_READ_INDEX_PENDING_DURATION.observe(duration_to_sec(dur)); } } -pub struct ReadIndexQueue -where - S: Snapshot, -{ - reads: VecDeque>, +pub struct ReadIndexQueue { + reads: VecDeque>, ready_cnt: usize, // How many requests are handled. 
handled_cnt: usize, @@ -113,27 +96,33 @@ where contexts: HashMap, retry_countdown: usize, + tag: String, } -impl Default for ReadIndexQueue -where - S: Snapshot, -{ - fn default() -> ReadIndexQueue { +impl Default for ReadIndexQueue { + fn default() -> ReadIndexQueue { ReadIndexQueue { reads: VecDeque::new(), ready_cnt: 0, handled_cnt: 0, contexts: HashMap::default(), retry_countdown: 0, + tag: "".to_string(), } } } -impl ReadIndexQueue -where - S: Snapshot, -{ +impl ReadIndexQueue { + pub fn new(tag: String) -> ReadIndexQueue { + ReadIndexQueue { + reads: VecDeque::new(), + ready_cnt: 0, + handled_cnt: 0, + contexts: HashMap::default(), + retry_countdown: 0, + tag, + } + } /// Check it's necessary to retry pending read requests or not. /// Return true if all such conditions are satisfied: /// 1. more than an election timeout elapsed from the last request push; @@ -162,8 +151,9 @@ where self.ready_cnt != self.reads.len() } - /// Clear all commands in the queue. if `notify_removed` contains an `region_id`, - /// notify the request's callback that the region is removed. + /// Clear all commands in the queue. if `notify_removed` contains an + /// `region_id`, notify the request's callback that the region is + /// removed. pub fn clear_all(&mut self, notify_removed: Option) { let mut removed = 0; for mut read in self.reads.drain(..) 
{ @@ -195,7 +185,7 @@ where self.contexts.clear(); } - pub fn push_back(&mut self, mut read: ReadIndexRequest, is_leader: bool) { + pub fn push_back(&mut self, mut read: ReadIndexRequest, is_leader: bool) { if !is_leader { read.in_contexts = true; let offset = self.handled_cnt + self.reads.len(); @@ -205,22 +195,22 @@ where self.retry_countdown = usize::MAX; } - pub fn back_mut(&mut self) -> Option<&mut ReadIndexRequest> { + pub fn back_mut(&mut self) -> Option<&mut ReadIndexRequest> { self.reads.back_mut() } - pub fn back(&self) -> Option<&ReadIndexRequest> { + pub fn back(&self) -> Option<&ReadIndexRequest> { self.reads.back() } - pub fn last_ready(&self) -> Option<&ReadIndexRequest> { + pub fn last_ready(&self) -> Option<&ReadIndexRequest> { if self.ready_cnt > 0 { return Some(&self.reads[self.ready_cnt - 1]); } None } - pub fn advance_leader_reads(&mut self, tag: &str, states: T) + pub fn advance_leader_reads(&mut self, states: T) where T: IntoIterator, u64)>, { @@ -236,7 +226,7 @@ where None => None, }; - error!("{} unexpected uuid detected", tag; "current_id" => ?invalid_id); + error!("{} unexpected uuid detected", &self.tag; "current_id" => ?invalid_id); let mut expect_id_track = vec![]; for i in (0..self.ready_cnt).rev().take(10).rev() { expect_id_track.push((i, self.reads.get(i).map(|r| (r.id, r.propose_time)))); @@ -251,7 +241,7 @@ where error!("context around"; "expect_id_track" => ?expect_id_track, "actual_id_track" => ?actual_id_track); panic!( "{} unexpected uuid detected {} != {:?} at {}", - tag, uuid, invalid_id, self.ready_cnt + &self.tag, uuid, invalid_id, self.ready_cnt ); } } @@ -332,7 +322,7 @@ where } } - pub fn pop_front(&mut self) -> Option> { + pub fn pop_front(&mut self) -> Option> { if self.ready_cnt == 0 { return None; } @@ -349,8 +339,9 @@ where Some(res) } - /// Raft could have not been ready to handle the poped task. So put it back into the queue. 
- pub fn push_front(&mut self, read: ReadIndexRequest) { + /// Raft could have not been ready to handle the poped task. So put it back + /// into the queue. + pub fn push_front(&mut self, read: ReadIndexRequest) { debug_assert!(read.read_index.is_some()); self.reads.push_front(read); self.ready_cnt += 1; @@ -442,10 +433,7 @@ mod memtrace { use super::*; - impl HeapSize for ReadIndexRequest - where - S: Snapshot, - { + impl HeapSize for ReadIndexRequest { fn heap_size(&self) -> usize { let mut size = self.cmds_heap_size + Self::CMD_SIZE * self.cmds.capacity(); if let Some(ref add) = self.addition_request { @@ -455,13 +443,10 @@ mod memtrace { } } - impl HeapSize for ReadIndexQueue - where - S: Snapshot, - { + impl HeapSize for ReadIndexQueue { #[inline] fn heap_size(&self) -> usize { - let mut size = self.reads.capacity() * mem::size_of::>() + let mut size = self.reads.capacity() * mem::size_of::>() // For one Uuid and one usize. + 24 * self.contexts.len(); for read in &self.reads { @@ -491,7 +476,8 @@ mod read_index_ctx_tests { } ); - // Old version TiKV should be able to parse context without lock checking fields. + // Old version TiKV should be able to parse context without lock checking + // fields. let bytes = ctx.to_bytes(); assert_eq!(bytes, id.as_bytes()); } @@ -519,10 +505,11 @@ mod tests { use engine_test::kv::KvTestSnapshot; use super::*; + use crate::store::Callback; #[test] fn test_read_queue_fold() { - let mut queue = ReadIndexQueue:: { + let mut queue = ReadIndexQueue::> { handled_cnt: 125, ..Default::default() }; @@ -581,7 +568,7 @@ mod tests { #[test] fn test_become_leader_then_become_follower() { - let mut queue = ReadIndexQueue:: { + let mut queue = ReadIndexQueue::> { handled_cnt: 100, ..Default::default() }; @@ -598,7 +585,7 @@ mod tests { // After the peer becomes leader, `advance` could be called before // `clear_uncommitted_on_role_change`. 
- queue.advance_leader_reads("", vec![(id, None, 10)]); + queue.advance_leader_reads(vec![(id, None, 10)]); while let Some(mut read) = queue.pop_front() { read.cmds.clear(); } @@ -613,7 +600,7 @@ mod tests { ); queue.push_back(req, true); let last_id = queue.reads.back().map(|t| t.id).unwrap(); - queue.advance_leader_reads("", vec![(last_id, None, 10)]); + queue.advance_leader_reads(vec![(last_id, None, 10)]); assert_eq!(queue.ready_cnt, 1); while let Some(mut read) = queue.pop_front() { read.cmds.clear(); @@ -625,7 +612,7 @@ mod tests { #[test] fn test_retake_leadership() { - let mut queue = ReadIndexQueue:: { + let mut queue = ReadIndexQueue::> { handled_cnt: 100, ..Default::default() }; @@ -640,8 +627,9 @@ mod tests { ); queue.push_back(req, true); - // Advance on leader, but the peer is not ready to handle it (e.g. it's in merging). - queue.advance_leader_reads("", vec![(id, None, 10)]); + // Advance on leader, but the peer is not ready to handle it (e.g. it's in + // merging). + queue.advance_leader_reads(vec![(id, None, 10)]); // The leader steps down to follower, clear uncommitted reads. queue.clear_uncommitted_on_role_change(10); @@ -658,7 +646,7 @@ mod tests { queue.push_back(req, true); // Advance on leader again, shouldn't panic. 
- queue.advance_leader_reads("", vec![(id_1, None, 10)]); + queue.advance_leader_reads(vec![(id_1, None, 10)]); while let Some(mut read) = queue.pop_front() { read.cmds.clear(); } @@ -666,7 +654,7 @@ mod tests { #[test] fn test_advance_replica_reads_out_of_order() { - let mut queue = ReadIndexQueue:: { + let mut queue = ReadIndexQueue::> { handled_cnt: 100, ..Default::default() }; diff --git a/src/server/status_server/region_meta.rs b/components/raftstore/src/store/region_meta.rs similarity index 66% rename from src/server/status_server/region_meta.rs rename to components/raftstore/src/store/region_meta.rs index cd78e7382c9..30239be528c 100644 --- a/src/server/status_server/region_meta.rs +++ b/components/raftstore/src/store/region_meta.rs @@ -2,9 +2,14 @@ use std::collections::HashMap; -use kvproto::metapb::PeerRole; -use raft::{Progress, ProgressState, StateRole}; -use raftstore::store::{AbstractPeer, GroupState}; +use kvproto::{ + metapb::{self, PeerRole}, + raft_serverpb, +}; +use raft::{Progress, ProgressState, StateRole, Status}; +use serde::{Deserialize, Serialize}; + +use super::GroupState; #[derive(Debug, Copy, Clone, Serialize, Deserialize)] pub enum RaftProgressState { @@ -55,7 +60,7 @@ pub struct RaftHardState { pub commit: u64, } -#[derive(Debug, Copy, Clone, Serialize, Deserialize)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq)] pub enum RaftStateRole { Follower, Candidate, @@ -88,6 +93,8 @@ pub struct RaftStatus { pub applied: u64, pub voters: HashMap, pub learners: HashMap, + pub last_index: u64, + pub persisted_index: u64, } impl<'a> From> for RaftStatus { @@ -121,11 +128,13 @@ impl<'a> From> for RaftStatus { applied, voters, learners, + last_index: 0, + persisted_index: 0, } } } -#[derive(Debug, Copy, Clone, Serialize, Deserialize)] +#[derive(Debug, Copy, Clone, PartialEq, Serialize, Deserialize)] pub enum RaftPeerRole { Voter, Learner, @@ -144,6 +153,24 @@ impl From for RaftPeerRole { } } +impl From for PeerRole { + fn 
from(role: RaftPeerRole) -> Self { + match role { + RaftPeerRole::Voter => PeerRole::Voter, + RaftPeerRole::Learner => PeerRole::Learner, + RaftPeerRole::IncomingVoter => PeerRole::IncomingVoter, + RaftPeerRole::DemotingVoter => PeerRole::DemotingVoter, + } + } +} + +impl PartialEq for RaftPeerRole { + fn eq(&self, other: &PeerRole) -> bool { + let r: RaftPeerRole = (*other).into(); + *self == r + } +} + #[derive(Debug, Copy, Clone, Serialize, Deserialize)] pub struct Epoch { pub conf_ver: u64, @@ -155,6 +182,28 @@ pub struct RegionPeer { pub id: u64, pub store_id: u64, pub role: RaftPeerRole, + pub is_witness: bool, +} + +impl PartialEq for RegionPeer { + #[inline] + fn eq(&self, other: &metapb::Peer) -> bool { + // May not be sufficent, but always correct. + let s: metapb::Peer = (*self).into(); + s == *other + } +} + +impl From for metapb::Peer { + fn from(p: RegionPeer) -> Self { + metapb::Peer { + id: p.id, + store_id: p.store_id, + role: p.role.into(), + is_witness: p.is_witness, + ..Default::default() + } + } } #[derive(Debug, Copy, Clone, Serialize, Deserialize)] @@ -179,22 +228,37 @@ pub struct RaftApplyState { } #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct RegionMeta { +pub struct RegionLocalState { pub id: u64, - pub group_state: GroupState, pub start_key: Vec, pub end_key: Vec, pub epoch: Epoch, pub peers: Vec, pub merge_state: Option, + pub tablet_index: u64, +} + +/// A serializeable struct that exposes the internal debug information of a +/// peer. TODO: make protobuf generated code derive serde directly. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RegionMeta { + pub group_state: GroupState, pub raft_status: RaftStatus, pub raft_apply: RaftApplyState, + pub region_state: RegionLocalState, + pub bucket_keys: Vec>, } impl RegionMeta { - pub fn new(abstract_peer: &dyn AbstractPeer) -> Self { - let region = abstract_peer.region(); - let apply_state = abstract_peer.apply_state(); + pub fn new( + local_state: &raft_serverpb::RegionLocalState, + apply_state: &raft_serverpb::RaftApplyState, + group_state: GroupState, + raft_status: Status<'_>, + last_index: u64, + persisted_index: u64, + ) -> Self { + let region = local_state.get_region(); let epoch = region.get_region_epoch(); let start_key = region.get_start_key(); let end_key = region.get_end_key(); @@ -205,27 +269,21 @@ impl RegionMeta { id: peer.get_id(), store_id: peer.get_store_id(), role: peer.get_role().into(), + is_witness: peer.is_witness, }); } + let merge_state = if local_state.has_merge_state() { + Some(local_state.get_merge_state()) + } else { + None + }; + let mut raft_status: RaftStatus = raft_status.into(); + raft_status.last_index = last_index; + raft_status.persisted_index = persisted_index; Self { - id: region.get_id(), - group_state: abstract_peer.group_state(), - start_key: start_key.to_owned(), - end_key: end_key.to_owned(), - epoch: Epoch { - conf_ver: epoch.get_conf_ver(), - version: epoch.get_version(), - }, - peers, - merge_state: abstract_peer - .pending_merge_state() - .map(|state| RegionMergeState { - min_index: state.get_min_index(), - commit: state.get_commit(), - region_id: state.get_target().get_id(), - }), - raft_status: abstract_peer.raft_status().into(), + group_state, + raft_status, raft_apply: RaftApplyState { applied_index: apply_state.get_applied_index(), commit_index: apply_state.get_commit_index(), @@ -235,6 +293,23 @@ impl RegionMeta { term: apply_state.get_truncated_state().get_term(), }, }, + region_state: RegionLocalState { + id: region.get_id(), + start_key: 
start_key.to_owned(), + end_key: end_key.to_owned(), + epoch: Epoch { + conf_ver: epoch.get_conf_ver(), + version: epoch.get_version(), + }, + peers, + merge_state: merge_state.map(|state| RegionMergeState { + min_index: state.get_min_index(), + commit: state.get_commit(), + region_id: state.get_target().get_id(), + }), + tablet_index: local_state.get_tablet_index(), + }, + bucket_keys: vec![], } } } diff --git a/components/raftstore/src/store/region_snapshot.rs b/components/raftstore/src/store/region_snapshot.rs index 390c0ee0f5c..4073b71c60d 100644 --- a/components/raftstore/src/store/region_snapshot.rs +++ b/components/raftstore/src/store/region_snapshot.rs @@ -85,6 +85,11 @@ where self.snap.as_ref() } + #[inline] + pub fn set_apply_index(&self, apply_index: u64) { + self.apply_index.store(apply_index, Ordering::SeqCst); + } + #[inline] pub fn get_apply_index(&self) -> Result { let apply_index = self.apply_index.load(Ordering::SeqCst); @@ -109,12 +114,8 @@ where } } - pub fn iter(&self, iter_opt: IterOptions) -> RegionIterator { - RegionIterator::new(&self.snap, Arc::clone(&self.region), iter_opt) - } - - pub fn iter_cf(&self, cf: &str, iter_opt: IterOptions) -> Result> { - Ok(RegionIterator::new_cf( + pub fn iter(&self, cf: &str, iter_opt: IterOptions) -> Result> { + Ok(RegionIterator::new( &self.snap, Arc::clone(&self.region), iter_opt, @@ -122,26 +123,15 @@ where )) } - // scan scans database using an iterator in range [start_key, end_key), calls function f for - // each iteration, if f returns false, terminates this scan. 
- pub fn scan(&self, start_key: &[u8], end_key: &[u8], fill_cache: bool, f: F) -> Result<()> - where - F: FnMut(&[u8], &[u8]) -> Result, - { - let start = KeyBuilder::from_slice(start_key, DATA_PREFIX_KEY.len(), 0); - let end = KeyBuilder::from_slice(end_key, DATA_PREFIX_KEY.len(), 0); - let iter_opt = IterOptions::new(Some(start), Some(end), fill_cache); - self.scan_impl(self.iter(iter_opt), start_key, f) - } - - // like `scan`, only on a specific column family. - pub fn scan_cf( + // scan scans database using an iterator in range [start_key, end_key), calls + // function f for each iteration, if f returns false, terminates this scan. + pub fn scan( &self, cf: &str, start_key: &[u8], end_key: &[u8], fill_cache: bool, - f: F, + mut f: F, ) -> Result<()> where F: FnMut(&[u8], &[u8]) -> Result, @@ -149,13 +139,8 @@ where let start = KeyBuilder::from_slice(start_key, DATA_PREFIX_KEY.len(), 0); let end = KeyBuilder::from_slice(end_key, DATA_PREFIX_KEY.len(), 0); let iter_opt = IterOptions::new(Some(start), Some(end), fill_cache); - self.scan_impl(self.iter_cf(cf, iter_opt)?, start_key, f) - } - fn scan_impl(&self, mut it: RegionIterator, start_key: &[u8], mut f: F) -> Result<()> - where - F: FnMut(&[u8], &[u8]) -> Result, - { + let mut it = self.iter(cf, iter_opt)?; let mut it_valid = it.seek(start_key)?; while it_valid { it_valid = f(it.key(), it.value())? 
&& it.next()?; @@ -195,13 +180,13 @@ impl Peekable for RegionSnapshot where S: Snapshot, { - type DBVector = ::DBVector; + type DbVector = ::DbVector; fn get_value_opt( &self, opts: &ReadOptions, key: &[u8], - ) -> EngineResult> { + ) -> EngineResult> { check_key_in_range( key, self.region.get_id(), @@ -220,7 +205,7 @@ where opts: &ReadOptions, cf: &str, key: &[u8], - ) -> EngineResult> { + ) -> EngineResult> { check_key_in_range( key, self.region.get_id(), @@ -300,16 +285,7 @@ impl RegionIterator where S: Snapshot, { - pub fn new(snap: &S, region: Arc, mut iter_opt: IterOptions) -> RegionIterator { - update_lower_bound(&mut iter_opt, ®ion); - update_upper_bound(&mut iter_opt, ®ion); - let iter = snap - .iterator_opt(iter_opt) - .expect("creating snapshot iterator"); // FIXME error handling - RegionIterator { iter, region } - } - - pub fn new_cf( + pub fn new( snap: &S, region: Arc, mut iter_opt: IterOptions, @@ -318,7 +294,7 @@ where update_lower_bound(&mut iter_opt, ®ion); update_upper_bound(&mut iter_opt, ®ion); let iter = snap - .iterator_cf_opt(cf, iter_opt) + .iterator_opt(cf, iter_opt) .expect("creating snapshot iterator"); // FIXME error handling RegionIterator { iter, region } } @@ -337,15 +313,13 @@ where }); self.should_seekable(key)?; let key = keys::data_key(key); - self.iter.seek(key.as_slice().into()).map_err(Error::from) + self.iter.seek(&key).map_err(Error::from) } pub fn seek_for_prev(&mut self, key: &[u8]) -> Result { self.should_seekable(key)?; let key = keys::data_key(key); - self.iter - .seek_for_prev(key.as_slice().into()) - .map_err(Error::from) + self.iter.seek_for_prev(&key).map_err(Error::from) } pub fn prev(&mut self) -> Result { @@ -397,7 +371,7 @@ fn handle_check_key_in_region_error(e: crate::Error) -> Result<()> { #[cfg(test)] mod tests { use engine_test::{kv::KvTestSnapshot, new_temp_engine}; - use engine_traits::{Engines, KvEngine, Peekable, RaftEngine, SyncMutable}; + use engine_traits::{Engines, KvEngine, Peekable, RaftEngine, 
SyncMutable, CF_DEFAULT}; use keys::data_key; use kvproto::metapb::{Peer, Region}; use tempfile::Builder; @@ -491,7 +465,7 @@ mod tests { let db = &engines.kv; for &(ref k, level) in &levels { db.put(&data_key(k), k).unwrap(); - db.flush(true).unwrap(); + db.flush_cfs(&[], true).unwrap(); data.push((k.to_vec(), k.to_vec())); db.compact_files_in_range(Some(&data_key(k)), Some(&data_key(k)), Some(level)) .unwrap(); @@ -523,7 +497,7 @@ mod tests { assert!(v0.is_none()); let v4 = snap.get_value(b"key5"); - assert!(v4.is_err()); + v4.unwrap_err(); } #[allow(clippy::type_complexity)] @@ -548,7 +522,7 @@ mod tests { upper_bound.map(|v| KeyBuilder::from_slice(v, keys::DATA_PREFIX_KEY.len(), 0)), true, ); - let mut iter = snap.iter(iter_opt); + let mut iter = snap.iter(CF_DEFAULT, iter_opt).unwrap(); for (seek_key, in_range, seek_exp, prev_exp) in seek_table.clone() { let check_res = |iter: &RegionIterator, res: Result, @@ -650,7 +624,7 @@ mod tests { let snap = RegionSnapshot::::new(&store); let mut data = vec![]; - snap.scan(b"a2", &[0xFF, 0xFF], false, |key, value| { + snap.scan(CF_DEFAULT, b"a2", &[0xFF, 0xFF], false, |key, value| { data.push((key.to_vec(), value.to_vec())); Ok(true) }) @@ -660,7 +634,7 @@ mod tests { assert_eq!(data, &base_data[1..3]); data.clear(); - snap.scan(b"a2", &[0xFF, 0xFF], false, |key, value| { + snap.scan(CF_DEFAULT, b"a2", &[0xFF, 0xFF], false, |key, value| { data.push((key.to_vec(), value.to_vec())); Ok(false) }) @@ -668,7 +642,7 @@ mod tests { assert_eq!(data.len(), 1); - let mut iter = snap.iter(IterOptions::default()); + let mut iter = snap.iter(CF_DEFAULT, IterOptions::default()).unwrap(); assert!(iter.seek_to_first().unwrap()); let mut res = vec![]; loop { @@ -685,7 +659,7 @@ mod tests { let store = new_peer_storage(engines.clone(), ®ion); let snap = RegionSnapshot::::new(&store); data.clear(); - snap.scan(b"", &[0xFF, 0xFF], false, |key, value| { + snap.scan(CF_DEFAULT, b"", &[0xFF, 0xFF], false, |key, value| { 
data.push((key.to_vec(), value.to_vec())); Ok(true) }) @@ -694,7 +668,7 @@ mod tests { assert_eq!(data.len(), 5); assert_eq!(data, base_data); - let mut iter = snap.iter(IterOptions::default()); + let mut iter = snap.iter(CF_DEFAULT, IterOptions::default()).unwrap(); assert!(iter.seek(b"a1").unwrap()); assert!(iter.seek_to_first().unwrap()); @@ -710,11 +684,16 @@ mod tests { // test iterator with upper bound let store = new_peer_storage(engines, ®ion); let snap = RegionSnapshot::::new(&store); - let mut iter = snap.iter(IterOptions::new( - None, - Some(KeyBuilder::from_slice(b"a5", DATA_PREFIX_KEY.len(), 0)), - true, - )); + let mut iter = snap + .iter( + CF_DEFAULT, + IterOptions::new( + None, + Some(KeyBuilder::from_slice(b"a5", DATA_PREFIX_KEY.len(), 0)), + true, + ), + ) + .unwrap(); assert!(iter.seek_to_first().unwrap()); let mut res = vec![]; loop { @@ -735,7 +714,7 @@ mod tests { let snap = RegionSnapshot::::new(&store); let mut iter_opt = IterOptions::default(); iter_opt.set_lower_bound(b"a3", 1); - let mut iter = snap.iter(iter_opt); + let mut iter = snap.iter(CF_DEFAULT, iter_opt).unwrap(); assert!(iter.seek_to_last().unwrap()); let mut res = vec![]; loop { diff --git a/components/raftstore/src/store/replication_mode.rs b/components/raftstore/src/store/replication_mode.rs index bf13b9e2364..5f4602cde05 100644 --- a/components/raftstore/src/store/replication_mode.rs +++ b/components/raftstore/src/store/replication_mode.rs @@ -93,11 +93,12 @@ impl StoreGroup { /// Gets the group ID of store. /// - /// Different version may indicates different label key. If version is less than - /// recorded one, then label key has to be changed, new value can't be mixed with - /// old values, so `None` is returned. If version is larger, then label key must - /// still matches. Because `recalculate` is called before updating regions' - /// replication status, so unchanged recorded version means unchanged label key. + /// Different version may indicates different label key. 
If version is less + /// than recorded one, then label key has to be changed, new value can't + /// be mixed with old values, so `None` is returned. If version is larger, + /// then label key must still matches. Because `recalculate` is called + /// before updating regions' replication status, so unchanged recorded + /// version means unchanged label key. #[inline] pub fn group_id(&self, version: u64, store_id: u64) -> Option { if version < self.version { @@ -191,15 +192,14 @@ impl GlobalReplicationState { #[cfg(test)] mod tests { - use std::panic; use kvproto::{ metapb, replication_modepb::{ReplicationMode, ReplicationStatus}, }; + use tikv_util::store::new_peer; use super::*; - use crate::store::util::new_peer; fn new_label(key: &str, value: &str) -> metapb::StoreLabel { metapb::StoreLabel { @@ -333,6 +333,6 @@ mod tests { .group .register_store(1, vec![label1.clone(), label3.clone()]) }); - assert!(res.is_err(), "existing group id can't be changed."); + res.unwrap_err(); } } diff --git a/components/raftstore/src/store/simple_write.rs b/components/raftstore/src/store/simple_write.rs new file mode 100644 index 00000000000..a303a586935 --- /dev/null +++ b/components/raftstore/src/store/simple_write.rs @@ -0,0 +1,829 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::assert_matches::debug_assert_matches; + +use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; +use kvproto::{ + import_sstpb::SstMeta, + raft_cmdpb::{CmdType, RaftCmdRequest, RaftRequestHeader, Request}, +}; +use protobuf::{CodedInputStream, Message}; +use slog::Logger; +use tikv_util::slog_panic; + +use crate::store::{msg::ErrorCallback, WriteCallback}; + +// MAGIC number to hint simple write codec is used. If it's a protobuf message, +// the first one or several bytes are for field tag, which can't be zero. +// TODO: use protobuf blob request seems better. 
+const MAGIC_PREFIX: u8 = 0x00; + +#[derive(Clone, Debug)] +pub struct SimpleWriteBinary { + buf: Box<[u8]>, + write_type: WriteType, +} + +impl SimpleWriteBinary { + /// Freeze the binary will forbid further batching. + pub fn freeze(&mut self) { + self.write_type = WriteType::Unspecified; + } + + #[inline] + pub fn data_size(&self) -> usize { + self.buf.len() + } +} + +/// We usually use `RaftCmdRequest` for read write request. But the codec is +/// not efficient enough for simple request. `SimpleWrite` is introduce to +/// make codec alloc less and fast. +#[derive(Debug)] +pub struct SimpleWriteReqEncoder +where + C: ErrorCallback + WriteCallback, +{ + header: Box, + buf: Vec, + channels: Vec, + size_limit: usize, + write_type: WriteType, + notify_proposed: bool, +} + +impl SimpleWriteReqEncoder +where + C: ErrorCallback + WriteCallback, +{ + /// Create a request encoder. + /// + /// If `notify_proposed` is true, channels will be called `notify_proposed` + /// when it's appended. + pub fn new( + header: Box, + bin: SimpleWriteBinary, + size_limit: usize, + notify_proposed: bool, + ) -> SimpleWriteReqEncoder { + let mut buf = Vec::with_capacity(256); + buf.push(MAGIC_PREFIX); + header.write_length_delimited_to_vec(&mut buf).unwrap(); + buf.extend_from_slice(&bin.buf); + + SimpleWriteReqEncoder { + header, + buf, + channels: vec![], + size_limit, + write_type: bin.write_type, + notify_proposed, + } + } + + /// Encode the simple write into the buffer. + /// + /// Return false if the buffer limit is reached or the binary type not + /// match. 
+ #[inline] + pub fn amend(&mut self, header: &RaftRequestHeader, bin: &SimpleWriteBinary) -> bool { + if *self.header != *header { + return false; + } + if self.write_type == bin.write_type + && bin.write_type != WriteType::Unspecified + && self.buf.len() + bin.buf.len() < self.size_limit + { + self.buf.extend_from_slice(&bin.buf); + true + } else { + false + } + } + + #[inline] + pub fn data_size(&self) -> usize { + self.buf.len() + } + + #[inline] + pub fn encode(self) -> (Vec, Vec) { + (self.buf, self.channels) + } + + #[inline] + pub fn add_response_channel(&mut self, mut ch: C) { + if self.notify_proposed { + ch.notify_proposed(); + } + self.channels.push(ch); + } + + #[inline] + pub fn notify_proposed(&self) -> bool { + self.notify_proposed + } + + #[inline] + pub fn header(&self) -> &RaftRequestHeader { + &self.header + } +} + +#[derive(Debug)] +pub struct Put<'a> { + pub cf: &'a str, + pub key: &'a [u8], + pub value: &'a [u8], +} + +#[derive(Debug)] +pub struct Delete<'a> { + pub cf: &'a str, + pub key: &'a [u8], +} + +#[derive(Debug)] +pub struct DeleteRange<'a> { + pub cf: &'a str, + pub start_key: &'a [u8], + pub end_key: &'a [u8], + pub notify_only: bool, +} + +#[derive(Debug)] +pub enum SimpleWrite<'a> { + Put(Put<'a>), + Delete(Delete<'a>), + DeleteRange(DeleteRange<'a>), + Ingest(Vec), +} + +#[derive(Clone, Copy, Debug, PartialEq)] +enum WriteType { + Unspecified, + PutDelete, + DeleteRange, + Ingest, +} + +#[derive(Clone)] +pub struct SimpleWriteEncoder { + buf: Vec, + write_type: WriteType, +} + +impl SimpleWriteEncoder { + #[inline] + pub fn with_capacity(cap: usize) -> SimpleWriteEncoder { + SimpleWriteEncoder { + buf: Vec::with_capacity(cap), + write_type: WriteType::Unspecified, + } + } + + #[inline] + pub fn put(&mut self, cf: &str, key: &[u8], value: &[u8]) { + debug_assert_matches!( + self.write_type, + WriteType::Unspecified | WriteType::PutDelete + ); + encode(SimpleWrite::Put(Put { cf, key, value }), &mut self.buf); + self.write_type = 
WriteType::PutDelete; + } + + #[inline] + pub fn delete(&mut self, cf: &str, key: &[u8]) { + debug_assert_matches!( + self.write_type, + WriteType::Unspecified | WriteType::PutDelete + ); + encode(SimpleWrite::Delete(Delete { cf, key }), &mut self.buf); + self.write_type = WriteType::PutDelete; + } + + #[inline] + pub fn delete_range(&mut self, cf: &str, start_key: &[u8], end_key: &[u8], notify_only: bool) { + debug_assert_matches!( + self.write_type, + WriteType::Unspecified | WriteType::DeleteRange + ); + encode( + SimpleWrite::DeleteRange(DeleteRange { + cf, + start_key, + end_key, + notify_only, + }), + &mut self.buf, + ); + self.write_type = WriteType::DeleteRange; + } + + #[inline] + pub fn ingest(&mut self, sst: Vec) { + debug_assert_matches!(self.write_type, WriteType::Unspecified | WriteType::Ingest); + encode(SimpleWrite::Ingest(sst), &mut self.buf); + self.write_type = WriteType::Ingest; + } + + #[inline] + pub fn encode(self) -> SimpleWriteBinary { + SimpleWriteBinary { + buf: self.buf.into_boxed_slice(), + write_type: self.write_type, + } + } +} + +#[derive(Debug)] +pub struct SimpleWriteReqDecoder<'a> { + header: RaftRequestHeader, + buf: &'a [u8], +} + +impl<'a> SimpleWriteReqDecoder<'a> { + pub fn new( + fallback: impl FnOnce(&'a [u8], u64, u64) -> RaftCmdRequest, + logger: &Logger, + buf: &'a [u8], + index: u64, + term: u64, + ) -> Result, RaftCmdRequest> { + match buf.first().cloned() { + Some(MAGIC_PREFIX) => { + let mut is = CodedInputStream::from_bytes(&buf[1..]); + let header = match is.read_message() { + Ok(h) => h, + Err(e) => slog_panic!( + logger, + "data corrupted"; + "term" => term, + "index" => index, + "error" => ?e + ), + }; + let read = is.pos(); + Ok(SimpleWriteReqDecoder { + header, + buf: &buf[1 + read as usize..], + }) + } + _ => Err(fallback(buf, index, term)), + } + } + + #[inline] + pub fn header(&self) -> &RaftRequestHeader { + &self.header + } + + pub fn to_raft_cmd_request(&self) -> RaftCmdRequest { + let mut req = 
RaftCmdRequest::default(); + req.set_header(self.header().clone()); + let decoder = Self { + header: Default::default(), + buf: self.buf, + }; + for s in decoder { + match s { + SimpleWrite::Put(Put { cf, key, value }) => { + let mut request = Request::default(); + request.set_cmd_type(CmdType::Put); + request.mut_put().set_cf(cf.to_owned()); + request.mut_put().set_key(key.to_owned()); + request.mut_put().set_value(value.to_owned()); + req.mut_requests().push(request); + } + SimpleWrite::Delete(Delete { cf, key }) => { + let mut request = Request::default(); + request.set_cmd_type(CmdType::Delete); + request.mut_delete().set_cf(cf.to_owned()); + request.mut_delete().set_key(key.to_owned()); + req.mut_requests().push(request); + } + SimpleWrite::DeleteRange(DeleteRange { + cf, + start_key, + end_key, + notify_only, + }) => { + let mut request = Request::default(); + request.set_cmd_type(CmdType::DeleteRange); + request.mut_delete_range().set_cf(cf.to_owned()); + request + .mut_delete_range() + .set_start_key(start_key.to_owned()); + request.mut_delete_range().set_end_key(end_key.to_owned()); + request.mut_delete_range().set_notify_only(notify_only); + req.mut_requests().push(request); + } + SimpleWrite::Ingest(ssts) => { + for sst in ssts { + let mut request = Request::default(); + request.set_cmd_type(CmdType::IngestSst); + request.mut_ingest_sst().set_sst(sst); + req.mut_requests().push(request); + } + } + } + } + req + } +} + +impl<'a> Iterator for SimpleWriteReqDecoder<'a> { + type Item = SimpleWrite<'a>; + + #[inline] + fn next(&mut self) -> Option { + decode(&mut self.buf) + } +} + +const PUT_TAG: u8 = 0; +const DELETE_TAG: u8 = 1; +const DELETE_RANGE_TAG: u8 = 2; +const INGEST_TAG: u8 = 3; + +const DEFAULT_CF_TAG: u8 = 0; +const WRITE_CF_TAG: u8 = 1; +const LOCK_CF_TAG: u8 = 2; +const ARBITRARY_CF_TAG: u8 = 3; + +// Generally the length of most key is within 128. The length of value is +// within 2GiB. 
+// The algorithm can be checked in https://www.sqlite.org/src4/doc/trunk/www/varint.wiki. +#[inline] +fn encode_len(len: u32, buf: &mut Vec) { + match len { + 0..=240 => buf.push(len as u8), + 241..=2287 => { + buf.push((241 + (len - 240) / 256) as u8); + buf.push(((len - 240) % 256) as u8); + } + 2288..=67823 => { + buf.push(249); + buf.push(((len - 2288) / 256) as u8); + buf.push(((len - 2288) % 256) as u8); + } + 67824..=16777215 => { + buf.push(250); + let bytes = len.to_be_bytes(); + buf.extend_from_slice(&bytes[1..]); + } + 16777216..=u32::MAX => { + buf.push(251); + let bytes = len.to_be_bytes(); + buf.extend_from_slice(&bytes); + } + } +} + +#[inline] +fn decode_len(buf: &[u8]) -> (u32, &[u8]) { + let (f, left) = buf.split_first().expect("decode len can't be 0"); + match f { + 0..=240 => (*f as u32, left), + 241..=248 => { + let (s, left) = left.split_first().expect("decode len can't be 1"); + (240 + ((*f as u32) - 241) * 256 + *s as u32, left) + } + 249 => { + let (f, left) = left.split_at(2); + (2288 + (f[0] as u32) * 256 + f[1] as u32, left) + } + 250 => { + let (f, left) = left.split_at(3); + (u32::from_be_bytes([0, f[0], f[1], f[2]]), left) + } + 251 => { + let (f, left) = left.split_at(4); + (u32::from_be_bytes([f[0], f[1], f[2], f[3]]), left) + } + _ => panic!("invalid len byte: {}", f), + } +} + +#[inline] +fn encode_bytes(bytes: &[u8], buf: &mut Vec) { + encode_len(bytes.len() as u32, buf); + buf.extend_from_slice(bytes); +} + +#[inline] +fn decode_bytes(buf: &[u8]) -> (&[u8], &[u8]) { + let (len, left) = decode_len(buf); + left.split_at(len as usize) +} + +#[inline] +fn encode_cf(cf: &str, buf: &mut Vec) { + match cf { + CF_DEFAULT => buf.push(DEFAULT_CF_TAG), + CF_LOCK => buf.push(LOCK_CF_TAG), + CF_WRITE => buf.push(WRITE_CF_TAG), + cf => { + // Perhaps should return error. 
+ buf.push(ARBITRARY_CF_TAG); + encode_bytes(cf.as_bytes(), buf); + } + } +} + +#[inline] +fn decode_cf(buf: &[u8]) -> (&str, &[u8]) { + let (cf_tag, left) = buf.split_first().expect("cf cant't empty"); + match *cf_tag { + DEFAULT_CF_TAG => (CF_DEFAULT, left), + LOCK_CF_TAG => (CF_LOCK, left), + WRITE_CF_TAG => (CF_WRITE, left), + ARBITRARY_CF_TAG => { + let (cf, left) = decode_bytes(left); + ( + std::str::from_utf8(cf).expect("cf must be valid utf8"), + left, + ) + } + _ => panic!("invalid cf tag: {}", cf_tag), + } +} + +#[inline(always)] +fn encode(simple_write: SimpleWrite<'_>, buf: &mut Vec) { + match simple_write { + SimpleWrite::Put(put) => { + buf.push(PUT_TAG); + encode_cf(put.cf, buf); + encode_bytes(put.key, buf); + encode_bytes(put.value, buf); + } + SimpleWrite::Delete(delete) => { + buf.push(DELETE_TAG); + encode_cf(delete.cf, buf); + encode_bytes(delete.key, buf); + } + SimpleWrite::DeleteRange(dr) => { + buf.push(DELETE_RANGE_TAG); + encode_cf(dr.cf, buf); + encode_bytes(dr.start_key, buf); + encode_bytes(dr.end_key, buf); + buf.push(dr.notify_only as u8); + } + SimpleWrite::Ingest(ssts) => { + buf.push(INGEST_TAG); + encode_len(ssts.len() as u32, buf); + // IngestSST is not a frequent operation, use protobuf to reduce complexity. 
+ for sst in ssts { + sst.write_length_delimited_to_vec(buf).unwrap(); + } + } + } +} + +#[inline] +fn decode<'a>(buf: &mut &'a [u8]) -> Option> { + let (tag, left) = buf.split_first()?; + match *tag { + PUT_TAG => { + let (cf, left) = decode_cf(left); + let (key, left) = decode_bytes(left); + let (value, left) = decode_bytes(left); + *buf = left; + Some(SimpleWrite::Put(Put { cf, key, value })) + } + DELETE_TAG => { + let (cf, left) = decode_cf(left); + let (key, left) = decode_bytes(left); + *buf = left; + Some(SimpleWrite::Delete(Delete { cf, key })) + } + DELETE_RANGE_TAG => { + let (cf, left) = decode_cf(left); + let (start_key, left) = decode_bytes(left); + let (end_key, left) = decode_bytes(left); + let (notify_only, left) = left.split_first()?; + *buf = left; + Some(SimpleWrite::DeleteRange(DeleteRange { + cf, + start_key, + end_key, + notify_only: *notify_only != 0, + })) + } + INGEST_TAG => { + let (len, left) = decode_len(left); + let mut ssts = Vec::with_capacity(len as usize); + let mut is = CodedInputStream::from_bytes(left); + for _ in 0..len { + let sst = match is.read_message() { + Ok(sst) => sst, + Err(e) => panic!("data corrupted {:?}", e), + }; + ssts.push(sst); + } + let read = is.pos(); + *buf = &left[read as usize..]; + Some(SimpleWrite::Ingest(ssts)) + } + tag => panic!("corrupted data: invalid tag {}", tag), + } +} + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + + use kvproto::raft_cmdpb::{CmdType, Request}; + use slog::o; + + use super::*; + use crate::store::Callback; + + fn decoder_fallback(data: &[u8], index: u64, _: u64) -> RaftCmdRequest { + crate::store::util::parse_data_at(data, index, "") + } + + #[test] + fn test_codec() { + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.put(CF_DEFAULT, b"key", b""); + let delete_key = vec![0; 1024]; + encoder.delete(CF_WRITE, &delete_key); + let bin = encoder.encode(); + + let mut header = Box::::default(); + header.set_term(2); + let mut req_encoder 
= SimpleWriteReqEncoder::>::new( + header.clone(), + bin, + usize::MAX, + false, + ); + + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.delete_range(CF_LOCK, b"key", b"key", true); + encoder.delete_range("cf", b"key", b"key", false); + let bin = encoder.encode(); + assert!(!req_encoder.amend(&header, &bin)); + let req_encoder2 = SimpleWriteReqEncoder::>::new( + header.clone(), + bin, + 0, + false, + ); + + let (bytes, _) = req_encoder.encode(); + let logger = slog_global::borrow_global().new(o!()); + let mut decoder = + SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); + assert_eq!(*decoder.header(), *header); + let write = decoder.next().unwrap(); + let SimpleWrite::Put(put) = write else { panic!("should be put") }; + assert_eq!(put.cf, CF_DEFAULT); + assert_eq!(put.key, b"key"); + assert_eq!(put.value, b""); + + let write = decoder.next().unwrap(); + let SimpleWrite::Delete(delete) = write else { panic!("should be delete") }; + assert_eq!(delete.cf, CF_WRITE); + assert_eq!(delete.key, &delete_key); + assert_matches!(decoder.next(), None); + + let (bytes, _) = req_encoder2.encode(); + decoder = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); + let write = decoder.next().unwrap(); + let SimpleWrite::DeleteRange(dr) = write else { panic!("should be delete range") }; + assert_eq!(dr.cf, CF_LOCK); + assert_eq!(dr.start_key, b"key"); + assert_eq!(dr.end_key, b"key"); + assert!(dr.notify_only); + + let write = decoder.next().unwrap(); + let SimpleWrite::DeleteRange(dr) = write else { panic!("should be delete range") }; + assert_eq!(dr.cf, "cf"); + assert_eq!(dr.start_key, b"key"); + assert_eq!(dr.end_key, b"key"); + assert!(!dr.notify_only); + + let res = decoder.next(); + assert!(res.is_none(), "{:?}", res); + + let mut encoder = SimpleWriteEncoder::with_capacity(512); + let exp: Vec<_> = (0..10) + .map(|id| { + let mut meta = SstMeta::default(); + meta.set_region_id(id); + meta + }) + 
.collect(); + encoder.ingest(exp.clone()); + let bin = encoder.encode(); + let req_encoder = SimpleWriteReqEncoder::>::new( + header, bin, 0, false, + ); + let (bytes, _) = req_encoder.encode(); + let mut decoder = + SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); + let write = decoder.next().unwrap(); + let SimpleWrite::Ingest(ssts) = write else { panic!("should be ingest") }; + assert_eq!(exp, ssts); + assert_matches!(decoder.next(), None); + } + + #[test] + fn test_encode_num() { + let mut buf = Vec::new(); + let cases = vec![ + 0, + 1, + 240, + 241, + 2287, + 2288, + 67823, + 67824, + 16777215, + 16777216, + u32::MAX, + ]; + for n in cases { + super::encode_len(n, &mut buf); + buf.push(0); + let (m, left) = super::decode_len(&buf); + assert_eq!(n, m); + assert_eq!(left, &[0]); + buf.clear(); + } + } + + #[test] + fn test_invalid() { + let mut raft_cmd = RaftCmdRequest::default(); + raft_cmd.mut_header().set_term(2); + + let mut req = Request::default(); + req.set_cmd_type(CmdType::Invalid); + raft_cmd.mut_requests().push(req); + let bytes = raft_cmd.write_to_bytes().unwrap(); + let logger = slog_global::borrow_global().new(o!()); + let decoded = + SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap_err(); + // SimpleWriteReqDecoder should be able to decode naive RaftCmdRequest. + assert_eq!(decoded, raft_cmd); + + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.put(CF_DEFAULT, b"key", b""); + let bin = encoder.encode(); + + let mut header = Box::::default(); + header.set_term(2); + let mut req_encoder: SimpleWriteReqEncoder> = + SimpleWriteReqEncoder::>::new( + header.clone(), + bin.clone(), + 512, + false, + ); + + let mut header2 = Box::::default(); + header2.set_term(4); + // Only simple write command with same header can be batched. + assert!(!req_encoder.amend(&header2, &bin)); + + let mut bin2 = bin.clone(); + bin2.freeze(); + // Frozen bin can't be merged with other bin. 
+ assert!(!req_encoder.amend(&header, &bin2)); + let mut req_encoder2: SimpleWriteReqEncoder> = + SimpleWriteReqEncoder::>::new( + header.clone(), + bin2.clone(), + 512, + false, + ); + assert!(!req_encoder2.amend(&header, &bin)); + + // Batch should not excceed max size limit. + let large_value = vec![0; 512]; + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.put(CF_DEFAULT, b"key", &large_value); + assert!(!req_encoder.amend(&header, &encoder.encode())); + + let (bytes, _) = req_encoder.encode(); + let mut decoder = + SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); + assert_eq!(*decoder.header(), *header); + let req = decoder.next().unwrap(); + let SimpleWrite::Put(put) = req else { panic!("should be put") }; + assert_eq!(put.cf, CF_DEFAULT); + assert_eq!(put.key, b"key"); + assert_eq!(put.value, b""); + + let res = decoder.next(); + assert!(res.is_none(), "{:?}", res); + } + + #[test] + fn test_to_raft_cmd_request() { + let logger = slog_global::borrow_global().new(o!()); + + // Test header. + let mut header = Box::::default(); + header.set_term(2); + let req_encoder = SimpleWriteReqEncoder::>::new( + header.clone(), + SimpleWriteEncoder::with_capacity(512).encode(), + 512, + false, + ); + let (bin, _) = req_encoder.encode(); + assert_eq!( + header.as_ref(), + SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) + .unwrap() + .to_raft_cmd_request() + .get_header(), + ); + + // Test put. 
+ let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.put(CF_WRITE, b"write", b"value"); + let req_encoder = SimpleWriteReqEncoder::>::new( + header.clone(), + encoder.encode(), + 512, + false, + ); + let (bin, _) = req_encoder.encode(); + let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) + .unwrap() + .to_raft_cmd_request(); + assert_eq!(req.get_requests().len(), 1); + assert_eq!(req.get_requests()[0].get_put().get_cf(), CF_WRITE); + assert_eq!(req.get_requests()[0].get_put().get_key(), b"write"); + assert_eq!(req.get_requests()[0].get_put().get_value(), b"value"); + + // Test delete. + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.delete(CF_DEFAULT, b"write"); + let req_encoder = SimpleWriteReqEncoder::>::new( + header.clone(), + encoder.encode(), + 512, + false, + ); + let (bin, _) = req_encoder.encode(); + let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) + .unwrap() + .to_raft_cmd_request(); + assert_eq!(req.get_requests().len(), 1); + assert_eq!(req.get_requests()[0].get_delete().get_cf(), CF_DEFAULT); + assert_eq!(req.get_requests()[0].get_delete().get_key(), b"write"); + + // Test delete range. + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.delete_range(CF_LOCK, b"start", b"end", true); + let req_encoder = SimpleWriteReqEncoder::>::new( + header.clone(), + encoder.encode(), + 512, + false, + ); + let (bin, _) = req_encoder.encode(); + let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) + .unwrap() + .to_raft_cmd_request(); + assert_eq!(req.get_requests().len(), 1); + assert_eq!(req.get_requests()[0].get_delete_range().get_cf(), CF_LOCK); + assert_eq!( + req.get_requests()[0].get_delete_range().get_start_key(), + b"start" + ); + assert_eq!( + req.get_requests()[0].get_delete_range().get_end_key(), + b"end" + ); + assert_eq!( + req.get_requests()[0].get_delete_range().get_notify_only(), + true + ); + + // Test ingest. 
+ let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.ingest(vec![SstMeta::default(); 5]); + let req_encoder = SimpleWriteReqEncoder::>::new( + header, + encoder.encode(), + 512, + false, + ); + let (bin, _) = req_encoder.encode(); + let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) + .unwrap() + .to_raft_cmd_request(); + assert_eq!(req.get_requests().len(), 5); + assert!(req.get_requests()[0].has_ingest_sst()); + assert!(req.get_requests()[4].has_ingest_sst()); + } +} diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index a39cda850fa..62744501195 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -9,26 +9,25 @@ use std::{ result, str, sync::{ atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}, - Arc, RwLock, + Arc, Mutex, RwLock, }, thread, time, u64, }; use collections::{HashMap, HashMapEntry as Entry}; -use encryption::{ - create_aes_ctr_crypter, encryption_method_from_db_encryption_method, DataKeyManager, Iv, -}; +use encryption::{create_aes_ctr_crypter, from_engine_encryption_method, DataKeyManager, Iv}; use engine_traits::{CfName, EncryptionKeyManager, KvEngine, CF_DEFAULT, CF_LOCK, CF_WRITE}; use error_code::{self, ErrorCode, ErrorCodeExt}; use fail::fail_point; use file_system::{ - calc_crc32, calc_crc32_and_size, delete_file_if_exist, file_exists, get_file_size, sync_dir, - File, Metadata, OpenOptions, + calc_crc32, calc_crc32_and_size, delete_dir_if_exist, delete_file_if_exist, file_exists, + get_file_size, sync_dir, File, Metadata, OpenOptions, }; use keys::{enc_end_key, enc_start_key}; use kvproto::{ encryptionpb::EncryptionMethod, metapb::Region, + pdpb::SnapshotStat, raft_serverpb::{RaftSnapshotData, SnapshotCfFile, SnapshotMeta}, }; use openssl::symm::{Cipher, Crypter, Mode}; @@ -37,19 +36,13 @@ use raft::eraftpb::Snapshot as RaftSnapshot; use thiserror::Error; use tikv_util::{ box_err, box_try, debug, error, 
info, - time::{duration_to_sec, Instant, Limiter}, + time::{duration_to_sec, Instant, Limiter, UnixSecs}, warn, HandyRwLock, }; use crate::{ coprocessor::CoprocessorHost, - store::{ - metrics::{ - CfNames, INGEST_SST_DURATION_SECONDS, SNAPSHOT_BUILD_TIME_HISTOGRAM, - SNAPSHOT_CF_KV_COUNT, SNAPSHOT_CF_SIZE, - }, - peer_storage::JOB_STATUS_CANCELLING, - }, + store::{metrics::*, peer_storage::JOB_STATUS_CANCELLING}, Error as RaftStoreError, Result as RaftStoreResult, }; @@ -64,6 +57,7 @@ pub const SNAPSHOT_CFS_ENUM_PAIR: &[(CfNames, CfName)] = &[ (CfNames::write, CF_WRITE), ]; pub const SNAPSHOT_VERSION: u64 = 2; +pub const TABLET_SNAPSHOT_VERSION: u64 = 3; pub const IO_LIMITER_CHUNK_SIZE: usize = 4 * 1024; /// Name prefix for the self-generated snapshot file. @@ -152,7 +146,6 @@ impl SnapKey { if let Err(e) = snap_data.merge_from_bytes(snap.get_data()) { return Err(io::Error::new(ErrorKind::Other, e)); } - Ok(SnapKey::from_region_snap( snap_data.get_region().get_id(), snap, @@ -213,7 +206,9 @@ fn retry_delete_snapshot(mgr: &SnapManagerCore, key: &SnapKey, snap: &Snapshot) false } -fn gen_snapshot_meta(cf_files: &[CfFile]) -> RaftStoreResult { +// Create a SnapshotMeta that can be later put into RaftSnapshotData or written +// into file. 
+pub fn gen_snapshot_meta(cf_files: &[CfFile], for_balance: bool) -> RaftStoreResult { let mut meta = Vec::with_capacity(cf_files.len()); for cf_file in cf_files { if !SNAPSHOT_CFS.iter().any(|cf| cf_file.cf == *cf) { @@ -241,6 +236,7 @@ fn gen_snapshot_meta(cf_files: &[CfFile]) -> RaftStoreResult { } let mut snapshot_meta = SnapshotMeta::default(); snapshot_meta.set_cf_files(meta.into()); + snapshot_meta.set_for_balance(for_balance); Ok(snapshot_meta) } @@ -371,7 +367,8 @@ impl CfFile { assert!(self.size.len() >= idx); let file_name = self.gen_file_name(idx); if self.size.len() > idx { - // Any logic similar to test_snap_corruption_on_size_or_checksum will trigger this branch + // Any logic similar to test_snap_corruption_on_size_or_checksum will trigger + // this branch self.size[idx] = size; self.checksum[idx] = checksum; self.file_names[idx] = file_name.clone(); @@ -425,7 +422,7 @@ impl CfFile { #[derive(Default)] struct MetaFile { - pub meta: SnapshotMeta, + pub meta: Option, pub path: PathBuf, pub file: Option, @@ -446,7 +443,7 @@ pub struct Snapshot { mgr: SnapManagerCore, } -#[derive(PartialEq, Eq, Clone, Copy)] +#[derive(PartialEq, Clone, Copy)] enum CheckPolicy { ErrAllowed, ErrNotAllowed, @@ -563,7 +560,7 @@ impl Snapshot { for (i, file_path) in file_paths.iter().enumerate() { if cf_file.size[i] > 0 { let path = Path::new(file_path); - let file = File::open(&path)?; + let file = File::open(path)?; cf_file .file_for_sending .push(Box::new(file) as Box); @@ -606,7 +603,7 @@ impl Snapshot { let f = OpenOptions::new() .write(true) .create_new(true) - .open(&file_path)?; + .open(file_path)?; cf_file.file_for_recving.push(CfFileForRecving { file: f, encrypter: None, @@ -616,7 +613,7 @@ impl Snapshot { if let Some(mgr) = &s.mgr.encryption_key_manager { let enc_info = mgr.new_file(&file_paths[idx])?; - let mthd = encryption_method_from_db_encryption_method(enc_info.method); + let mthd = from_engine_encryption_method(enc_info.method); if mthd != 
EncryptionMethod::Plaintext { let file_for_recving = cf_file.file_for_recving.last_mut().unwrap(); file_for_recving.encrypter = Some( @@ -645,8 +642,8 @@ impl Snapshot { Ok(s) } - // If all files of the snapshot exist, return `Ok` directly. Otherwise create a new file at - // the temporary meta file path, so that all other try will fail. + // If all files of the snapshot exist, return `Ok` directly. Otherwise create a + // new file at the temporary meta file path, so that all other try will fail. fn init_for_building(&mut self) -> RaftStoreResult<()> { if self.exists() { return Ok(()); @@ -667,7 +664,8 @@ impl Snapshot { Ok(snapshot_meta) } - fn set_snapshot_meta(&mut self, snapshot_meta: SnapshotMeta) -> RaftStoreResult<()> { + // Validate and set SnapshotMeta of this Snapshot. + pub fn set_snapshot_meta(&mut self, snapshot_meta: SnapshotMeta) -> RaftStoreResult<()> { let mut cf_file_count_from_meta: Vec = vec![]; let mut file_count = 0; let mut current_cf = ""; @@ -736,7 +734,7 @@ impl Snapshot { } } } - self.meta_file.meta = snapshot_meta; + self.meta_file.meta = Some(snapshot_meta); Ok(()) } @@ -755,7 +753,7 @@ impl Snapshot { } pub fn load_snapshot_meta_if_necessary(&mut self) -> RaftStoreResult<()> { - if self.meta_file.meta.get_cf_files().is_empty() && file_exists(&self.meta_file.path) { + if self.meta_file.meta.is_none() && file_exists(&self.meta_file.path) { return self.load_snapshot_meta(); } Ok(()) @@ -794,7 +792,7 @@ impl Snapshot { if !for_send && !plain_file_used(cf_file.cf) { sst_importer::prepare_sst_for_ingestion( file_path, - &Path::new(&clone_file_paths[i]), + Path::new(&clone_file_paths[i]), self.mgr.encryption_key_manager.as_deref(), )?; } @@ -816,14 +814,15 @@ impl Snapshot { } } - // Only called in `do_build`. - fn save_meta_file(&mut self) -> RaftStoreResult<()> { - let v = box_try!(self.meta_file.meta.write_to_bytes()); + // Save `SnapshotMeta` to file. + // Used in `do_build` and by external crates. 
+ pub fn save_meta_file(&mut self) -> RaftStoreResult<()> { + let v = box_try!(self.meta_file.meta.as_ref().unwrap().write_to_bytes()); if let Some(mut f) = self.meta_file.file.take() { - // `meta_file` could be None for this case: in `init_for_building` the snapshot exists - // so no temporary meta file is created, and this field is None. However in `do_build` - // it's deleted so we build it again, and then call `save_meta_file` with `meta_file` - // as None. + // `meta_file` could be None for this case: in `init_for_building` the snapshot + // exists so no temporary meta file is created, and this field is + // None. However in `do_build` it's deleted so we build it again, + // and then call `save_meta_file` with `meta_file` as None. // FIXME: We can fix it later by introducing a better snapshot delete mechanism. f.write_all(&v[..])?; f.flush()?; @@ -844,8 +843,8 @@ impl Snapshot { engine: &EK, kv_snap: &EK::Snapshot, region: &Region, - stat: &mut SnapshotStatistics, allow_multi_files_snapshot: bool, + for_balance: bool, ) -> RaftStoreResult<()> where EK: KvEngine, @@ -893,10 +892,11 @@ impl Snapshot { &self.mgr.limiter, )? }; + SNAPSHOT_LIMIT_GENERATE_BYTES.inc_by(cf_stat.total_size as u64); cf_file.kv_count = cf_stat.key_count as u64; if cf_file.kv_count > 0 { - // Use `kv_count` instead of file size to check empty files because encrypted sst files - // contain some metadata so their sizes will never be 0. + // Use `kv_count` instead of file size to check empty files because encrypted + // sst files contain some metadata so their sizes will never be 0. 
self.mgr.rename_tmp_cf_file_for_send(cf_file)?; } else { for tmp_file_path in cf_file.tmp_file_paths() { @@ -926,17 +926,15 @@ impl Snapshot { ); } - stat.kv_count = self.cf_files.iter().map(|cf| cf.kv_count as usize).sum(); // save snapshot meta to meta file - let snapshot_meta = gen_snapshot_meta(&self.cf_files[..])?; - self.meta_file.meta = snapshot_meta; + self.meta_file.meta = Some(gen_snapshot_meta(&self.cf_files[..], for_balance)?); self.save_meta_file()?; Ok(()) } fn delete(&self) { macro_rules! try_delete_snapshot_files { - ($cf_file: ident, $file_name_func: ident) => { + ($cf_file:ident, $file_name_func:ident) => { let mut file_id = 0; loop { let file_path = $cf_file.path.join($cf_file.$file_name_func(file_id)); @@ -948,7 +946,7 @@ impl Snapshot { } } }; - ($cf_file: ident) => { + ($cf_file:ident) => { let mut file_id = 0; loop { let file_path = $cf_file.path.join($cf_file.gen_file_name(file_id)); @@ -972,13 +970,14 @@ impl Snapshot { for cf_file in &self.cf_files { // Delete cloned files. 
let clone_file_paths = cf_file.clone_file_paths(); - // in case the meta file is corrupted or deleted, delete snapshot files with best effort + // in case the meta file is corrupted or deleted, delete snapshot files with + // best effort if clone_file_paths.is_empty() { try_delete_snapshot_files!(cf_file, gen_clone_file_name); } else { // delete snapshot files according to meta file for clone_file_path in clone_file_paths { - delete_file_if_exist(&clone_file_path).unwrap(); + delete_file_if_exist(clone_file_path).unwrap(); } } @@ -989,7 +988,7 @@ impl Snapshot { try_delete_snapshot_files!(cf_file, gen_tmp_file_name); } else { for tmp_file_path in tmp_file_paths { - delete_file_if_exist(&tmp_file_path).unwrap(); + delete_file_if_exist(tmp_file_path).unwrap(); } } } @@ -1000,7 +999,7 @@ impl Snapshot { try_delete_snapshot_files!(cf_file); } else { for file_path in &file_paths { - delete_file_if_exist(&file_path).unwrap(); + delete_file_if_exist(file_path).unwrap(); } if let Some(ref mgr) = self.mgr.encryption_key_manager { for file_path in &file_paths { @@ -1009,11 +1008,42 @@ impl Snapshot { } } } + if let Some(ref meta) = self.meta_file.meta { + if !meta.tablet_snap_path.is_empty() { + delete_dir_if_exist(&meta.tablet_snap_path).unwrap(); + } + } delete_file_if_exist(&self.meta_file.path).unwrap(); if self.hold_tmp_files { delete_file_if_exist(&self.meta_file.tmp_path).unwrap(); } } + + // This is only used for v2 compatibility. 
+ fn new_for_tablet_snapshot>( + dir: T, + key: &SnapKey, + mgr: &SnapManagerCore, + tablet_snapshot_path: &str, + for_balance: bool, + ) -> RaftStoreResult { + let mut s = Self::new(dir, key, false, CheckPolicy::ErrNotAllowed, mgr)?; + s.init_for_building()?; + let mut meta = gen_snapshot_meta(&s.cf_files[..], for_balance)?; + meta.tablet_snap_path = tablet_snapshot_path.to_string(); + s.meta_file.meta = Some(meta); + s.save_meta_file()?; + Ok(s) + } + + #[cfg(any(test, feature = "testexport"))] + pub fn tablet_snap_path(&self) -> Option { + Some(self.meta_file.meta.as_ref()?.tablet_snap_path.clone()) + } + + pub fn snapshot_meta(&self) -> &Option { + &self.meta_file.meta + } } impl fmt::Debug for Snapshot { @@ -1031,31 +1061,45 @@ impl Snapshot { engine: &EK, kv_snap: &EK::Snapshot, region: &Region, - snap_data: &mut RaftSnapshotData, - stat: &mut SnapshotStatistics, allow_multi_files_snapshot: bool, - ) -> RaftStoreResult<()> { + for_balance: bool, + start: UnixSecs, + ) -> RaftStoreResult { + let mut snap_data = RaftSnapshotData::default(); + snap_data.set_region(region.clone()); + let t = Instant::now(); - self.do_build::(engine, kv_snap, region, stat, allow_multi_files_snapshot)?; + self.do_build::( + engine, + kv_snap, + region, + allow_multi_files_snapshot, + for_balance, + )?; - let total_size = self.total_size()?; - stat.size = total_size; + let total_size = self.total_size(); + let total_count = self.total_count(); // set snapshot meta data snap_data.set_file_size(total_size); snap_data.set_version(SNAPSHOT_VERSION); - snap_data.set_meta(self.meta_file.meta.clone()); - - SNAPSHOT_BUILD_TIME_HISTOGRAM.observe(duration_to_sec(t.saturating_elapsed()) as f64); + let meta = self.meta_file.meta.as_mut().unwrap(); + meta.set_start(start.into_inner()); + meta.set_generate_duration_sec(t.saturating_elapsed().as_secs()); + snap_data.set_meta(meta.clone()); + + SNAPSHOT_BUILD_TIME_HISTOGRAM.observe(duration_to_sec(t.saturating_elapsed())); + 
SNAPSHOT_KV_COUNT_HISTOGRAM.observe(total_count as f64); + SNAPSHOT_SIZE_HISTOGRAM.observe(total_size as f64); info!( "scan snapshot"; "region_id" => region.get_id(), "snapshot" => self.path(), - "key_count" => stat.kv_count, + "key_count" => total_count, "size" => total_size, "takes" => ?t.saturating_elapsed(), ); - Ok(()) + Ok(snap_data) } pub fn apply(&mut self, options: ApplyOptions) -> Result<()> { @@ -1111,7 +1155,7 @@ impl Snapshot { || (cf_file .file_paths() .iter() - .all(|file_path| file_exists(&Path::new(file_path)))) + .all(|file_path| file_exists(Path::new(file_path)))) }) && file_exists(&self.meta_file.path) } @@ -1119,11 +1163,19 @@ impl Snapshot { file_system::metadata(&self.meta_file.path) } - pub fn total_size(&self) -> io::Result { - Ok(self - .cf_files + pub fn meta_path(&self) -> &PathBuf { + &self.meta_file.path + } + + pub fn total_size(&self) -> u64 { + self.cf_files .iter() - .fold(0, |acc, x| acc + x.size.iter().sum::())) + .map(|cf| cf.size.iter().sum::()) + .sum() + } + + pub fn total_count(&self) -> u64 { + self.cf_files.iter().map(|cf| cf.kv_count).sum() } pub fn save(&mut self) -> io::Result<()> { @@ -1176,13 +1228,13 @@ impl Snapshot { let tmp_paths = cf_file.tmp_file_paths(); let paths = cf_file.file_paths(); for (i, tmp_path) in tmp_paths.iter().enumerate() { - file_system::rename(&tmp_path, &paths[i])?; + file_system::rename(tmp_path, &paths[i])?; } } sync_dir(&self.dir_path)?; // write meta file - let v = self.meta_file.meta.write_to_bytes()?; + let v = self.meta_file.meta.as_ref().unwrap().write_to_bytes()?; { let mut meta_file = self.meta_file.file.take().unwrap(); meta_file.write_all(&v[..])?; @@ -1193,6 +1245,10 @@ impl Snapshot { self.hold_tmp_files = false; Ok(()) } + + pub fn cf_files(&self) -> &[CfFile] { + &self.cf_files + } } // To check whether a procedure about apply snapshot aborts or not. 
@@ -1341,6 +1397,7 @@ pub enum SnapEntry { pub struct SnapStats { pub sending_count: usize, pub receiving_count: usize, + pub stats: Vec, } #[derive(Clone)] @@ -1354,12 +1411,16 @@ struct SnapManagerCore { encryption_key_manager: Option>, max_per_file_size: Arc, enable_multi_snapshot_files: Arc, + stats: Arc>>, } /// `SnapManagerCore` trace all current processing snapshots. pub struct SnapManager { core: SnapManagerCore, max_total_size: Arc, + + // only used to receive snapshot from v2 + tablet_snap_manager: Option, } impl Clone for SnapManager { @@ -1367,6 +1428,7 @@ impl Clone for SnapManager { SnapManager { core: self.core.clone(), max_total_size: self.max_total_size.clone(), + tablet_snap_manager: self.tablet_snap_manager.clone(), } } } @@ -1406,11 +1468,12 @@ impl SnapManager { } } } + Ok(()) } - // [PerformanceCriticalPath]?? I/O involved API should be called in background thread - // Return all snapshots which is idle not being used. + // [PerformanceCriticalPath]?? I/O involved API should be called in background + // thread Return all snapshots which is idle not being used. pub fn list_idle_snap(&self) -> io::Result> { // Use a lock to protect the directory when scanning. let registry = self.core.registry.rl(); @@ -1476,7 +1539,7 @@ impl SnapManager { "{}_{}{}{}", DEL_RANGE_PREFIX, sst_id, SST_FILE_SUFFIX, TMP_FILE_SUFFIX ); - let path = PathBuf::from(&self.core.base).join(&filename); + let path = PathBuf::from(&self.core.base).join(filename); path.to_str().unwrap().to_string() } @@ -1489,7 +1552,8 @@ impl SnapManager { /// because only one caller can lock temporary disk files. /// /// NOTE: it calculates snapshot size by scanning the base directory. - /// Don't call it in raftstore thread until the size limitation mechanism gets refactored. + /// Don't call it in raftstore thread until the size limitation mechanism + /// gets refactored. 
pub fn get_snapshot_for_building(&self, key: &SnapKey) -> RaftStoreResult> { let mut old_snaps = None; while self.get_total_snap_size()? > self.max_total_snap_size() { @@ -1559,21 +1623,52 @@ impl SnapManager { Ok(Box::new(s)) } - /// Get a `Snapshot` can be used for writting and then `save`. Concurrent calls - /// are allowed because only one caller can lock temporary disk files. + /// Get a `Snapshot` can be used for writing and then `save`. Concurrent + /// calls are allowed because only one caller can lock temporary disk + /// files. pub fn get_snapshot_for_receiving( &self, key: &SnapKey, - data: &[u8], + snapshot_meta: SnapshotMeta, ) -> RaftStoreResult> { let _lock = self.core.registry.rl(); - let mut snapshot_data = RaftSnapshotData::default(); - snapshot_data.merge_from_bytes(data)?; let base = &self.core.base; - let f = Snapshot::new_for_receiving(base, key, &self.core, snapshot_data.take_meta())?; + let f = Snapshot::new_for_receiving(base, key, &self.core, snapshot_meta)?; Ok(Box::new(f)) } + // Tablet snapshot is the snapshot sent from raftstore-v2. + // We enable v1 to receive it to enable tiflash node to receive and apply + // snapshot from raftstore-v2. + // To make it easy, we maintain an empty `store::snapshot` with tablet snapshot + // path storing in it. So tiflash node can detect it and apply properly. 
+ pub fn gen_empty_snapshot_for_tablet_snapshot( + &self, + tablet_snap_key: &TabletSnapKey, + for_balance: bool, + ) -> RaftStoreResult<()> { + let _lock = self.core.registry.rl(); + let base = &self.core.base; + let tablet_snap_path = self + .tablet_snap_manager + .as_ref() + .unwrap() + .final_recv_path(tablet_snap_key); + let snap_key = SnapKey::new( + tablet_snap_key.region_id, + tablet_snap_key.term, + tablet_snap_key.idx, + ); + let _ = Snapshot::new_for_tablet_snapshot( + base, + &snap_key, + &self.core, + tablet_snap_path.to_str().unwrap(), + for_balance, + )?; + Ok(()) + } + pub fn get_snapshot_for_applying(&self, key: &SnapKey) -> RaftStoreResult> { let _lock = self.core.registry.rl(); let base = &self.core.base; @@ -1593,7 +1688,13 @@ impl SnapManager { /// /// NOTE: don't call it in raftstore thread. pub fn get_total_snap_size(&self) -> Result { - self.core.get_total_snap_size() + let size_v1 = self.core.get_total_snap_size()?; + let size_v2 = self + .tablet_snap_manager + .as_ref() + .map(|s| s.total_snap_size().unwrap_or(0)) + .unwrap_or(0); + Ok(size_v1 + size_v2) } pub fn max_total_snap_size(&self) -> u64 { @@ -1635,6 +1736,18 @@ impl SnapManager { self.core.limiter.speed_limit() } + pub fn collect_stat(&self, snap: SnapshotStat) { + debug!( + "collect snapshot stat"; + "region_id" => snap.region_id, + "total_size" => snap.get_transport_size(), + "total_duration_sec" => snap.get_total_duration_sec(), + "generate_duration_sec" => snap.get_generate_duration_sec(), + "send_duration_sec" => snap.get_generate_duration_sec(), + ); + self.core.stats.lock().unwrap().push(snap); + } + pub fn register(&self, key: SnapKey, entry: SnapEntry) { debug!( "register snapshot"; @@ -1705,15 +1818,25 @@ impl SnapManager { } } + let stats = std::mem::take(self.core.stats.lock().unwrap().as_mut()); SnapStats { sending_count: sending_cnt, receiving_count: receiving_cnt, + stats, } } pub fn delete_snapshot(&self, key: &SnapKey, snap: &Snapshot, check_entry: bool) -> bool 
{ self.core.delete_snapshot(key, snap, check_entry) } + + pub fn tablet_snap_manager(&self) -> Option<&TabletSnapManager> { + self.tablet_snap_manager.as_ref() + } + + pub fn limiter(&self) -> &Limiter { + &self.core.limiter + } } impl SnapManagerCore { @@ -1769,8 +1892,6 @@ impl SnapManagerCore { let tmp_file_paths = cf_file.tmp_file_paths(); let file_paths = cf_file.file_paths(); for (i, tmp_file_path) in tmp_file_paths.iter().enumerate() { - file_system::rename(&tmp_file_path, &file_paths[i])?; - let mgr = self.encryption_key_manager.as_ref(); if let Some(mgr) = &mgr { let src = &tmp_file_path; @@ -1784,7 +1905,15 @@ impl SnapManagerCore { } return Err(e.into()); } - mgr.delete_file(src)?; + let r = file_system::rename(src, dst); + let del_file = if r.is_ok() { src } else { dst }; + if let Err(e) = mgr.delete_file(del_file) { + warn!("fail to remove encryption metadata during 'rename_tmp_cf_file_for_send'"; + "err" => ?e); + } + r?; + } else { + file_system::rename(tmp_file_path, &file_paths[i])?; } let file = Path::new(&file_paths[i]); let (checksum, size) = calc_checksum_and_size(file, mgr)?; @@ -1811,6 +1940,7 @@ pub struct SnapManagerBuilder { max_total_size: u64, max_per_file_size: u64, enable_multi_snapshot_files: bool, + enable_receive_tablet_snapshot: bool, key_manager: Option>, } @@ -1833,6 +1963,10 @@ impl SnapManagerBuilder { self.enable_multi_snapshot_files = enabled; self } + pub fn enable_receive_tablet_snapshot(mut self, enabled: bool) -> SnapManagerBuilder { + self.enable_receive_tablet_snapshot = enabled; + self + } #[must_use] pub fn encryption_key_manager(mut self, m: Option>) -> SnapManagerBuilder { self.key_manager = m; @@ -1849,9 +1983,19 @@ impl SnapManagerBuilder { } else { u64::MAX }; + let path = path.into(); + assert!(!path.is_empty()); + let mut path_v2 = path.clone(); + path_v2.push_str("_v2"); + let tablet_snap_manager = if self.enable_receive_tablet_snapshot { + Some(TabletSnapManager::new(&path_v2, 
self.key_manager.clone()).unwrap()) + } else { + None + }; + let mut snapshot = SnapManager { core: SnapManagerCore { - base: path.into(), + base: path, registry: Default::default(), limiter, temp_sst_id: Arc::new(AtomicU64::new(0)), @@ -1860,18 +2004,242 @@ impl SnapManagerBuilder { enable_multi_snapshot_files: Arc::new(AtomicBool::new( self.enable_multi_snapshot_files, )), + stats: Default::default(), }, max_total_size: Arc::new(AtomicU64::new(max_total_size)), + tablet_snap_manager, }; snapshot.set_max_per_file_size(self.max_per_file_size); // set actual max_per_file_size snapshot } } +#[derive(Clone, Hash, PartialEq, Eq, PartialOrd, Ord, Debug)] +pub struct TabletSnapKey { + pub region_id: u64, + pub to_peer: u64, + pub term: u64, + pub idx: u64, +} + +impl TabletSnapKey { + #[inline] + pub fn new(region_id: u64, to_peer: u64, term: u64, idx: u64) -> TabletSnapKey { + TabletSnapKey { + region_id, + to_peer, + term, + idx, + } + } + + pub fn from_region_snap(region_id: u64, to_peer: u64, snap: &RaftSnapshot) -> TabletSnapKey { + let index = snap.get_metadata().get_index(); + let term = snap.get_metadata().get_term(); + TabletSnapKey::new(region_id, to_peer, term, index) + } +} + +impl Display for TabletSnapKey { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!( + f, + "{}_{}_{}_{}", + self.region_id, self.to_peer, self.term, self.idx + ) + } +} + +pub struct ReceivingGuard<'a> { + receiving: &'a Mutex>, + key: TabletSnapKey, +} + +impl Drop for ReceivingGuard<'_> { + fn drop(&mut self) { + let mut receiving = self.receiving.lock().unwrap(); + let pos = receiving.iter().position(|k| k == &self.key).unwrap(); + receiving.swap_remove(pos); + } +} + +/// `TabletSnapManager` manager tablet snapshot and shared between raftstore v2. +/// It's similar `SnapManager`, but simpler in tablet version. +/// +/// TODO: +/// - clean up expired tablet checkpointer +#[derive(Clone)] +pub struct TabletSnapManager { + // directory to store snapfile. 
+ base: PathBuf, + key_manager: Option>, + receiving: Arc>>, + stats: Arc>>, + sending_count: Arc, + recving_count: Arc, +} + +impl TabletSnapManager { + pub fn new>( + path: T, + key_manager: Option>, + ) -> io::Result { + let path = path.into(); + if !path.exists() { + file_system::create_dir_all(&path)?; + } + if !path.is_dir() { + return Err(io::Error::new( + ErrorKind::Other, + format!("{} should be a directory", path.display()), + )); + } + encryption::clean_up_dir(&path, SNAP_GEN_PREFIX, key_manager.as_deref())?; + encryption::clean_up_trash(&path, key_manager.as_deref())?; + Ok(Self { + base: path, + key_manager, + receiving: Arc::default(), + stats: Arc::default(), + sending_count: Arc::default(), + recving_count: Arc::default(), + }) + } + + pub fn begin_snapshot(&self, key: TabletSnapKey, start: Instant, generate_duration_sec: u64) { + let mut stat = SnapshotStat::default(); + stat.set_generate_duration_sec(generate_duration_sec); + self.stats.lock().unwrap().insert(key, (start, stat)); + } + + pub fn finish_snapshot(&self, key: TabletSnapKey, send: Instant) { + let region_id = key.region_id; + self.stats + .lock() + .unwrap() + .entry(key) + .and_modify(|(start, stat)| { + stat.set_send_duration_sec(send.saturating_elapsed().as_secs()); + stat.set_total_duration_sec(start.saturating_elapsed().as_secs()); + stat.set_region_id(region_id); + }); + } + + pub fn stats(&self) -> SnapStats { + let stats: Vec = self + .stats + .lock() + .unwrap() + .drain_filter(|_, (_, stat)| stat.get_region_id() > 0) + .map(|(_, (_, stat))| stat) + .filter(|stat| stat.get_total_duration_sec() > 1) + .collect(); + SnapStats { + sending_count: self.sending_count.load(Ordering::SeqCst), + receiving_count: self.recving_count.load(Ordering::SeqCst), + stats, + } + } + + pub fn tablet_gen_path(&self, key: &TabletSnapKey) -> PathBuf { + let prefix = format!("{}_{}", SNAP_GEN_PREFIX, key); + PathBuf::from(&self.base).join(prefix) + } + + pub fn final_recv_path(&self, key: 
&TabletSnapKey) -> PathBuf { + let prefix = format!("{}_{}", SNAP_REV_PREFIX, key); + PathBuf::from(&self.base).join(prefix) + } + + pub fn tmp_recv_path(&self, key: &TabletSnapKey) -> PathBuf { + let prefix = format!("{}_{}{}", SNAP_REV_PREFIX, key, TMP_FILE_SUFFIX); + PathBuf::from(&self.base).join(prefix) + } + + pub fn delete_snapshot(&self, key: &TabletSnapKey) -> bool { + let path = self.tablet_gen_path(key); + if path.exists() { + if let Err(e) = encryption::trash_dir_all(&path, self.key_manager.as_deref()) { + error!( + "delete snapshot failed"; + "path" => %path.display(), + "err" => ?e, + ); + return false; + } + } + true + } + + pub fn total_snap_size(&self) -> Result { + let mut total_size = 0; + for entry in file_system::read_dir(&self.base)? { + let entry = match entry { + Ok(e) => e, + Err(e) if e.kind() == ErrorKind::NotFound => continue, + Err(e) => return Err(Error::from(e)), + }; + + let path = entry.path(); + // Generated snapshots are just checkpoints, only counts received snapshots. 
+ if !path + .file_name() + .and_then(|n| n.to_str()) + .map_or(true, |n| n.starts_with(SNAP_REV_PREFIX)) + { + continue; + } + let entries = match file_system::read_dir(path) { + Ok(entries) => entries, + Err(e) if e.kind() == ErrorKind::NotFound => continue, + Err(e) => return Err(Error::from(e)), + }; + for e in entries { + match e.and_then(|e| e.metadata()) { + Ok(m) => total_size += m.len(), + Err(e) if e.kind() == ErrorKind::NotFound => continue, + Err(e) => return Err(Error::from(e)), + } + } + } + Ok(total_size) + } + + #[inline] + pub fn root_path(&self) -> &Path { + self.base.as_path() + } + + pub fn start_receive(&self, key: TabletSnapKey) -> Option> { + let mut receiving = self.receiving.lock().unwrap(); + if receiving.iter().any(|k| k == &key) { + return None; + } + receiving.push(key.clone()); + Some(ReceivingGuard { + receiving: &self.receiving, + key, + }) + } + + pub fn sending_count(&self) -> &Arc { + &self.sending_count + } + + pub fn recving_count(&self) -> &Arc { + &self.recving_count + } + + #[inline] + pub fn key_manager(&self) -> &Option> { + &self.key_manager + } +} + #[cfg(test)] pub mod tests { use std::{ - cmp, + cmp, fs, io::{self, Read, Seek, SeekFrom, Write}, path::{Path, PathBuf}, sync::{ @@ -1883,18 +2251,19 @@ pub mod tests { use encryption::{DataKeyManager, EncryptionConfig, FileConfig, MasterKeyConfig}; use encryption_export::data_key_manager_from_config; use engine_test::{ - ctor::{CFOptions, ColumnFamilyOptions, DBOptions, KvEngineConstructorExt, RaftDBOptions}, + ctor::{CfOptions, DbOptions, KvEngineConstructorExt, RaftDbOptions}, kv::KvTestEngine, raft::RaftTestEngine, }; use engine_traits::{ - Engines, ExternalSstFileInfo, KvEngine, RaftEngine, Snapshot as EngineSnapshot, SstExt, - SstWriter, SstWriterBuilder, SyncMutable, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, + Engines, ExternalSstFileInfo, KvEngine, RaftEngine, RaftLogBatch, + Snapshot as EngineSnapshot, SstExt, SstWriter, SstWriterBuilder, SyncMutable, ALL_CFS, 
+ CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use kvproto::{ encryptionpb::EncryptionMethod, metapb::{Peer, Region}, - raft_serverpb::{RaftApplyState, RaftSnapshotData, RegionLocalState, SnapshotMeta}, + raft_serverpb::{RaftApplyState, RegionLocalState, SnapshotMeta}, }; use protobuf::Message; use raft::eraftpb::Entry; @@ -1917,32 +2286,41 @@ pub mod tests { const TEST_META_FILE_BUFFER_SIZE: usize = 1000; const BYTE_SIZE: usize = 1; - type DBBuilder = - fn(p: &Path, db_opt: Option, cf_opts: Option>>) -> Result; + type DbBuilder = fn( + p: &Path, + db_opt: Option, + cf_opts: Option>, + ) -> Result; pub fn open_test_empty_db( path: &Path, - db_opt: Option, - cf_opts: Option>>, + db_opt: Option, + cf_opts: Option>, ) -> Result where E: KvEngine + KvEngineConstructorExt, { let p = path.to_str().unwrap(); - let db = E::new_kv_engine(p, db_opt, ALL_CFS, cf_opts).unwrap(); + let db_opt = db_opt.unwrap_or_default(); + let cf_opts = cf_opts.unwrap_or_else(|| { + ALL_CFS + .iter() + .map(|cf| (*cf, CfOptions::default())) + .collect() + }); + let db = E::new_kv_engine_opt(p, db_opt, cf_opts).unwrap(); Ok(db) } pub fn open_test_db( path: &Path, - db_opt: Option, - cf_opts: Option>>, + db_opt: Option, + cf_opts: Option>, ) -> Result where E: KvEngine + KvEngineConstructorExt, { - let p = path.to_str().unwrap(); - let db = E::new_kv_engine(p, db_opt, ALL_CFS, cf_opts).unwrap(); + let db = open_test_empty_db::(path, db_opt, cf_opts).unwrap(); let key = keys::data_key(TEST_KEY); // write some data into each cf for (i, cf) in db.cf_names().into_iter().enumerate() { @@ -1956,14 +2334,13 @@ pub mod tests { pub fn open_test_db_with_100keys( path: &Path, - db_opt: Option, - cf_opts: Option>>, + db_opt: Option, + cf_opts: Option>, ) -> Result where E: KvEngine + KvEngineConstructorExt, { - let p = path.to_str().unwrap(); - let db = E::new_kv_engine(p, db_opt, ALL_CFS, cf_opts).unwrap(); + let db = open_test_empty_db::(path, db_opt, cf_opts).unwrap(); // write some data into each cf for 
(i, cf) in db.cf_names().into_iter().enumerate() { let mut p = Peer::default(); @@ -1979,15 +2356,16 @@ pub mod tests { pub fn get_test_db_for_regions( path: &TempDir, - raft_db_opt: Option, - kv_db_opt: Option, - kv_cf_opts: Option>>, + raft_db_opt: Option, + kv_db_opt: Option, + kv_cf_opts: Option>, regions: &[u64], ) -> Result> { let p = path.path(); let kv: KvTestEngine = open_test_db(p.join("kv").as_path(), kv_db_opt, kv_cf_opts)?; let raft: RaftTestEngine = engine_test::raft::new_engine(p.join("raft").to_str().unwrap(), raft_db_opt)?; + let mut lb = raft.log_batch(regions.len() * 128); for ®ion_id in regions { // Put apply state into kv engine. let mut apply_state = RaftApplyState::default(); @@ -1997,7 +2375,7 @@ pub mod tests { apply_entry.set_term(0); apply_state.mut_truncated_state().set_index(10); kv.put_msg_cf(CF_RAFT, &keys::apply_state_key(region_id), &apply_state)?; - raft.append(region_id, vec![apply_entry])?; + lb.append(region_id, None, vec![apply_entry])?; // Put region info into kv engine. 
let region = gen_test_region(region_id, 1, 1); @@ -2005,13 +2383,14 @@ pub mod tests { region_state.set_region(region); kv.put_msg_cf(CF_RAFT, &keys::region_state_key(region_id), ®ion_state)?; } - Ok(Engines { kv, raft }) + raft.consume(&mut lb, false).unwrap(); + Ok(Engines::new(kv, raft)) } - pub fn get_kv_count(snap: &impl EngineSnapshot) -> usize { + pub fn get_kv_count(snap: &impl EngineSnapshot) -> u64 { let mut kv_count = 0; for cf in SNAPSHOT_CFS { - snap.scan_cf( + snap.scan( cf, &keys::data_key(b"a"), &keys::data_key(b"z"), @@ -2069,6 +2448,7 @@ pub mod tests { encryption_key_manager: None, max_per_file_size: Arc::new(AtomicU64::new(max_per_file_size)), enable_multi_snapshot_files: Arc::new(AtomicBool::new(true)), + stats: Default::default(), } } @@ -2104,9 +2484,9 @@ pub mod tests { (dir, key_manager.unwrap()) } - pub fn gen_db_options_with_encryption(prefix: &str) -> (TempDir, DBOptions) { + pub fn gen_db_options_with_encryption(prefix: &str) -> (TempDir, DbOptions) { let (_enc_dir, key_manager) = create_encryption_key_manager(prefix); - let mut db_opts = DBOptions::default(); + let mut db_opts = DbOptions::default(); db_opts.set_key_manager(Some(key_manager)); (_enc_dir, db_opts) } @@ -2123,7 +2503,7 @@ pub mod tests { }; cf_file.push(f); } - let meta = super::gen_snapshot_meta(&cf_file).unwrap(); + let meta = super::gen_snapshot_meta(&cf_file, false).unwrap(); let cf_files = meta.get_cf_files(); assert_eq!(cf_files.len(), super::SNAPSHOT_CFS.len() * 2); // each CF has two snapshot files; for (i, cf_file_meta) in meta.get_cf_files().iter().enumerate() { @@ -2181,7 +2561,7 @@ pub mod tests { test_snap_file(open_test_db_with_100keys, 500); } - fn test_snap_file(get_db: DBBuilder, max_file_size: u64) { + fn test_snap_file(get_db: DbBuilder, max_file_size: u64) { let region_id = 1; let region = gen_test_region(region_id, 1, 1); let src_db_dir = Builder::new() @@ -2205,28 +2585,16 @@ pub mod tests { assert!(!s1.exists()); 
assert_eq!(mgr_core.get_total_snap_size().unwrap(), 0); - let mut snap_data = RaftSnapshotData::default(); - snap_data.set_region(region.clone()); - let mut stat = SnapshotStatistics::new(); - Snapshot::build::( - &mut s1, - &db, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let mut snap_data = s1 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); // Ensure that this snapshot file does exist after being built. assert!(s1.exists()); - let total_size = s1.total_size().unwrap(); + let size = s1.total_size(); // Ensure the `size_track` is modified correctly. - let size = mgr_core.get_total_snap_size().unwrap(); - assert_eq!(size, total_size); - assert_eq!(stat.size as u64, size); - assert_eq!(stat.kv_count, get_kv_count(&snapshot)); + assert_eq!(size, mgr_core.get_total_snap_size().unwrap()); + assert_eq!(s1.total_count(), get_kv_count(&snapshot)); // Ensure this snapshot could be read for sending. let mut s2 = Snapshot::new_for_sending(src_dir.path(), &key, &mgr_core).unwrap(); @@ -2267,7 +2635,7 @@ pub mod tests { let dst_db_path = dst_db_dir.path().to_str().unwrap(); // Change arbitrarily the cf order of ALL_CFS at destination db. 
let dst_cfs = [CF_WRITE, CF_DEFAULT, CF_LOCK, CF_RAFT]; - let dst_db = engine_test::kv::new_engine(dst_db_path, None, &dst_cfs, None).unwrap(); + let dst_db = engine_test::kv::new_engine(dst_db_path, &dst_cfs).unwrap(); let options = ApplyOptions { db: dst_db.clone(), region, @@ -2300,7 +2668,7 @@ pub mod tests { test_snap_validation(open_test_db_with_100keys, 500); } - fn test_snap_validation(get_db: DBBuilder, max_file_size: u64) { + fn test_snap_validation(get_db: DbBuilder, max_file_size: u64) { let region_id = 1; let region = gen_test_region(region_id, 1, 1); let db_dir = Builder::new() @@ -2319,34 +2687,17 @@ pub mod tests { let mut s1 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s1.exists()); - let mut snap_data = RaftSnapshotData::default(); - snap_data.set_region(region.clone()); - let mut stat = SnapshotStatistics::new(); - Snapshot::build::( - &mut s1, - &db, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let _ = s1 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); assert!(s1.exists()); let mut s2 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(s2.exists()); - Snapshot::build::( - &mut s2, - &db, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let _ = s2 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); assert!(s2.exists()); } @@ -2370,7 +2721,8 @@ pub mod tests { } } - // Make all the snapshot in the specified dir corrupted to have incorrect checksum. + // Make all the snapshot in the specified dir corrupted to have incorrect + // checksum. fn corrupt_snapshot_checksum_in>(dir: T) -> Vec { let dir_path = dir.into(); let mut res = Vec::new(); @@ -2415,7 +2767,8 @@ pub mod tests { res } - // Make all the snapshot meta files in the specified corrupted to have incorrect content. + // Make all the snapshot meta files in the specified corrupted to have incorrect + // content. 
fn corrupt_snapshot_meta_file>(dir: T) -> usize { let mut total = 0; let dir_path = dir.into(); @@ -2487,37 +2840,20 @@ pub mod tests { let mut s1 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s1.exists()); - let mut snap_data = RaftSnapshotData::default(); - snap_data.set_region(region.clone()); - let mut stat = SnapshotStatistics::new(); - Snapshot::build::( - &mut s1, - &db, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let _ = s1 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); assert!(s1.exists()); corrupt_snapshot_size_in(dir.path()); - assert!(Snapshot::new_for_sending(dir.path(), &key, &mgr_core,).is_err()); + Snapshot::new_for_sending(dir.path(), &key, &mgr_core).unwrap_err(); let mut s2 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s2.exists()); - Snapshot::build::( - &mut s2, - &db, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let snap_data = s2 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); assert!(s2.exists()); let dst_dir = Builder::new() @@ -2551,11 +2887,11 @@ pub mod tests { write_batch_size: TEST_WRITE_BATCH_SIZE, coprocessor_host: CoprocessorHost::::default(), }; - assert!(s5.apply(options).is_err()); + s5.apply(options).unwrap_err(); corrupt_snapshot_size_in(dst_dir.path()); - assert!(Snapshot::new_for_receiving(dst_dir.path(), &key, &mgr_core, snap_meta,).is_err()); - assert!(Snapshot::new_for_applying(dst_dir.path(), &key, &mgr_core).is_err()); + Snapshot::new_for_receiving(dst_dir.path(), &key, &mgr_core, snap_meta).unwrap_err(); + Snapshot::new_for_applying(dst_dir.path(), &key, &mgr_core).unwrap_err(); } #[test] @@ -2578,37 +2914,20 @@ pub mod tests { let mut s1 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s1.exists()); - let mut snap_data = RaftSnapshotData::default(); - snap_data.set_region(region.clone()); - let mut stat = 
SnapshotStatistics::new(); - Snapshot::build::( - &mut s1, - &db, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let _ = s1 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); assert!(s1.exists()); assert_eq!(1, corrupt_snapshot_meta_file(dir.path())); - assert!(Snapshot::new_for_sending(dir.path(), &key, &mgr_core,).is_err()); + Snapshot::new_for_sending(dir.path(), &key, &mgr_core).unwrap_err(); let mut s2 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s2.exists()); - Snapshot::build::( - &mut s2, - &db, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let mut snap_data = s2 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); assert!(s2.exists()); let dst_dir = Builder::new() @@ -2625,11 +2944,9 @@ pub mod tests { assert_eq!(1, corrupt_snapshot_meta_file(dst_dir.path())); - assert!(Snapshot::new_for_applying(dst_dir.path(), &key, &mgr_core,).is_err()); - assert!( - Snapshot::new_for_receiving(dst_dir.path(), &key, &mgr_core, snap_data.take_meta(),) - .is_err() - ); + Snapshot::new_for_applying(dst_dir.path(), &key, &mgr_core).unwrap_err(); + Snapshot::new_for_receiving(dst_dir.path(), &key, &mgr_core, snap_data.take_meta()) + .unwrap_err(); } #[test] @@ -2651,7 +2968,7 @@ pub mod tests { let path2 = temp_path2.to_str().unwrap().to_owned(); File::create(temp_path2).unwrap(); mgr = SnapManager::new(path2); - assert!(mgr.init().is_err()); + mgr.init().unwrap_err(); } #[test] @@ -2672,21 +2989,11 @@ pub mod tests { let mgr_core = create_manager_core(&path, u64::MAX); let mut s1 = Snapshot::new_for_building(&path, &key1, &mgr_core).unwrap(); let mut region = gen_test_region(1, 1, 1); - let mut snap_data = RaftSnapshotData::default(); - snap_data.set_region(region.clone()); - let mut stat = SnapshotStatistics::new(); - Snapshot::build::( - &mut s1, - &db, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + 
let mut snap_data = s1 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); let mut s = Snapshot::new_for_sending(&path, &key1, &mgr_core).unwrap(); - let expected_size = s.total_size().unwrap(); + let expected_size = s.total_size(); let mut s2 = Snapshot::new_for_receiving(&path, &key1, &mgr_core, snap_data.get_meta().clone()) .unwrap(); @@ -2756,19 +3063,16 @@ pub mod tests { // Ensure the snapshot being built will not be deleted on GC. src_mgr.register(key.clone(), SnapEntry::Generating); let mut s1 = src_mgr.get_snapshot_for_building(&key).unwrap(); - let mut snap_data = RaftSnapshotData::default(); - snap_data.set_region(region.clone()); - let mut stat = SnapshotStatistics::new(); - s1.build(&db, &snapshot, ®ion, &mut snap_data, &mut stat, true) + let mut snap_data = s1 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) .unwrap(); - let v = snap_data.write_to_bytes().unwrap(); check_registry_around_deregister(&src_mgr, &key, &SnapEntry::Generating); // Ensure the snapshot being sent will not be deleted on GC. src_mgr.register(key.clone(), SnapEntry::Sending); let mut s2 = src_mgr.get_snapshot_for_sending(&key).unwrap(); - let expected_size = s2.total_size().unwrap(); + let expected_size = s2.total_size(); let dst_temp_dir = Builder::new() .prefix("test-snap-deletion-on-registry-dst") @@ -2780,7 +3084,9 @@ pub mod tests { // Ensure the snapshot being received will not be deleted on GC. 
dst_mgr.register(key.clone(), SnapEntry::Receiving); - let mut s3 = dst_mgr.get_snapshot_for_receiving(&key, &v[..]).unwrap(); + let mut s3 = dst_mgr + .get_snapshot_for_receiving(&key, snap_data.take_meta()) + .unwrap(); let n = io::copy(&mut s2, &mut s3).unwrap(); assert_eq!(n, expected_size); s3.save().unwrap(); @@ -2813,10 +3119,10 @@ pub mod tests { let kv_cf_opts = ALL_CFS .iter() .map(|cf| { - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = CfOptions::new(); cf_opts.set_no_range_properties(true); cf_opts.set_no_table_properties(true); - CFOptions::new(cf, cf_opts) + (*cf, cf_opts) }) .collect(); let engine = @@ -2830,24 +3136,22 @@ pub mod tests { let snap_mgr = SnapManagerBuilder::default() .max_total_size(max_total_size) .build::<_>(snapfiles_path.path().to_str().unwrap()); + snap_mgr.init().unwrap(); let snapshot = engine.kv.snapshot(); // Add an oldest snapshot for receiving. let recv_key = SnapKey::new(100, 100, 100); - let recv_head = { - let mut stat = SnapshotStatistics::new(); - let mut snap_data = RaftSnapshotData::default(); + let mut recv_head = { let mut s = snap_mgr.get_snapshot_for_building(&recv_key).unwrap(); s.build( &engine.kv, &snapshot, &gen_test_region(100, 1, 1), - &mut snap_data, - &mut stat, true, + false, + UnixSecs::now(), ) - .unwrap(); - snap_data.write_to_bytes().unwrap() + .unwrap() }; let recv_remain = { let mut data = Vec::with_capacity(1024); @@ -2857,30 +3161,21 @@ pub mod tests { data }; let mut s = snap_mgr - .get_snapshot_for_receiving(&recv_key, &recv_head) + .get_snapshot_for_receiving(&recv_key, recv_head.take_meta()) .unwrap(); s.write_all(&recv_remain).unwrap(); s.save().unwrap(); + let snap_size = snap_mgr.get_total_snap_size().unwrap(); + let max_snap_count = (max_total_size + snap_size - 1) / snap_size; for (i, region_id) in regions.into_iter().enumerate() { let key = SnapKey::new(region_id, 1, 1); let region = gen_test_region(region_id, 1, 1); let mut s = 
snap_mgr.get_snapshot_for_building(&key).unwrap(); - let mut snap_data = RaftSnapshotData::default(); - let mut stat = SnapshotStatistics::new(); - s.build( - &engine.kv, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let _ = s + .build(&engine.kv, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); - // TODO: this size may change in different RocksDB version. - let snap_size = 1660; - let max_snap_count = (max_total_size + snap_size - 1) / snap_size; // The first snap_size is for region 100. // That snapshot won't be deleted because it's not for generating. assert_eq!( @@ -2920,6 +3215,33 @@ pub mod tests { assert!(!file_system::file_exists(&sst_path)); } + #[test] + fn test_snapshot_stats() { + let snap_dir = Builder::new() + .prefix("test_snapshot_stats") + .tempdir() + .unwrap(); + let start = Instant::now(); + let mgr = TabletSnapManager::new(snap_dir.path(), None).unwrap(); + let key = TabletSnapKey::new(1, 1, 1, 1); + mgr.begin_snapshot(key.clone(), start - time::Duration::from_secs(2), 1); + // filter out the snapshot that is not finished + assert!(mgr.stats().stats.is_empty()); + mgr.finish_snapshot(key.clone(), start - time::Duration::from_secs(1)); + let stats = mgr.stats().stats; + assert_eq!(stats.len(), 1); + assert_eq!(stats[0].get_total_duration_sec(), 2); + assert!(mgr.stats().stats.is_empty()); + + // filter out the total duration seconds less than one sencond. + let path = mgr.tablet_gen_path(&key); + std::fs::create_dir_all(&path).unwrap(); + assert!(path.exists()); + mgr.delete_snapshot(&key); + assert_eq!(mgr.stats().stats.len(), 0); + assert!(!path.exists()); + } + #[test] fn test_build_with_encryption() { let (_enc_dir, key_manager) = @@ -2944,15 +3266,78 @@ pub mod tests { let key = SnapKey::new(1, 1, 1); let region = gen_test_region(1, 1, 1); - // Test one snapshot can be built multi times. DataKeyManager should be handled correctly. + // Test one snapshot can be built multi times. 
DataKeyManager should be handled + // correctly. for _ in 0..2 { let mut s1 = snap_mgr.get_snapshot_for_building(&key).unwrap(); - let mut snap_data = RaftSnapshotData::default(); - snap_data.set_region(region.clone()); - let mut stat = SnapshotStatistics::new(); - s1.build(&db, &snapshot, ®ion, &mut snap_data, &mut stat, true) + let _ = s1 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) .unwrap(); assert!(snap_mgr.delete_snapshot(&key, &s1, false)); } } + + #[test] + fn test_generate_snap_for_tablet_snapshot() { + let snap_dir = Builder::new().prefix("test_snapshot").tempdir().unwrap(); + let snap_mgr = SnapManagerBuilder::default() + .enable_receive_tablet_snapshot(true) + .build(snap_dir.path().to_str().unwrap()); + snap_mgr.init().unwrap(); + let tablet_snap_key = TabletSnapKey::new(1, 2, 3, 4); + snap_mgr + .gen_empty_snapshot_for_tablet_snapshot(&tablet_snap_key, false) + .unwrap(); + + let snap_key = SnapKey::new(1, 3, 4); + let s = snap_mgr.get_snapshot_for_applying(&snap_key).unwrap(); + let expect_path = snap_mgr + .tablet_snap_manager() + .as_ref() + .unwrap() + .final_recv_path(&tablet_snap_key); + assert_eq!(expect_path.to_str().unwrap(), s.tablet_snap_path().unwrap()); + } + + #[test] + fn test_init_enable_receive_tablet_snapshot() { + let builder = SnapManagerBuilder::default().enable_receive_tablet_snapshot(true); + let snap_dir = Builder::new() + .prefix("test_snap_path_does_not_exist") + .tempdir() + .unwrap(); + let path = snap_dir.path().join("snap"); + let snap_mgr = builder.build(path.as_path().to_str().unwrap()); + snap_mgr.init().unwrap(); + + assert!(path.exists()); + let mut path = path.as_path().to_str().unwrap().to_string(); + path.push_str("_v2"); + assert!(Path::new(&path).exists()); + + let builder = SnapManagerBuilder::default().enable_receive_tablet_snapshot(true); + let snap_dir = Builder::new() + .prefix("test_snap_path_exist") + .tempdir() + .unwrap(); + let path = snap_dir.path(); + let snap_mgr = 
builder.build(path.to_str().unwrap()); + snap_mgr.init().unwrap(); + + let mut path = path.to_str().unwrap().to_string(); + path.push_str("_v2"); + assert!(Path::new(&path).exists()); + + let builder = SnapManagerBuilder::default().enable_receive_tablet_snapshot(true); + let snap_dir = Builder::new() + .prefix("test_tablet_snap_path_exist") + .tempdir() + .unwrap(); + let path = snap_dir.path().join("snap/v2"); + fs::create_dir_all(path).unwrap(); + let path = snap_dir.path().join("snap"); + let snap_mgr = builder.build(path.to_str().unwrap()); + snap_mgr.init().unwrap(); + assert!(path.exists()); + } } diff --git a/components/raftstore/src/store/snap/io.rs b/components/raftstore/src/store/snap/io.rs index 2baf191d749..3cdee1e40f1 100644 --- a/components/raftstore/src/store/snap/io.rs +++ b/components/raftstore/src/store/snap/io.rs @@ -9,8 +9,7 @@ use std::{ }; use encryption::{ - encryption_method_from_db_encryption_method, DataKeyManager, DecrypterReader, EncrypterWriter, - Iv, + from_engine_encryption_method, DataKeyManager, DecrypterReader, EncrypterWriter, Iv, }; use engine_traits::{ CfName, EncryptionKeyManager, Error as EngineError, Iterable, KvEngine, Mutable, @@ -21,7 +20,7 @@ use tikv_util::{ box_try, codec::bytes::{BytesEncoder, CompactBytesFromFileDecoder}, debug, info, - time::Limiter, + time::{Instant, Limiter}, }; use super::{CfFile, Error, IO_LIMITER_CHUNK_SIZE}; @@ -61,7 +60,7 @@ where if let Some(key_mgr) = key_mgr { let enc_info = box_try!(key_mgr.new_file(path)); - let mthd = encryption_method_from_db_encryption_method(enc_info.method); + let mthd = from_engine_encryption_method(enc_info.method); if mthd != EncryptionMethod::Plaintext { let writer = box_try!(EncrypterWriter::new( file.take().unwrap(), @@ -81,7 +80,7 @@ where }; let mut stats = BuildStatistics::default(); - box_try!(snap.scan_cf(cf, start_key, end_key, false, |key, value| { + box_try!(snap.scan(cf, start_key, end_key, false, |key, value| { stats.key_count += 1; stats.total_size 
+= key.len() + value.len(); box_try!(BytesEncoder::encode_compact_bytes(&mut writer, key)); @@ -133,7 +132,9 @@ where .to_string(); let sst_writer = RefCell::new(create_sst_file_writer::(engine, cf, &path)?); let mut file_length: usize = 0; - box_try!(snap.scan_cf(cf, start_key, end_key, false, |key, value| { + + let instant = Instant::now(); + box_try!(snap.scan(cf, start_key, end_key, false, |key, value| { let entry_len = key.len() + value.len(); if file_length + entry_len > raw_size_per_file as usize { cf_file.add_file(file_id); // add previous file @@ -151,7 +152,7 @@ where Ok(new_sst_writer) => { let old_writer = sst_writer.replace(new_sst_writer); box_try!(old_writer.finish()); - box_try!(File::open(&prev_path).and_then(|f| f.sync_all())); + box_try!(File::open(prev_path).and_then(|f| f.sync_all())); } Err(e) => { let io_error = io::Error::new(io::ErrorKind::Other, e); @@ -159,6 +160,7 @@ where } } } + while entry_len > remained_quota { // It's possible to acquire more than necessary, but let it be. io_limiter.blocking_consume(IO_LIMITER_CHUNK_SIZE); @@ -180,12 +182,13 @@ where box_try!(sst_writer.into_inner().finish()); box_try!(File::open(path).and_then(|f| f.sync_all())); info!( - "build_sst_cf_file_list builds {} files in cf {}. Total keys {}, total size {}. raw_size_per_file {}", + "build_sst_cf_file_list builds {} files in cf {}. Total keys {}, total size {}. raw_size_per_file {}, total takes {:?}", file_id + 1, cf, stats.key_count, stats.total_size, raw_size_per_file, + instant.saturating_elapsed(), ); } else { box_try!(fs::remove_file(path)); @@ -193,8 +196,8 @@ where Ok(stats) } -/// Apply the given snapshot file into a column family. `callback` will be invoked after each batch of -/// key value pairs written to db. +/// Apply the given snapshot file into a column family. `callback` will be +/// invoked after each batch of key value pairs written to db. 
pub fn apply_plain_cf_file( path: &str, key_mgr: Option<&Arc>, @@ -226,7 +229,8 @@ where Ok(()) }; - // Collect keys to a vec rather than wb so that we can invoke the callback less times. + // Collect keys to a vec rather than wb so that we can invoke the callback less + // times. let mut batch = Vec::with_capacity(1024); let mut batch_data_size = 0; @@ -283,7 +287,7 @@ pub fn get_decrypter_reader( encryption_key_manager: &DataKeyManager, ) -> Result, Error> { let enc_info = box_try!(encryption_key_manager.get_file(file)); - let mthd = encryption_method_from_db_encryption_method(enc_info.method); + let mthd = from_engine_encryption_method(enc_info.method); debug!( "get_decrypter_reader gets enc_info for {:?}, method: {:?}", file, mthd @@ -375,7 +379,7 @@ mod tests { // Scan keys from db let mut keys_in_db: HashMap<_, Vec<_>> = HashMap::new(); for cf in SNAPSHOT_CFS { - snap.scan_cf( + snap.scan( cf, &keys::data_key(b"a"), &keys::data_end_key(b"z"), diff --git a/components/raftstore/src/store/transport.rs b/components/raftstore/src/store/transport.rs index 586b80ed6e5..7f10e7cd249 100644 --- a/components/raftstore/src/store/transport.rs +++ b/components/raftstore/src/store/transport.rs @@ -6,8 +6,9 @@ use std::sync::mpsc; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine, Snapshot}; use kvproto::raft_serverpb::RaftMessage; -use tikv_util::error; +use tikv_util::{error, warn}; +use super::{AsyncReadNotifier, FetchedLogs, GenSnapRes}; use crate::{ store::{CasualMessage, PeerMsg, RaftCommand, RaftRouter, SignificantMsg, StoreMsg}, DiscardReason, Error, Result, @@ -90,7 +91,13 @@ where .force_send(region_id, PeerMsg::SignificantMsg(msg)) { // TODO: panic here once we can detect system is shutting down reliably. - error!("failed to send significant msg"; "msg" => ?msg); + + // Avoid printing error log if it's not a severe problem failing to send it. 
+ if msg.is_send_failure_ignorable() { + warn!("failed to send significant msg"; "msg" => ?msg); + } else { + error!("failed to send significant msg"; "msg" => ?msg); + } return Err(Error::RegionNotFound(region_id)); } @@ -165,3 +172,16 @@ where } } } + +impl AsyncReadNotifier for RaftRouter { + #[inline] + fn notify_logs_fetched(&self, region_id: u64, fetched: FetchedLogs) { + // Ignore region not found as it may be removed. + let _ = self.significant_send(region_id, SignificantMsg::RaftlogFetched(fetched)); + } + + #[inline] + fn notify_snapshot_generated(&self, _region_id: u64, _snapshot: GenSnapRes) { + unreachable!() + } +} diff --git a/components/raftstore/src/store/txn_ext.rs b/components/raftstore/src/store/txn_ext.rs index 1d8e7ed1981..ccc4027e9d1 100644 --- a/components/raftstore/src/store/txn_ext.rs +++ b/components/raftstore/src/store/txn_ext.rs @@ -15,13 +15,13 @@ use txn_types::{Key, PessimisticLock}; /// Transaction extensions related to a peer. #[derive(Default)] pub struct TxnExt { - /// The max timestamp recorded in the concurrency manager is only updated at leader. - /// So if a peer becomes leader from a follower, the max timestamp can be outdated. - /// We need to update the max timestamp with a latest timestamp from PD before this - /// peer can work. - /// From the least significant to the most, 1 bit marks whether the timestamp is - /// updated, 31 bits for the current epoch version, 32 bits for the current term. - /// The version and term are stored to prevent stale UpdateMaxTimestamp task from + /// The max timestamp recorded in the concurrency manager is only updated at + /// leader. So if a peer becomes leader from a follower, the max timestamp + /// can be outdated. We need to update the max timestamp with a latest + /// timestamp from PD before this peer can work. From the least significant + /// to the most, 1 bit marks whether the timestamp is updated, 31 bits for + /// the current epoch version, 32 bits for the current term. 
The version + /// and term are stored to prevent stale UpdateMaxTimestamp task from /// marking the lowest bit. pub max_ts_sync_status: AtomicU64, @@ -58,7 +58,8 @@ lazy_static! { const GLOBAL_MEM_SIZE_LIMIT: usize = 100 << 20; // 100 MiB -// 512 KiB, so pessimistic locks in one region can be proposed in a single command. +// 512 KiB, so pessimistic locks in one region can be proposed in a single +// command. const PEER_MEM_SIZE_LIMIT: usize = 512 << 10; /// Pessimistic locks of a region peer. @@ -66,51 +67,53 @@ const PEER_MEM_SIZE_LIMIT: usize = 512 << 10; pub struct PeerPessimisticLocks { /// The table that stores pessimistic locks. /// - /// The bool marks an ongoing write request (which has been sent to the raftstore while not - /// applied yet) will delete this lock. The lock will be really deleted after applying the - /// write request. The flag will decide whether this lock should be migrated to other peers - /// on leader or region changes: + /// The bool marks an ongoing write request (which has been sent to the + /// raftstore while not applied yet) will delete this lock. The lock will be + /// really deleted after applying the write request. The flag will decide + /// whether this lock should be migrated to other peers on leader or region + /// changes: /// - /// - Transfer leader - /// The lock with the deleted mark SHOULD NOT be proposed before transferring leader. - /// Considering the following cases with different orders: - /// 1. Propose write -> propose locks -> apply write -> apply locks -> transfer leader - /// Because the locks marking deleted will not be proposed. The lock will be deleted when - /// applying the write while not showing up again after applying the locks. - /// 2. 
Propose locks -> propose write -> transfer leader - /// No lock will be lost in normal cases because the write request has been sent to the - /// raftstore, it is likely to be proposed successfully, while the leader will need at - /// least another round to receive the transfer leader message from the transferree. + /// - Transfer leader The lock with the deleted mark SHOULD NOT be proposed + /// before transferring leader. Considering the following cases with + /// different orders: 1. Propose write -> propose locks -> apply write -> + /// apply locks -> transfer leader Because the locks marking deleted will + /// not be proposed. The lock will be deleted when applying the write + /// while not showing up again after applying the locks. 2. Propose locks + /// -> propose write -> transfer leader No lock will be lost in normal + /// cases because the write request has been sent to the raftstore, it is + /// likely to be proposed successfully, while the leader will need at + /// least another round to receive the transfer leader message from the + /// transferee. /// - /// - Split region - /// The lock with the deleted mark SHOULD be moved to new regions on region split. - /// Considering the following cases with different orders: - /// 1. Propose write -> propose split -> apply write -> execute split - /// The write will be applied earlier than split. So, the lock will be deleted earlier - /// than moving locks to new regions. - /// 2. Propose split -> propose write -> ready split -> apply write - /// The write will be skipped because its version is lower than the new region. So, no - /// lock should be deleted in this case. - /// 3. Propose split -> ready split -> propose write - /// The write proposal will be rejected because of version mismatch. + /// - Split region The lock with the deleted mark SHOULD be moved to new + /// regions on region split. Considering the following cases with + /// different orders: 1. 
Propose write -> propose split -> apply write -> + /// execute split The write will be applied earlier than split. So, the + /// lock will be deleted earlier than moving locks to new regions. 2. + /// Propose split -> propose write -> ready split -> apply write The write + /// will be skipped because its version is lower than the new region. So, + /// no lock should be deleted in this case. 3. Propose split -> ready + /// split -> propose write The write proposal will be rejected because of + /// version mismatch. /// - /// - Merge region - /// The lock with the deleted mark SHOULD be included in the catch up logs on region merge. - /// Considering the following cases with different orders: - /// 1. Propose write -> propose prepare merge -> apply write -> execute merge - /// The locks marked deleted will be deleted when applying the write request. So, the - /// deleted locks will not be included again in the commit merge request. - /// 2. Propose prepare merge -> propose write -> execute merge -> apply write - /// Applying the write will be skipped because of version mismatch. So, no lock should - /// be deleted. It's correct that we include the locks that are marked deleted in the - /// commit merge request. + /// - Merge region The lock with the deleted mark SHOULD be included in the + /// catch up logs on region merge. Considering the following cases with + /// different orders: 1. Propose write -> propose prepare merge -> apply + /// write -> execute merge The locks marked deleted will be deleted when + /// applying the write request. So, the deleted locks will not be included + /// again in the commit merge request. 2. Propose prepare merge -> propose + /// write -> execute merge -> apply write Applying the write will be + /// skipped because of version mismatch. So, no lock should be deleted. + /// It's correct that we include the locks that are marked deleted in the + /// commit merge request. map: HashMap, /// Status of the pessimistic lock map. 
/// The map is writable only in the Normal state. pub status: LocksStatus, /// Refers to the Raft term in which the pessimistic lock table is valid. pub term: u64, - /// Refers to the region version in which the pessimistic lock table is valid. + /// Refers to the region version in which the pessimistic lock table is + /// valid. pub version: u64, /// Estimated memory used by the pessimistic locks. pub memory_size: usize, @@ -158,8 +161,8 @@ impl PeerPessimisticLocks { for pair in &pairs { let (key, lock) = pair.as_pair(); // If the key already exists in the map, it's an overwrite. - // The primary lock does not change during an overwrite, so we don't need to update - // the memory size. + // The primary lock does not change during an overwrite, so we don't need to + // update the memory size. if !self.map.contains_key(key) { incr += key.len() + lock.memory_size(); } @@ -215,11 +218,12 @@ impl PeerPessimisticLocks { /// Group pessimistic locks in the original region to the split regions. /// - /// The given regions MUST be sorted by key in the ascending order. The returned - /// `HashMap`s are in the same order of the given regions. + /// The given regions MUST be sorted by key in the ascending order. The + /// returned `HashMap`s are in the same order of the given regions. /// - /// The locks belonging to the derived region will be kept in the given `locks` map, - /// and the corresponding position in the returned `Vec` will be an empty map. + /// The locks belonging to the derived region will be kept in the given + /// `locks` map, and the corresponding position in the returned `Vec` + /// will be an empty map. 
pub fn group_by_regions( &mut self, regions: &[metapb::Region], @@ -318,8 +322,10 @@ mod tests { primary: primary.to_vec().into_boxed_slice(), start_ts: 100.into(), ttl: 3000, - for_update_ts: 100.into(), - min_commit_ts: Default::default(), + for_update_ts: 110.into(), + min_commit_ts: 110.into(), + last_change_ts: 105.into(), + versions_to_last_change: 2, } } @@ -334,10 +340,10 @@ mod tests { let k3 = Key::from_raw(b"k333"); // Test the memory size of peer pessimistic locks after inserting. - assert!(locks1.insert(vec![(k1.clone(), lock(b"k1"))]).is_ok()); + locks1.insert(vec![(k1.clone(), lock(b"k1"))]).unwrap(); assert_eq!(locks1.get(&k1), Some(&(lock(b"k1"), false))); assert_eq!(locks1.memory_size, k1.len() + lock(b"k1").memory_size()); - assert!(locks1.insert(vec![(k2.clone(), lock(b"k1"))]).is_ok()); + locks1.insert(vec![(k2.clone(), lock(b"k1"))]).unwrap(); assert_eq!(locks1.get(&k2), Some(&(lock(b"k1"), false))); assert_eq!( locks1.memory_size, @@ -345,7 +351,7 @@ mod tests { ); // Test the global memory size after inserting. - assert!(locks2.insert(vec![(k3.clone(), lock(b"k1"))]).is_ok()); + locks2.insert(vec![(k3.clone(), lock(b"k1"))]).unwrap(); assert_eq!(locks2.get(&k3), Some(&(lock(b"k1"), false))); assert_eq!( GLOBAL_MEM_SIZE.get() as usize, @@ -353,7 +359,7 @@ mod tests { ); // Test the memory size after replacing, it should not change. 
- assert!(locks1.insert(vec![(k2.clone(), lock(b"k2"))]).is_ok()); + locks1.insert(vec![(k2.clone(), lock(b"k2"))]).unwrap(); assert_eq!(locks1.get(&k2), Some(&(lock(b"k2"), false))); assert_eq!( locks1.memory_size, @@ -391,18 +397,20 @@ mod tests { defer!(GLOBAL_MEM_SIZE.set(0)); let mut locks = PeerPessimisticLocks::default(); - let res = locks.insert(vec![(Key::from_raw(b"k1"), lock(&[0; 512000]))]); - assert!(res.is_ok()); + locks + .insert(vec![(Key::from_raw(b"k1"), lock(&[0; 512000]))]) + .unwrap(); // Exceeding the region limit - let res = locks.insert(vec![(Key::from_raw(b"k2"), lock(&[0; 32000]))]); - assert!(res.is_err()); + locks + .insert(vec![(Key::from_raw(b"k2"), lock(&[0; 32000]))]) + .unwrap_err(); assert!(locks.get(&Key::from_raw(b"k2")).is_none()); // Not exceeding the region limit, but exceeding the global limit GLOBAL_MEM_SIZE.set(101 << 20); let res = locks.insert(vec![(Key::from_raw(b"k2"), lock(b"abc"))]); - assert!(res.is_err()); + res.unwrap_err(); assert!(locks.get(&Key::from_raw(b"k2")).is_none()); } @@ -418,6 +426,8 @@ mod tests { ttl: 1000, for_update_ts: 10.into(), min_commit_ts: 20.into(), + last_change_ts: 5.into(), + versions_to_last_change: 2, }, deleted, ), diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 75c620ac12c..f5a23538ad5 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -9,74 +9,51 @@ use std::{ option::Option, sync::{ atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering}, - Arc, Mutex, + Arc, Mutex, MutexGuard, }, u64, }; +use collections::HashSet; +use engine_traits::KvEngine; use kvproto::{ kvrpcpb::{self, KeyRange, LeaderInfo}, metapb::{self, Peer, PeerRole, Region, RegionEpoch}, - raft_cmdpb::{AdminCmdType, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest}, - raft_serverpb::RaftMessage, + raft_cmdpb::{ + AdminCmdType, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, RaftRequestHeader, + }, + 
raft_serverpb::{RaftMessage, RaftSnapshotData}, }; -use protobuf::{self, Message}; +use protobuf::{self, CodedInputStream, Message}; use raft::{ - eraftpb::{self, ConfChangeType, ConfState, MessageType}, - INVALID_INDEX, + eraftpb::{self, ConfChangeType, ConfState, Entry, EntryType, MessageType, Snapshot}, + Changer, RawNode, INVALID_INDEX, }; use raft_proto::ConfChangeI; -use tikv_util::{box_err, debug, info, time::monotonic_raw_now, Either}; +use tikv_util::{ + box_err, + codec::number::{decode_u64, NumberEncoder}, + debug, info, + store::{find_peer_by_id, region}, + time::monotonic_raw_now, + Either, +}; use time::{Duration, Timespec}; +use tokio::sync::Notify; +use txn_types::WriteBatchFlags; + +use super::{metrics::PEER_ADMIN_CMD_COUNTER_VEC, peer_storage, Config}; +use crate::{ + coprocessor::CoprocessorHost, + store::{simple_write::SimpleWriteReqDecoder, snap::SNAPSHOT_VERSION}, + Error, Result, +}; -use super::peer_storage; -use crate::{Error, Result}; - -pub fn find_peer(region: &metapb::Region, store_id: u64) -> Option<&metapb::Peer> { - region - .get_peers() - .iter() - .find(|&p| p.get_store_id() == store_id) -} - -pub fn find_peer_mut(region: &mut metapb::Region, store_id: u64) -> Option<&mut metapb::Peer> { - region - .mut_peers() - .iter_mut() - .find(|p| p.get_store_id() == store_id) -} - -pub fn remove_peer(region: &mut metapb::Region, store_id: u64) -> Option { - region - .get_peers() - .iter() - .position(|x| x.get_store_id() == store_id) - .map(|i| region.mut_peers().remove(i)) -} - -// a helper function to create peer easily. -pub fn new_peer(store_id: u64, peer_id: u64) -> metapb::Peer { - let mut peer = metapb::Peer::default(); - peer.set_store_id(store_id); - peer.set_id(peer_id); - peer.set_role(PeerRole::Voter); - peer -} - -// a helper function to create learner peer easily. 
-pub fn new_learner_peer(store_id: u64, peer_id: u64) -> metapb::Peer { - let mut peer = metapb::Peer::default(); - peer.set_store_id(store_id); - peer.set_id(peer_id); - peer.set_role(PeerRole::Learner); - peer -} +const INVALID_TIMESTAMP: u64 = u64::MAX; /// Check if key in region range (`start_key`, `end_key`). pub fn check_key_in_region_exclusive(key: &[u8], region: &metapb::Region) -> Result<()> { - let end_key = region.get_end_key(); - let start_key = region.get_start_key(); - if start_key < key && (key < end_key || end_key.is_empty()) { + if region::check_key_in_region_exclusive(key, region) { Ok(()) } else { Err(Error::KeyNotInRegion(key.to_vec(), region.clone())) @@ -85,9 +62,7 @@ pub fn check_key_in_region_exclusive(key: &[u8], region: &metapb::Region) -> Res /// Check if key in region range [`start_key`, `end_key`]. pub fn check_key_in_region_inclusive(key: &[u8], region: &metapb::Region) -> Result<()> { - let end_key = region.get_end_key(); - let start_key = region.get_start_key(); - if key >= start_key && (end_key.is_empty() || key <= end_key) { + if region::check_key_in_region_inclusive(key, region) { Ok(()) } else { Err(Error::KeyNotInRegion(key.to_vec(), region.clone())) @@ -96,19 +71,17 @@ pub fn check_key_in_region_inclusive(key: &[u8], region: &metapb::Region) -> Res /// Check if key in region range [`start_key`, `end_key`). pub fn check_key_in_region(key: &[u8], region: &metapb::Region) -> Result<()> { - let end_key = region.get_end_key(); - let start_key = region.get_start_key(); - if key >= start_key && (end_key.is_empty() || key < end_key) { + if region::check_key_in_region(key, region) { Ok(()) } else { Err(Error::KeyNotInRegion(key.to_vec(), region.clone())) } } -/// `is_first_vote_msg` checks `msg` is the first vote (or prevote) message or not. It's used for -/// when the message is received but there is no such region in `Store::region_peers` and the -/// region overlaps with others. 
In this case we should put `msg` into `pending_msg` instead of -/// create the peer. +/// `is_first_vote_msg` checks `msg` is the first vote (or prevote) message or +/// not. It's used for when the message is received but there is no such region +/// in `Store::region_peers` and the region overlaps with others. In this case +/// we should put `msg` into `pending_msg` instead of create the peer. #[inline] fn is_first_vote_msg(msg: &eraftpb::Message) -> bool { match msg.get_msg_type() { @@ -119,10 +92,11 @@ fn is_first_vote_msg(msg: &eraftpb::Message) -> bool { } } -/// `is_first_append_entry` checks `msg` is the first append message or not. This meassge is the first -/// message that the learner peers of the new split region will receive from the leader. It's used for -/// when the message is received but there is no such region in `Store::region_peers`. In this case we -/// should put `msg` into `pending_msg` instead of create the peer. +/// `is_first_append_entry` checks `msg` is the first append message or not. +/// This meassge is the first message that the learner peers of the new split +/// region will receive from the leader. It's used for when the message is +/// received but there is no such region in `Store::region_peers`. In this case +/// we should put `msg` into `pending_msg` instead of create the peer. #[inline] fn is_first_append_entry(msg: &eraftpb::Message) -> bool { match msg.get_msg_type() { @@ -146,7 +120,8 @@ pub fn is_vote_msg(msg: &eraftpb::Message) -> bool { msg_type == MessageType::MsgRequestVote || msg_type == MessageType::MsgRequestPreVote } -/// `is_initial_msg` checks whether the `msg` can be used to initialize a new peer or not. +/// `is_initial_msg` checks whether the `msg` can be used to initialize a new +/// peer or not. // There could be two cases: // 1. Target peer already exists but has not established communication with leader yet // 2. 
Target peer is added newly due to member change or region split, but it's not @@ -164,6 +139,27 @@ pub fn is_initial_msg(msg: &eraftpb::Message) -> bool { || (msg_type == MessageType::MsgHeartbeat && msg.get_commit() == INVALID_INDEX) } +pub fn new_empty_snapshot( + region: Region, + applied_index: u64, + applied_term: u64, + for_witness: bool, +) -> Snapshot { + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().set_index(applied_index); + snapshot.mut_metadata().set_term(applied_term); + snapshot + .mut_metadata() + .set_conf_state(conf_state_from_region(®ion)); + let mut snap_data = RaftSnapshotData::default(); + snap_data.set_region(region); + snap_data.set_file_size(0); + snap_data.set_version(SNAPSHOT_VERSION); + snap_data.mut_meta().set_for_witness(for_witness); + snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); + snapshot +} + const STR_CONF_CHANGE_ADD_NODE: &str = "AddNode"; const STR_CONF_CHANGE_REMOVE_NODE: &str = "RemoveNode"; const STR_CONF_CHANGE_ADDLEARNER_NODE: &str = "AddLearner"; @@ -207,12 +203,13 @@ impl AdminCmdEpochState { } /// WARNING: the existing settings below **MUST NOT** be changed!!! -/// Changing any admin cmd's `AdminCmdEpochState` or the epoch-change behavior during applying -/// will break upgrade compatibility and correctness dependency of `CmdEpochChecker`. -/// Please remember it is very difficult to fix the issues arising from not following this rule. +/// Changing any admin cmd's `AdminCmdEpochState` or the epoch-change behavior +/// during applying will break upgrade compatibility and correctness dependency +/// of `CmdEpochChecker`. Please remember it is very difficult to fix the issues +/// arising from not following this rule. /// -/// If you really want to change an admin cmd behavior, please add a new admin cmd and **DO NOT** -/// delete the old one. +/// If you really want to change an admin cmd behavior, please add a new admin +/// cmd and **DO NOT** delete the old one. 
pub fn admin_cmd_epoch_lookup(admin_cmp_type: AdminCmdType) -> AdminCmdEpochState { match admin_cmp_type { AdminCmdType::InvalidAdmin => AdminCmdEpochState::new(false, false, false, false), @@ -231,36 +228,62 @@ pub fn admin_cmd_epoch_lookup(admin_cmp_type: AdminCmdType) -> AdminCmdEpochStat AdminCmdType::RollbackMerge => AdminCmdEpochState::new(true, true, true, false), // Transfer leader AdminCmdType::TransferLeader => AdminCmdEpochState::new(true, true, false, false), + // PrepareFlashback could be committed successfully before a split being applied, so we need + // to check the epoch to make sure it's sent to a correct key range. + // NOTICE: FinishFlashback will never meet the epoch not match error since any scheduling + // before it's forbidden. + AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { + AdminCmdEpochState::new(true, true, false, false) + } + AdminCmdType::BatchSwitchWitness => AdminCmdEpochState::new(false, true, false, true), + AdminCmdType::UpdateGcPeer => AdminCmdEpochState::new(false, false, false, false), } } -/// WARNING: `NORMAL_REQ_CHECK_VER` and `NORMAL_REQ_CHECK_CONF_VER` **MUST NOT** be changed. -/// The reason is the same as `admin_cmd_epoch_lookup`. +/// WARNING: `NORMAL_REQ_CHECK_VER` and `NORMAL_REQ_CHECK_CONF_VER` **MUST NOT** +/// be changed. The reason is the same as `admin_cmd_epoch_lookup`. pub static NORMAL_REQ_CHECK_VER: bool = true; pub static NORMAL_REQ_CHECK_CONF_VER: bool = false; -pub fn check_region_epoch( +pub fn check_req_region_epoch( req: &RaftCmdRequest, region: &metapb::Region, include_region: bool, ) -> Result<()> { - let (check_ver, check_conf_ver) = if !req.has_admin_request() { - // for get/set/delete, we don't care conf_version. 
- (NORMAL_REQ_CHECK_VER, NORMAL_REQ_CHECK_CONF_VER) + let admin_ty = if !req.has_admin_request() { + None } else { - let epoch_state = admin_cmd_epoch_lookup(req.get_admin_request().get_cmd_type()); - (epoch_state.check_ver, epoch_state.check_conf_ver) + Some(req.get_admin_request().get_cmd_type()) + }; + check_region_epoch(req.get_header(), admin_ty, region, include_region) +} + +pub fn check_region_epoch( + header: &RaftRequestHeader, + admin_ty: Option, + region: &metapb::Region, + include_region: bool, +) -> Result<()> { + let (check_ver, check_conf_ver) = match admin_ty { + None => { + // for get/set/delete, we don't care conf_version. + (NORMAL_REQ_CHECK_VER, NORMAL_REQ_CHECK_CONF_VER) + } + Some(ty) => { + let epoch_state = admin_cmd_epoch_lookup(ty); + (epoch_state.check_ver, epoch_state.check_conf_ver) + } }; if !check_ver && !check_conf_ver { return Ok(()); } - if !req.get_header().has_region_epoch() { + if !header.has_region_epoch() { return Err(box_err!("missing epoch!")); } - let from_epoch = req.get_header().get_region_epoch(); + let from_epoch = header.get_region_epoch(); compare_region_epoch( from_epoch, region, @@ -316,6 +339,52 @@ pub fn compare_region_epoch( Ok(()) } +// Check if the request could be proposed/applied under the current state of the +// flashback. +pub fn check_flashback_state( + is_in_flashback: bool, + flashback_start_ts: u64, + req: &RaftCmdRequest, + region_id: u64, + skip_not_prepared: bool, +) -> Result<()> { + // The admin flashback cmd could be proposed/applied under any state. + if req.has_admin_request() + && (req.get_admin_request().get_cmd_type() == AdminCmdType::PrepareFlashback + || req.get_admin_request().get_cmd_type() == AdminCmdType::FinishFlashback) + { + return Ok(()); + } + // TODO: only use `flashback_start_ts` to check flashback state. 
+ let is_in_flashback = is_in_flashback || flashback_start_ts > 0; + let is_flashback_request = WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) + .contains(WriteBatchFlags::FLASHBACK); + // If the region is in the flashback state: + // - A request with flashback flag will be allowed. + // - A read request whose `read_ts` is smaller than `flashback_start_ts` will + // be allowed. + if is_in_flashback && !is_flashback_request { + if let Ok(read_ts) = decode_u64(&mut req.get_header().get_flag_data()) { + if read_ts != 0 && read_ts < flashback_start_ts { + return Ok(()); + } + } + return Err(Error::FlashbackInProgress(region_id, flashback_start_ts)); + } + // If the region is not in the flashback state, the flashback request itself + // should be rejected. + if !is_in_flashback && is_flashback_request && !skip_not_prepared { + return Err(Error::FlashbackNotPrepared(region_id)); + } + Ok(()) +} + +pub fn encode_start_ts_into_flag_data(header: &mut RaftRequestHeader, start_ts: u64) { + let mut data = [0u8; 8]; + (&mut data[..]).encode_u64(start_ts).unwrap(); + header.set_flag_data(data.into()); +} + pub fn is_region_epoch_equal( from_epoch: &metapb::RegionEpoch, current_epoch: &metapb::RegionEpoch, @@ -325,8 +394,8 @@ pub fn is_region_epoch_equal( } #[inline] -pub fn check_store_id(req: &RaftCmdRequest, store_id: u64) -> Result<()> { - let peer = req.get_header().get_peer(); +pub fn check_store_id(header: &RaftRequestHeader, store_id: u64) -> Result<()> { + let peer = header.get_peer(); if peer.get_store_id() == store_id { Ok(()) } else { @@ -338,8 +407,7 @@ pub fn check_store_id(req: &RaftCmdRequest, store_id: u64) -> Result<()> { } #[inline] -pub fn check_term(req: &RaftCmdRequest, term: u64) -> Result<()> { - let header = req.get_header(); +pub fn check_term(header: &RaftRequestHeader, term: u64) -> Result<()> { if header.get_term() == 0 || term <= header.get_term() + 1 { Ok(()) } else { @@ -350,8 +418,7 @@ pub fn check_term(req: &RaftCmdRequest, 
term: u64) -> Result<()> { } #[inline] -pub fn check_peer_id(req: &RaftCmdRequest, peer_id: u64) -> Result<()> { - let header = req.get_header(); +pub fn check_peer_id(header: &RaftRequestHeader, peer_id: u64) -> Result<()> { if header.get_peer().get_id() == peer_id { Ok(()) } else { @@ -376,34 +443,21 @@ pub fn build_key_range(start_key: &[u8], end_key: &[u8], reverse_scan: bool) -> range } -/// Check if replicas of two regions are on the same stores. -pub fn region_on_same_stores(lhs: &metapb::Region, rhs: &metapb::Region) -> bool { - if lhs.get_peers().len() != rhs.get_peers().len() { - return false; - } - - // Because every store can only have one replica for the same region, - // so just one round check is enough. - lhs.get_peers().iter().all(|lp| { - rhs.get_peers() - .iter() - .any(|rp| rp.get_store_id() == lp.get_store_id() && rp.get_role() == lp.get_role()) - }) -} - #[inline] pub fn is_region_initialized(r: &metapb::Region) -> bool { !r.get_peers().is_empty() } -/// Lease records an expired time, for examining the current moment is in lease or not. -/// It's dedicated to the Raft leader lease mechanism, contains either state of -/// 1. Suspect Timestamp -/// A suspicious leader lease timestamp, which marks the leader may still hold or lose -/// its lease until the clock time goes over this timestamp. -/// 2. Valid Timestamp -/// A valid leader lease timestamp, which marks the leader holds the lease for now. -/// The lease is valid until the clock time goes over this timestamp. +/// Lease records an expired time, for examining the current moment is in lease +/// or not. It's dedicated to the Raft leader lease mechanism, contains either +/// state of +/// - Suspect Timestamp +/// - A suspicious leader lease timestamp, which marks the leader may still +/// hold or lose its lease until the clock time goes over this timestamp. +/// - Valid Timestamp +/// - A valid leader lease timestamp, which marks the leader holds the lease +/// for now. 
The lease is valid until the clock time goes over this +/// timestamp. /// /// ```text /// Time @@ -419,18 +473,19 @@ pub fn is_region_initialized(r: &metapb::Region) -> bool { /// ``` /// /// Note: -/// - Valid timestamp would increase when raft log entries are applied in current term. -/// - Suspect timestamp would be set after the message `MsgTimeoutNow` is sent by current peer. -/// The message `MsgTimeoutNow` starts a leader transfer procedure. During this procedure, -/// current peer as an old leader may still hold its lease or lose it. -/// It's possible there is a new leader elected and current peer as an old leader -/// doesn't step down due to network partition from the new leader. In that case, -/// current peer lose its leader lease. -/// Within this suspect leader lease expire time, read requests could not be performed -/// locally. -/// - The valid leader lease should be `lease = max_lease - (commit_ts - send_ts)` -/// And the expired timestamp for that leader lease is `commit_ts + lease`, -/// which is `send_ts + max_lease` in short. +/// - Valid timestamp would increase when raft log entries are applied in +/// current term. +/// - Suspect timestamp would be set after the message `MsgTimeoutNow` is sent +/// by current peer. The message `MsgTimeoutNow` starts a leader transfer +/// procedure. During this procedure, current peer as an old leader may +/// still hold its lease or lose it. It's possible there is a new leader +/// elected and current peer as an old leader doesn't step down due to +/// network partition from the new leader. In that case, current peer lose +/// its leader lease. Within this suspect leader lease expire time, read +/// requests could not be performed locally. +/// - The valid leader lease should be `lease = max_lease - (commit_ts - +/// send_ts)` And the expired timestamp for that leader lease is `commit_ts +/// + lease`, which is `send_ts + max_lease` in short. 
pub struct Lease { // A suspect timestamp is in the Either::Left(_), // a valid timestamp is in the Either::Right(_). @@ -443,7 +498,7 @@ pub struct Lease { remote: Option, } -#[derive(Clone, Copy, PartialEq, Eq, Debug)] +#[derive(Clone, Copy, PartialEq, Debug)] pub enum LeaseState { /// The lease is suspicious, may be invalid. Suspect, @@ -466,9 +521,9 @@ impl Lease { } } - /// The valid leader lease should be `lease = max_lease - (commit_ts - send_ts)` - /// And the expired timestamp for that leader lease is `commit_ts + lease`, - /// which is `send_ts + max_lease` in short. + /// The valid leader lease should be `lease = max_lease - (commit_ts - + /// send_ts)` And the expired timestamp for that leader lease is + /// `commit_ts + lease`, which is `send_ts + max_lease` in short. fn next_expired_time(&self, send_ts: Timespec) -> Timespec { send_ts + self.max_lease } @@ -595,8 +650,8 @@ impl fmt::Debug for Lease { } /// A remote lease, it can only be derived by `Lease`. It will be sent -/// to the local read thread, so name it remote. If Lease expires, the remote must -/// expire too. +/// to the local read thread, so name it remote. If Lease expires, the remote +/// must expire too. #[derive(Clone)] pub struct RemoteLease { expired_time: Arc, @@ -686,7 +741,7 @@ fn timespec_to_u64(ts: Timespec) -> u64 { /// /// # Panics /// -/// If nsec is negative or GE than 1_000_000_000(nano seconds pre second). +/// If nsec (nano seconds pre second) is not in [0, 1_000_000_000) range. 
#[inline] pub(crate) fn u64_to_timespec(u: u64) -> Timespec { let sec = u >> TIMESPEC_SEC_SHIFT; @@ -694,6 +749,36 @@ pub(crate) fn u64_to_timespec(u: u64) -> Timespec { Timespec::new(sec as i64, nsec as i32) } +pub fn get_entry_header(entry: &Entry) -> RaftRequestHeader { + if entry.get_entry_type() != EntryType::EntryNormal { + return RaftRequestHeader::default(); + } + let logger = slog_global::get_global().new(slog::o!()); + match SimpleWriteReqDecoder::new( + |_, _, _| RaftCmdRequest::default(), + &logger, + entry.get_data(), + entry.get_index(), + entry.get_term(), + ) { + Ok(decoder) => decoder.header().clone(), + Err(_) => { + // request header is encoded into data + let mut is = CodedInputStream::from_bytes(entry.get_data()); + if is.eof().unwrap() { + return RaftRequestHeader::default(); + } + let (field_number, _) = is.read_tag_unpack().unwrap(); + let t = is.read_message().unwrap(); + // Header field is of number 1 + if field_number != 1 { + panic!("unexpected field number: {} {:?}", field_number, t); + } + t + } + } +} + /// Parse data of entry `index`. /// /// # Panics @@ -704,11 +789,36 @@ pub(crate) fn u64_to_timespec(u: u64) -> Timespec { pub fn parse_data_at(data: &[u8], index: u64, tag: &str) -> T { let mut result = T::default(); result.merge_from_bytes(data).unwrap_or_else(|e| { - panic!("{} data is corrupted at {}: {:?}", tag, index, e); + panic!( + "{} data is corrupted at {}: {:?}. 
hex value: {}", + tag, + index, + e, + log_wrappers::Value::value(data) + ); }); result } +pub enum RaftCmd<'a> { + V1(RaftCmdRequest), + V2(SimpleWriteReqDecoder<'a>), +} + +pub fn parse_raft_cmd_request<'a>(data: &'a [u8], index: u64, term: u64, tag: &str) -> RaftCmd<'a> { + let logger = slog_global::get_global().new(slog::o!()); + match SimpleWriteReqDecoder::new( + |_, _, _| parse_data_at(data, index, tag), + &logger, + data, + index, + term, + ) { + Ok(simple_write_decoder) => RaftCmd::V2(simple_write_decoder), + Err(cmd) => RaftCmd::V1(cmd), + } +} + /// Check if two regions are sibling. /// /// They are sibling only when they share borders and don't overlap. @@ -754,41 +864,43 @@ pub fn conf_state_from_region(region: &metapb::Region) -> ConfState { conf_state } -pub fn is_learner(peer: &metapb::Peer) -> bool { - peer.get_role() == PeerRole::Learner -} - pub struct KeysInfoFormatter< 'a, - I: std::iter::DoubleEndedIterator> - + std::iter::ExactSizeIterator> + T: 'a + AsRef<[u8]>, + I: std::iter::DoubleEndedIterator + + std::iter::ExactSizeIterator + Clone, >(pub I); impl< 'a, - I: std::iter::DoubleEndedIterator> - + std::iter::ExactSizeIterator> + T: 'a + AsRef<[u8]>, + I: std::iter::DoubleEndedIterator + + std::iter::ExactSizeIterator + Clone, -> fmt::Display for KeysInfoFormatter<'a, I> +> fmt::Display for KeysInfoFormatter<'a, T, I> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut it = self.0.clone(); match it.len() { 0 => write!(f, "(no key)"), - 1 => write!(f, "key {}", log_wrappers::Value::key(it.next().unwrap())), + 1 => write!( + f, + "key {}", + log_wrappers::Value::key(it.next().unwrap().as_ref()) + ), _ => write!( f, "{} keys range from {} to {}", it.len(), - log_wrappers::Value::key(it.next().unwrap()), - log_wrappers::Value::key(it.next_back().unwrap()) + log_wrappers::Value::key(it.next().unwrap().as_ref()), + log_wrappers::Value::key(it.next_back().unwrap().as_ref()) ), } } } -#[derive(PartialEq, Eq, Debug)] 
+#[derive(PartialEq, Debug, Clone, Copy)] pub enum ConfChangeKind { // Only contains one configuration change Simple, @@ -870,6 +982,131 @@ impl<'a> ChangePeerI for &'a ChangePeerV2Request { } } +/// Check if the conf change request is valid. +/// +/// The function will try to keep operation safe. In some edge cases (or +/// tests), we may not care about safety. In this case, `ignore_safety` +/// can be set to true. +/// +/// Make sure the peer can serve read and write when ignore safety, otherwise +/// it may produce stale result or cause unavailability. +pub fn check_conf_change( + cfg: &Config, + node: &RawNode, + region: &metapb::Region, + leader: &metapb::Peer, + change_peers: &[ChangePeerRequest], + cc: &impl ConfChangeI, + ignore_safety: bool, +) -> Result<()> { + let current_progress = node.status().progress.unwrap().clone(); + let mut after_progress = current_progress.clone(); + let cc_v2 = cc.as_v2(); + let mut changer = Changer::new(&after_progress); + let (conf, changes) = if cc_v2.leave_joint() { + changer.leave_joint()? + } else if let Some(auto_leave) = cc_v2.enter_joint() { + changer.enter_joint(auto_leave, &cc_v2.changes)? + } else { + changer.simple(&cc_v2.changes)? + }; + after_progress.apply_conf(conf, changes, node.raft.raft_log.last_index()); + + // Because the conf change can be applied successfully above, so the current + // raft group state must matches the command. For example, won't call leave + // joint on a non joint state. 
+ let kind = ConfChangeKind::confchange_kind(change_peers.len()); + if kind == ConfChangeKind::LeaveJoint { + if ignore_safety || leader.get_role() != PeerRole::DemotingVoter { + return Ok(()); + } + return Err(box_err!("ignore leave joint command that demoting leader")); + } + + let mut check_dup = HashSet::default(); + let mut only_learner_change = true; + let current_voter = current_progress.conf().voters().ids(); + for cp in change_peers { + let (change_type, peer) = (cp.get_change_type(), cp.get_peer()); + match (change_type, peer.get_role()) { + (ConfChangeType::RemoveNode, PeerRole::Voter) if kind != ConfChangeKind::Simple => { + return Err(box_err!("{:?}: can not remove voter directly", cp)); + } + (ConfChangeType::RemoveNode, _) + | (ConfChangeType::AddNode, PeerRole::Voter) + | (ConfChangeType::AddLearnerNode, PeerRole::Learner) => {} + _ => { + return Err(box_err!("{:?}: op not match role", cp)); + } + } + + if region + .get_peers() + .iter() + .find(|p| p.get_id() == peer.get_id()) + .map_or(false, |p| p.get_is_witness() != peer.get_is_witness()) + { + return Err(box_err!( + "invalid conf change request: {:?}, can not switch witness in conf change", + cp + )); + } + + if !check_dup.insert(peer.get_id()) { + return Err(box_err!( + "have multiple commands for the same peer {}", + peer.get_id() + )); + } + + if peer.get_id() == leader.get_id() + && (change_type == ConfChangeType::RemoveNode + // In Joint confchange, the leader is allowed to be DemotingVoter + || (kind == ConfChangeKind::Simple + && change_type == ConfChangeType::AddLearnerNode)) + && !cfg.allow_remove_leader() + { + return Err(box_err!("ignore remove leader or demote leader")); + } + + if current_voter.contains(peer.get_id()) || change_type == ConfChangeType::AddNode { + only_learner_change = false; + } + } + + // Multiple changes that only effect learner will not product `IncommingVoter` + // or `DemotingVoter` after apply, but raftstore layer and PD rely on these + // roles to detect 
joint state + if kind != ConfChangeKind::Simple && only_learner_change { + return Err(box_err!("multiple changes that only effect learner")); + } + + if !ignore_safety { + let promoted_commit_index = after_progress.maximal_committed_index().0; + let first_index = node.raft.raft_log.first_index(); + if current_progress.is_singleton() // It's always safe if there is only one node in the cluster. + || promoted_commit_index + 1 >= first_index + { + return Ok(()); + } + + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&["conf_change", "reject_unsafe"]) + .inc(); + + Err(box_err!( + "{:?}: before: {:?}, after: {:?}, first index {}, promoted commit index {}", + change_peers, + current_progress.conf().to_conf_state(), + after_progress.conf().to_conf_state(), + first_index, + promoted_commit_index + )) + } else { + Ok(()) + } +} + pub struct MsgType<'a>(pub &'a RaftMessage); impl Display for MsgType<'_> { @@ -921,15 +1158,50 @@ impl RegionReadProgressRegistry { .map(|rp| rp.safe_ts()) } - // Update `safe_ts` with the provided `LeaderInfo` and return the regions that have the - // same `LeaderInfo` - pub fn handle_check_leaders(&self, leaders: Vec) -> Vec { + pub fn get_tracked_index(&self, region_id: &u64) -> Option { + self.registry + .lock() + .unwrap() + .get(region_id) + .map(|rp| rp.core.lock().unwrap().read_state.idx) + } + + // NOTICE: this function is an alias of `get_safe_ts` to distinguish the + // semantics. + pub fn get_resolved_ts(&self, region_id: &u64) -> Option { + self.registry + .lock() + .unwrap() + .get(region_id) + .map(|rp| rp.resolved_ts()) + } + + // Get the minimum `resolved_ts` which could ensure that there will be no more + // locks whose `start_ts` is greater than it. 
+ pub fn get_min_resolved_ts(&self) -> u64 { + self.registry + .lock() + .unwrap() + .iter() + .map(|(_, rrp)| rrp.resolved_ts()) + .filter(|ts| *ts != 0) // ts == 0 means the peer is uninitialized + .min() + .unwrap_or(0) + } + + // Update `safe_ts` with the provided `LeaderInfo` and return the regions that + // have the same `LeaderInfo` + pub fn handle_check_leaders( + &self, + leaders: Vec, + coprocessor: &CoprocessorHost, + ) -> Vec { let mut regions = Vec::with_capacity(leaders.len()); let registry = self.registry.lock().unwrap(); - for leader_info in leaders { + for leader_info in &leaders { let region_id = leader_info.get_region_id(); if let Some(rp) = registry.get(®ion_id) { - if rp.consume_leader_info(leader_info) { + if rp.consume_leader_info(leader_info, coprocessor) { regions.push(region_id); } } @@ -937,21 +1209,9 @@ impl RegionReadProgressRegistry { regions } - // Get the `LeaderInfo` of the requested regions - pub fn dump_leader_infos(&self, regions: &[u64]) -> HashMap, LeaderInfo)> { - let registry = self.registry.lock().unwrap(); - let mut info_map = HashMap::with_capacity(regions.len()); - for region_id in regions { - if let Some(rrp) = registry.get(region_id) { - info_map.insert(*region_id, rrp.dump_leader_info()); - } - } - info_map - } - - /// Invoke the provided callback with the registry, an internal lock will hold - /// while invoking the callback so it is important that *not* try to acquiring any - /// lock inside the callback to avoid dead lock + /// Invoke the provided callback with the registry, an internal lock will + /// hold while invoking the callback so it is important that *not* try + /// to acquiring any lock inside the callback to avoid dead lock pub fn with(&self, f: F) -> T where F: FnOnce(&HashMap>) -> T, @@ -967,9 +1227,10 @@ impl Default for RegionReadProgressRegistry { } } -/// `RegionReadProgress` is used to keep track of the replica's `safe_ts`, the replica can handle a read -/// request directly without requiring leader 
lease or read index iff `safe_ts` >= `read_ts` (the `read_ts` -/// is usually stale i.e seconds ago). +/// `RegionReadProgress` is used to keep track of the replica's `safe_ts`, the +/// replica can handle a read request directly without requiring leader lease or +/// read index iff `safe_ts` >= `read_ts` (the `read_ts` is usually stale i.e +/// seconds ago). /// /// `safe_ts` is updated by the `(apply index, safe ts)` item: /// ```ignore @@ -978,13 +1239,15 @@ impl Default for RegionReadProgressRegistry { /// } /// ``` /// -/// For the leader, the `(apply index, safe ts)` item is publish by the `resolved-ts` worker periodically. -/// For the followers, the item is sync periodically from the leader through the `CheckLeader` rpc. +/// For the leader, the `(apply index, safe ts)` item is publish by the +/// `resolved-ts` worker periodically. For the followers, the item is sync +/// periodically from the leader through the `CheckLeader` rpc. /// -/// The intend is to make the item's `safe ts` larger (more up to date) and `apply index` smaller (require less data) +/// The intend is to make the item's `safe ts` larger (more up to date) and +/// `apply index` smaller (require less data) // -/// TODO: the name `RegionReadProgress` is conflict with the leader lease's `ReadProgress`, shoule change it to another -/// more proper name +/// TODO: the name `RegionReadProgress` is conflict with the leader lease's +/// `ReadProgress`, should change it to another more proper name #[derive(Debug)] pub struct RegionReadProgress { // `core` used to keep track and update `safe_ts`, it should @@ -996,14 +1259,46 @@ pub struct RegionReadProgress { } impl RegionReadProgress { - pub fn new(region: &Region, applied_index: u64, cap: usize, tag: String) -> RegionReadProgress { + pub fn new( + region: &Region, + applied_index: u64, + cap: usize, + peer_id: u64, + ) -> RegionReadProgress { RegionReadProgress { - core: Mutex::new(RegionReadProgressCore::new(region, applied_index, cap, tag)), + 
core: Mutex::new(RegionReadProgressCore::new( + region, + applied_index, + cap, + peer_id, + )), safe_ts: AtomicU64::from(0), } } - pub fn update_applied(&self, applied: u64) { + pub fn update_advance_resolved_ts_notify(&self, advance_notify: Arc) { + self.core.lock().unwrap().advance_notify = Some(advance_notify); + } + + pub fn notify_advance_resolved_ts(&self) { + if let Ok(core) = self.core.try_lock() && let Some(advance_notify) = &core.advance_notify { + advance_notify.notify_waiters(); + } + } + + pub fn update_applied(&self, applied: u64, coprocessor: &CoprocessorHost) { + let mut core = self.core.lock().unwrap(); + if let Some(ts) = core.update_applied(applied) { + if !core.pause { + self.safe_ts.store(ts, AtomicOrdering::Release); + // No need to update leader safe ts here. + coprocessor.on_update_safe_ts(core.region_id, ts, INVALID_TIMESTAMP) + } + } + } + + // TODO: remove it when coprocessor hook is implemented in v2. + pub fn update_applied_core(&self, applied: u64) { let mut core = self.core.lock().unwrap(); if let Some(ts) = core.update_applied(applied) { if !core.pause { @@ -1027,23 +1322,35 @@ impl RegionReadProgress { } } - pub fn merge_safe_ts(&self, source_safe_ts: u64, merge_index: u64) { + pub fn merge_safe_ts( + &self, + source_safe_ts: u64, + merge_index: u64, + coprocessor: &CoprocessorHost, + ) { let mut core = self.core.lock().unwrap(); if let Some(ts) = core.merge_safe_ts(source_safe_ts, merge_index) { if !core.pause { self.safe_ts.store(ts, AtomicOrdering::Release); + // After region merge, self safe ts may decrease, so leader safe ts should be + // reset. 
+ coprocessor.on_update_safe_ts(core.region_id, ts, ts) } } } - // Consume the provided `LeaderInfo` to update `safe_ts` and return whether the provided - // `LeaderInfo` is same as ours - pub fn consume_leader_info(&self, mut leader_info: LeaderInfo) -> bool { + // Consume the provided `LeaderInfo` to update `safe_ts` and return whether the + // provided `LeaderInfo` is same as ours + pub fn consume_leader_info( + &self, + leader_info: &LeaderInfo, + coprocessor: &CoprocessorHost, + ) -> bool { let mut core = self.core.lock().unwrap(); if leader_info.has_read_state() { - // It is okay to update `safe_ts` without checking the `LeaderInfo`, the `read_state` - // is guaranteed to be valid when it is published by the leader - let rs = leader_info.take_read_state(); + // It is okay to update `safe_ts` without checking the `LeaderInfo`, the + // `read_state` is guaranteed to be valid when it is published by the leader + let rs = leader_info.get_read_state(); let (apply_index, ts) = (rs.get_applied_index(), rs.get_safe_ts()); if apply_index != 0 && ts != 0 && !core.discard { if let Some(ts) = core.update_safe_ts(apply_index, ts) { @@ -1052,6 +1359,7 @@ impl RegionReadProgress { } } } + coprocessor.on_update_safe_ts(leader_info.region_id, self.safe_ts(), rs.get_safe_ts()) } // whether the provided `LeaderInfo` is same as ours core.leader_info.leader_term == leader_info.term @@ -1060,24 +1368,12 @@ impl RegionReadProgress { } // Dump the `LeaderInfo` and the peer list - pub fn dump_leader_info(&self) -> (Vec, LeaderInfo) { - let mut leader_info = LeaderInfo::default(); + pub fn dump_leader_info(&self) -> (LeaderInfo, Option) { let core = self.core.lock().unwrap(); - let read_state = { - // Get the latest `read_state` - let ReadState { idx, ts } = core.pending_items.back().unwrap_or(&core.read_state); - let mut rs = kvrpcpb::ReadState::default(); - rs.set_applied_index(*idx); - rs.set_safe_ts(*ts); - rs - }; - let li = &core.leader_info; - 
leader_info.set_peer_id(li.leader_id); - leader_info.set_term(li.leader_term); - leader_info.set_region_id(core.region_id); - leader_info.set_region_epoch(li.epoch.clone()); - leader_info.set_read_state(read_state); - (li.peers.clone(), leader_info) + ( + core.get_leader_info(), + core.get_local_leader_info().leader_store_id, + ) } pub fn update_leader_info(&self, peer_id: u64, term: u64, region: &Region) { @@ -1086,8 +1382,14 @@ impl RegionReadProgress { core.leader_info.leader_term = term; if !is_region_epoch_equal(region.get_region_epoch(), &core.leader_info.epoch) { core.leader_info.epoch = region.get_region_epoch().clone(); + } + if core.leader_info.peers != region.get_peers() { + // In v2, we check peers and region epoch independently, because + // peers are incomplete but epoch is set correctly during split. core.leader_info.peers = region.get_peers().to_vec(); } + core.leader_info.leader_store_id = + find_store_id(&core.leader_info.peers, core.leader_info.leader_id) } /// Reset `safe_ts` to 0 and stop updating it @@ -1116,31 +1418,46 @@ impl RegionReadProgress { pub fn safe_ts(&self) -> u64 { self.safe_ts.load(AtomicOrdering::Acquire) } + + // `safe_ts` is calculated from the `resolved_ts`, they are the same thing + // internally. So we can use `resolved_ts` as the alias of `safe_ts` here. 
+ #[inline(always)] + pub fn resolved_ts(&self) -> u64 { + self.safe_ts() + } + + // Dump the `LeaderInfo` and the peer list + pub fn get_core(&self) -> MutexGuard<'_, RegionReadProgressCore> { + self.core.lock().unwrap() + } } #[derive(Debug)] -struct RegionReadProgressCore { - tag: String, +pub struct RegionReadProgressCore { + peer_id: u64, region_id: u64, applied_index: u64, - // A wraper of `(apply_index, safe_ts)` item, where the `read_state.ts` is the peer's current `safe_ts` - // and the `read_state.idx` is the smallest `apply_index` required for that `safe_ts` + // A wrapper of `(apply_index, safe_ts)` item, where the `read_state.ts` is the peer's current + // `safe_ts` and the `read_state.idx` is the smallest `apply_index` required for that `safe_ts` read_state: ReadState, // The local peer's acknowledge about the leader leader_info: LocalLeaderInfo, // `pending_items` is a *sorted* list of `(apply_index, safe_ts)` item pending_items: VecDeque, - // After the region commit merged, the region's key range is extended and the region's `safe_ts` - // should reset to `min(source_safe_ts, target_safe_ts)`, and start reject stale `read_state` item - // with index smaller than `last_merge_index` to avoid `safe_ts` undo the decrease + // After the region commit merged, the region's key range is extended and the region's + // `safe_ts` should reset to `min(source_safe_ts, target_safe_ts)`, and start reject stale + // `read_state` item with index smaller than `last_merge_index` to avoid `safe_ts` undo the + // decrease last_merge_index: u64, // Stop update `safe_ts` pause: bool, // Discard incoming `(idx, ts)` discard: bool, + // A notify to trigger advancing resolved ts immediately. 
+ advance_notify: Option>, } -// A helpful wraper of `(apply_index, safe_ts)` item +// A helpful wrapper of `(apply_index, safe_ts)` item #[derive(Clone, Debug, Default)] pub struct ReadState { pub idx: u64, @@ -1152,6 +1469,7 @@ pub struct ReadState { pub struct LocalLeaderInfo { leader_id: u64, leader_term: u64, + leader_store_id: Option, epoch: RegionEpoch, peers: Vec, } @@ -1161,24 +1479,54 @@ impl LocalLeaderInfo { LocalLeaderInfo { leader_id: raft::INVALID_ID, leader_term: 0, + leader_store_id: None, epoch: region.get_region_epoch().clone(), peers: region.get_peers().to_vec(), } } + + pub fn get_peers(&self) -> &[Peer] { + &self.peers + } + + pub fn get_leader_id(&self) -> u64 { + self.leader_id + } + + pub fn get_leader_store_id(&self) -> Option { + self.leader_store_id + } +} + +fn find_store_id(peer_list: &[Peer], peer_id: u64) -> Option { + for peer in peer_list { + if peer.id == peer_id { + return Some(peer.store_id); + } + } + None } impl RegionReadProgressCore { - fn new(region: &Region, applied_index: u64, cap: usize, tag: String) -> RegionReadProgressCore { + fn new( + region: &Region, + applied_index: u64, + cap: usize, + peer_id: u64, + ) -> RegionReadProgressCore { + // forbids stale read for witness + let is_witness = find_peer_by_id(region, peer_id).map_or(false, |p| p.is_witness); RegionReadProgressCore { - tag, + peer_id, region_id: region.get_id(), applied_index, read_state: ReadState::default(), leader_info: LocalLeaderInfo::new(region), pending_items: VecDeque::with_capacity(cap), last_merge_index: 0, - pause: false, - discard: false, + pause: is_witness, + discard: is_witness, + advance_notify: None, } } @@ -1193,10 +1541,11 @@ impl RegionReadProgressCore { self.read_state.ts = cmp::min(source_safe_ts, target_safe_ts); info!( "reset safe_ts due to merge"; - "tag" => &self.tag, "source_safe_ts" => source_safe_ts, "target_safe_ts" => target_safe_ts, "safe_ts" => self.read_state.ts, + "region_id" => self.region_id, + "peer_id" => 
self.peer_id, ); if self.read_state.ts != target_safe_ts { Some(self.read_state.ts) @@ -1210,7 +1559,8 @@ impl RegionReadProgressCore { // The apply index should not decrease assert!(applied >= self.applied_index); self.applied_index = applied; - // Consume pending items with `apply_index` less or equal to `self.applied_index` + // Consume pending items with `apply_index` less or equal to + // `self.applied_index` let mut to_update = self.read_state.clone(); while let Some(item) = self.pending_items.pop_front() { if self.applied_index < item.idx { @@ -1277,9 +1627,35 @@ impl RegionReadProgressCore { } self.pending_items.push_back(item); } + + pub fn get_leader_info(&self) -> LeaderInfo { + let read_state = { + // Get the latest `read_state` + let ReadState { idx, ts } = self.pending_items.back().unwrap_or(&self.read_state); + let mut rs = kvrpcpb::ReadState::default(); + rs.set_applied_index(*idx); + rs.set_safe_ts(*ts); + rs + }; + let li = &self.leader_info; + LeaderInfo { + peer_id: li.leader_id, + region_id: self.region_id, + term: li.leader_term, + region_epoch: protobuf::SingularPtrField::some(li.epoch.clone()), + read_state: protobuf::SingularPtrField::some(read_state), + unknown_fields: protobuf::UnknownFields::default(), + cached_size: protobuf::CachedSize::default(), + } + } + + pub fn get_local_leader_info(&self) -> &LocalLeaderInfo { + &self.leader_info + } } -/// Represent the duration of all stages of raftstore recorded by one inspecting. +/// Represent the duration of all stages of raftstore recorded by one +/// inspecting. 
#[derive(Default, Debug)] pub struct RaftstoreDuration { pub store_wait_duration: Option, @@ -1341,16 +1717,59 @@ impl LatencyInspector { } } +pub fn validate_split_region( + region_id: u64, + peer_id: u64, + region: &Region, + epoch: &RegionEpoch, + split_keys: &[Vec], +) -> Result<()> { + if split_keys.is_empty() { + return Err(box_err!( + "[region {}] {} no split key is specified.", + region_id, + peer_id + )); + } + + let latest_epoch = region.get_region_epoch(); + // This is a little difference for `check_region_epoch` in region split case. + // Here we just need to check `version` because `conf_ver` will be update + // to the latest value of the peer, and then send to PD. + if latest_epoch.get_version() != epoch.get_version() { + return Err(Error::EpochNotMatch( + format!( + "[region {}] {} epoch changed {:?} != {:?}, retry later", + region_id, peer_id, latest_epoch, epoch + ), + vec![region.to_owned()], + )); + } + for key in split_keys { + if key.is_empty() { + return Err(box_err!( + "[region {}] {} split key should not be empty", + region_id, + peer_id + )); + } + check_key_in_region(key, region)?; + } + Ok(()) +} + #[cfg(test)] mod tests { use std::thread; + use engine_test::kv::KvTestEngine; use kvproto::{ metapb::{self, RegionEpoch}, raft_cmdpb::AdminRequest, }; + use protobuf::Message as _; use raft::eraftpb::{ConfChangeType, Entry, Message, MessageType}; - use tikv_util::time::monotonic_raw_now; + use tikv_util::store::new_peer; use time::Duration as TimeDuration; use super::*; @@ -1427,12 +1846,27 @@ mod tests { assert_eq!(m1.inspect(Some(monotonic_raw_now())), LeaseState::Valid); } + #[test] + fn test_get_entry_header() { + let mut req = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_resource_group_name("test".to_owned()); + req.set_header(header); + let mut entry = Entry::new(); + entry.set_term(1); + entry.set_index(2); + entry.set_data(req.write_to_bytes().unwrap().into()); + let header = 
get_entry_header(&entry); + assert_eq!(header.get_resource_group_name(), "test"); + } + #[test] fn test_timespec_u64() { let cases = vec![ (Timespec::new(0, 0), 0x0000_0000_0000_0000u64), (Timespec::new(0, 1), 0x0000_0000_0000_0000u64), // 1ns is round down to 0ms. - (Timespec::new(0, 999_999), 0x0000_0000_0000_0000u64), // 999_999ns is round down to 0ms. + (Timespec::new(0, 999_999), 0x0000_0000_0000_0000u64), /* 999_999ns is round down to + * 0ms. */ ( // 1_048_575ns is round down to 0ms. Timespec::new(0, 1_048_575 /* 0x0FFFFF */), @@ -1484,34 +1918,6 @@ mod tests { } } - // Tests the util function `check_key_in_region`. - #[test] - fn test_check_key_in_region() { - let test_cases = vec![ - ("", "", "", true, true, false), - ("", "", "6", true, true, false), - ("", "3", "6", false, false, false), - ("4", "3", "6", true, true, true), - ("4", "3", "", true, true, true), - ("3", "3", "", true, true, false), - ("2", "3", "6", false, false, false), - ("", "3", "6", false, false, false), - ("", "3", "", false, false, false), - ("6", "3", "6", false, true, false), - ]; - for (key, start_key, end_key, is_in_region, inclusive, exclusive) in test_cases { - let mut region = metapb::Region::default(); - region.set_start_key(start_key.as_bytes().to_vec()); - region.set_end_key(end_key.as_bytes().to_vec()); - let mut result = check_key_in_region(key.as_bytes(), ®ion); - assert_eq!(result.is_ok(), is_in_region); - result = check_key_in_region_inclusive(key.as_bytes(), ®ion); - assert_eq!(result.is_ok(), inclusive); - result = check_key_in_region_exclusive(key.as_bytes(), ®ion); - assert_eq!(result.is_ok(), exclusive); - } - } - fn gen_region( voters: &[u64], learners: &[u64], @@ -1520,7 +1926,7 @@ mod tests { ) -> metapb::Region { let mut region = metapb::Region::default(); macro_rules! 
push_peer { - ($ids: ident, $role: expr) => { + ($ids:ident, $role:expr) => { for id in $ids { let mut peer = metapb::Peer::default(); peer.set_id(*id); @@ -1607,21 +2013,6 @@ mod tests { ); } - #[test] - fn test_peer() { - let mut region = metapb::Region::default(); - region.set_id(1); - region.mut_peers().push(new_peer(1, 1)); - region.mut_peers().push(new_learner_peer(2, 2)); - - assert!(!is_learner(find_peer(®ion, 1).unwrap())); - assert!(is_learner(find_peer(®ion, 2).unwrap())); - - assert!(remove_peer(&mut region, 1).is_some()); - assert!(remove_peer(&mut region, 1).is_none()); - assert!(find_peer(®ion, 1).is_none()); - } - #[test] fn test_first_vote_msg() { let tbl = vec![ @@ -1744,40 +2135,6 @@ mod tests { } } - #[test] - fn test_on_same_store() { - let cases = vec![ - (vec![2, 3, 4], vec![], vec![1, 2, 3], vec![], false), - (vec![2, 3, 1], vec![], vec![1, 2, 3], vec![], true), - (vec![2, 3, 4], vec![], vec![1, 2], vec![], false), - (vec![1, 2, 3], vec![], vec![1, 2, 3], vec![], true), - (vec![1, 3], vec![2, 4], vec![1, 2], vec![3, 4], false), - (vec![1, 3], vec![2, 4], vec![1, 3], vec![], false), - (vec![1, 3], vec![2, 4], vec![], vec![2, 4], false), - (vec![1, 3], vec![2, 4], vec![3, 1], vec![4, 2], true), - ]; - - for (s1, s2, s3, s4, exp) in cases { - let mut r1 = metapb::Region::default(); - for (store_id, peer_id) in s1.into_iter().zip(0..) { - r1.mut_peers().push(new_peer(store_id, peer_id)); - } - for (store_id, peer_id) in s2.into_iter().zip(0..) { - r1.mut_peers().push(new_learner_peer(store_id, peer_id)); - } - - let mut r2 = metapb::Region::default(); - for (store_id, peer_id) in s3.into_iter().zip(10..) { - r2.mut_peers().push(new_peer(store_id, peer_id)); - } - for (store_id, peer_id) in s4.into_iter().zip(10..) 
{ - r2.mut_peers().push(new_learner_peer(store_id, peer_id)); - } - let res = super::region_on_same_stores(&r1, &r2); - assert_eq!(res, exp, "{:?} vs {:?}", r1, r2); - } - } - fn split(mut r: metapb::Region, key: &[u8]) -> (metapb::Region, metapb::Region) { let mut r2 = r.clone(); r.set_end_key(key.to_owned()); @@ -1813,34 +2170,34 @@ mod tests { #[test] fn test_check_store_id() { - let mut req = RaftCmdRequest::default(); - req.mut_header().mut_peer().set_store_id(1); - check_store_id(&req, 1).unwrap(); - check_store_id(&req, 2).unwrap_err(); + let mut header = RaftRequestHeader::default(); + header.mut_peer().set_store_id(1); + check_store_id(&header, 1).unwrap(); + check_store_id(&header, 2).unwrap_err(); } #[test] fn test_check_peer_id() { - let mut req = RaftCmdRequest::default(); - req.mut_header().mut_peer().set_id(1); - check_peer_id(&req, 1).unwrap(); - check_peer_id(&req, 2).unwrap_err(); + let mut header = RaftRequestHeader::default(); + header.mut_peer().set_id(1); + check_peer_id(&header, 1).unwrap(); + check_peer_id(&header, 2).unwrap_err(); } #[test] fn test_check_term() { - let mut req = RaftCmdRequest::default(); - req.mut_header().set_term(7); - check_term(&req, 7).unwrap(); - check_term(&req, 8).unwrap(); + let mut header = RaftRequestHeader::default(); + header.set_term(7); + check_term(&header, 7).unwrap(); + check_term(&header, 8).unwrap(); // If header's term is 2 verions behind current term, // leadership may have been changed away. - check_term(&req, 9).unwrap_err(); - check_term(&req, 10).unwrap_err(); + check_term(&header, 9).unwrap_err(); + check_term(&header, 10).unwrap_err(); } #[test] - fn test_check_region_epoch() { + fn test_check_req_region_epoch() { let mut epoch = RegionEpoch::default(); epoch.set_conf_ver(2); epoch.set_version(2); @@ -1848,7 +2205,7 @@ mod tests { region.set_region_epoch(epoch.clone()); // Epoch is required for most requests even if it's empty. 
- check_region_epoch(&RaftCmdRequest::default(), ®ion, false).unwrap_err(); + check_req_region_epoch(&RaftCmdRequest::default(), ®ion, false).unwrap_err(); // These admin commands do not require epoch. for ty in &[ @@ -1863,11 +2220,11 @@ mod tests { req.set_admin_request(admin); // It is Okay if req does not have region epoch. - check_region_epoch(&req, ®ion, false).unwrap(); + check_req_region_epoch(&req, ®ion, false).unwrap(); req.mut_header().set_region_epoch(epoch.clone()); - check_region_epoch(&req, ®ion, true).unwrap(); - check_region_epoch(&req, ®ion, false).unwrap(); + check_req_region_epoch(&req, ®ion, true).unwrap(); + check_req_region_epoch(&req, ®ion, false).unwrap(); } // These admin commands requires epoch.version. @@ -1885,7 +2242,7 @@ mod tests { req.set_admin_request(admin); // Error if req does not have region epoch. - check_region_epoch(&req, ®ion, false).unwrap_err(); + check_req_region_epoch(&req, ®ion, false).unwrap_err(); let mut stale_version_epoch = epoch.clone(); stale_version_epoch.set_version(1); @@ -1893,14 +2250,14 @@ mod tests { stale_region.set_region_epoch(stale_version_epoch.clone()); req.mut_header() .set_region_epoch(stale_version_epoch.clone()); - check_region_epoch(&req, &stale_region, false).unwrap(); + check_req_region_epoch(&req, &stale_region, false).unwrap(); let mut latest_version_epoch = epoch.clone(); latest_version_epoch.set_version(3); for epoch in &[stale_version_epoch, latest_version_epoch] { req.mut_header().set_region_epoch(epoch.clone()); - check_region_epoch(&req, ®ion, false).unwrap_err(); - check_region_epoch(&req, ®ion, true).unwrap_err(); + check_req_region_epoch(&req, ®ion, false).unwrap_err(); + check_req_region_epoch(&req, ®ion, true).unwrap_err(); } } @@ -1921,21 +2278,21 @@ mod tests { req.set_admin_request(admin); // Error if req does not have region epoch. 
- check_region_epoch(&req, ®ion, false).unwrap_err(); + check_req_region_epoch(&req, ®ion, false).unwrap_err(); let mut stale_conf_epoch = epoch.clone(); stale_conf_epoch.set_conf_ver(1); let mut stale_region = metapb::Region::default(); stale_region.set_region_epoch(stale_conf_epoch.clone()); req.mut_header().set_region_epoch(stale_conf_epoch.clone()); - check_region_epoch(&req, &stale_region, false).unwrap(); + check_req_region_epoch(&req, &stale_region, false).unwrap(); let mut latest_conf_epoch = epoch.clone(); latest_conf_epoch.set_conf_ver(3); for epoch in &[stale_conf_epoch, latest_conf_epoch] { req.mut_header().set_region_epoch(epoch.clone()); - check_region_epoch(&req, ®ion, false).unwrap_err(); - check_region_epoch(&req, ®ion, true).unwrap_err(); + check_req_region_epoch(&req, ®ion, false).unwrap_err(); + check_req_region_epoch(&req, ®ion, true).unwrap_err(); } } } @@ -1957,7 +2314,8 @@ mod tests { } let cap = 10; - let rrp = RegionReadProgress::new(&Default::default(), 10, cap, "".to_owned()); + let mut region = Region::default(); + let rrp = RegionReadProgress::new(®ion, 10, cap, 1); for i in 1..=20 { rrp.update_safe_ts(i, i); } @@ -1965,7 +2323,8 @@ mod tests { assert_eq!(rrp.safe_ts(), 10); assert_eq!(pending_items_num(&rrp), 10); - rrp.update_applied(20); + let coprocessor_host = CoprocessorHost::::default(); + rrp.update_applied(20, &coprocessor_host); assert_eq!(rrp.safe_ts(), 20); assert_eq!(pending_items_num(&rrp), 0); @@ -1977,7 +2336,7 @@ mod tests { assert!(pending_items_num(&rrp) <= cap); // `applied_index` large than all pending items will clear all pending items - rrp.update_applied(200); + rrp.update_applied(200, &coprocessor_host); assert_eq!(rrp.safe_ts(), 199); assert_eq!(pending_items_num(&rrp), 0); @@ -1991,9 +2350,9 @@ mod tests { rrp.update_safe_ts(301, 600); assert_eq!(pending_items_num(&rrp), 2); // `safe_ts` will update to 500 instead of 300 - rrp.update_applied(300); + rrp.update_applied(300, &coprocessor_host); 
assert_eq!(rrp.safe_ts(), 500); - rrp.update_applied(301); + rrp.update_applied(301, &coprocessor_host); assert_eq!(rrp.safe_ts(), 600); assert_eq!(pending_items_num(&rrp), 0); @@ -2003,5 +2362,20 @@ mod tests { rrp.update_safe_ts(400, 0); rrp.update_safe_ts(0, 700); assert_eq!(pending_items_num(&rrp), 0); + + // update leader info, epoch + region.mut_region_epoch().version += 1; + rrp.update_leader_info(1, 5, ®ion); + assert_eq!( + rrp.core.lock().unwrap().get_local_leader_info().epoch, + *region.get_region_epoch(), + ); + // update leader info, peers + region.mut_peers().push(new_peer(1, 2)); + rrp.update_leader_info(1, 5, ®ion); + assert_eq!( + rrp.core.lock().unwrap().get_local_leader_info().peers, + *region.get_peers(), + ); } } diff --git a/components/raftstore/src/store/worker/check_leader.rs b/components/raftstore/src/store/worker/check_leader.rs index d5fd6f2c007..c4646de35a4 100644 --- a/components/raftstore/src/store/worker/check_leader.rs +++ b/components/raftstore/src/store/worker/check_leader.rs @@ -1,21 +1,27 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - collections::Bound::{Excluded, Unbounded}, fmt, sync::{Arc, Mutex}, }; +use engine_traits::KvEngine; use fail::fail_point; -use keys::{data_end_key, data_key, enc_start_key}; use kvproto::kvrpcpb::{KeyRange, LeaderInfo}; use tikv_util::worker::Runnable; -use crate::store::{fsm::store::StoreMeta, util::RegionReadProgressRegistry}; +use crate::{ + coprocessor::CoprocessorHost, + store::{fsm::store::StoreRegionMeta, util::RegionReadProgressRegistry}, +}; -pub struct Runner { - store_meta: Arc>, +pub struct Runner +where + E: KvEngine, +{ + store_meta: Arc>, region_read_progress: RegionReadProgressRegistry, + coprocessor: CoprocessorHost, } pub enum Task { @@ -47,16 +53,22 @@ impl fmt::Display for Task { } } -impl Runner { - pub fn new(store_meta: Arc>) -> Runner { - let region_read_progress = store_meta.lock().unwrap().region_read_progress.clone(); +impl Runner +where + S: StoreRegionMeta, + E: KvEngine, +{ + pub fn new(store_meta: Arc>, coprocessor: CoprocessorHost) -> Self { + let region_read_progress = store_meta.lock().unwrap().region_read_progress().clone(); Runner { region_read_progress, store_meta, + coprocessor, } } - // Get the minimal `safe_ts` from regions overlap with the key range [`start_key`, `end_key`) + // Get the minimal `safe_ts` from regions overlap with the key range + // [`start_key`, `end_key`) fn get_range_safe_ts(&self, key_range: KeyRange) -> u64 { if key_range.get_start_key().is_empty() && key_range.get_end_key().is_empty() { // Fast path to get the min `safe_ts` of all regions in this store @@ -69,48 +81,44 @@ impl Runner { .unwrap_or(0) }) } else { - let (start_key, end_key) = ( - data_key(key_range.get_start_key()), - data_end_key(key_range.get_end_key()), - ); - // `store_safe_ts` won't be accessed frequently (like per-request or per-transaction), - // also this branch won't entry because the request key range is empty currently (in v5.1) - // keep this branch for robustness and future use, so it is okay getting 
`store_safe_ts` - // from `store_meta` (behide a mutex) + // `store_safe_ts` won't be accessed frequently (like per-request or + // per-transaction), also this branch won't entry because the request key range + // is empty currently (in v5.1) keep this branch for robustness and future use, + // so it is okay getting `store_safe_ts` from `store_meta` (behide a mutex) let meta = self.store_meta.lock().unwrap(); - meta.region_read_progress.with(|registry| { - meta.region_ranges - // get overlapped regions - .range((Excluded(start_key), Unbounded)) - .take_while(|(_, id)| end_key > enc_start_key(&meta.regions[id])) - // get the min `safe_ts` - .map(|(_, id)| { - registry.get(id).unwrap().safe_ts() - }) - .filter(|ts| *ts != 0) // ts == 0 means the peer is uninitialized - .min() - .unwrap_or(0) + meta.region_read_progress().with(|registry| { + let mut min_ts = u64::MAX; + meta.search_region(key_range.get_start_key(), key_range.get_end_key(), |r| { + let ts = registry.get(&r.get_id()).unwrap().safe_ts(); + // ts == 0 means the peer is uninitialized + if ts != 0 && ts < min_ts { + min_ts = ts; + } + }); + if min_ts == u64::MAX { 0 } else { min_ts } }) } } } -impl Runnable for Runner { +impl Runnable for Runner { type Task = Task; fn run(&mut self, task: Task) { match task { Task::CheckLeader { leaders, cb } => { fail_point!( "before_check_leader_store_2", - self.store_meta.lock().unwrap().store_id == Some(2), + self.store_meta.lock().unwrap().store_id() == 2, |_| {} ); fail_point!( "before_check_leader_store_3", - self.store_meta.lock().unwrap().store_id == Some(3), + self.store_meta.lock().unwrap().store_id() == 3, |_| {} ); - let regions = self.region_read_progress.handle_check_leaders(leaders); + let regions = self + .region_read_progress + .handle_check_leaders(leaders, &self.coprocessor); cb(regions); } Task::GetStoreTs { key_range, cb } => { @@ -123,11 +131,12 @@ impl Runnable for Runner { #[cfg(test)] mod tests { + use engine_test::kv::KvTestEngine; use 
keys::enc_end_key; use kvproto::metapb::Region; use super::*; - use crate::store::util::RegionReadProgress; + use crate::store::{fsm::StoreMeta, util::RegionReadProgress}; #[test] fn test_get_range_min_safe_ts() { @@ -138,7 +147,7 @@ mod tests { region.set_start_key(kr.get_start_key().to_vec()); region.set_end_key(kr.get_end_key().to_vec()); region.set_peers(vec![kvproto::metapb::Peer::default()].into()); - let rrp = RegionReadProgress::new(®ion, 1, 1, "".to_owned()); + let rrp = RegionReadProgress::new(®ion, 1, 1, 1); rrp.update_safe_ts(1, safe_ts); assert_eq!(rrp.safe_ts(), safe_ts); meta.region_ranges.insert(enc_end_key(®ion), id); @@ -154,7 +163,8 @@ mod tests { } let meta = Arc::new(Mutex::new(StoreMeta::new(0))); - let runner = Runner::new(meta.clone()); + let coprocessor_host = CoprocessorHost::::default(); + let runner = Runner::new(meta.clone(), coprocessor_host); assert_eq!(0, runner.get_range_safe_ts(key_range(b"", b""))); add_region(&meta, 1, key_range(b"", b"k1"), 100); assert_eq!(100, runner.get_range_safe_ts(key_range(b"", b""))); diff --git a/components/raftstore/src/store/worker/cleanup_snapshot.rs b/components/raftstore/src/store/worker/cleanup_snapshot.rs index 07d2ac001d4..c84d6ddb4d3 100644 --- a/components/raftstore/src/store/worker/cleanup_snapshot.rs +++ b/components/raftstore/src/store/worker/cleanup_snapshot.rs @@ -25,7 +25,7 @@ pub enum Task { impl fmt::Display for Task { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match &*self { + match self { Task::GcSnapshot => write!(f, "Gc Snapshot"), Task::DeleteSnapshotFiles { key, .. 
} => write!(f, "Delete Snapshot Files for {}", key), } diff --git a/components/raftstore/src/store/worker/compact.rs b/components/raftstore/src/store/worker/compact.rs index afa4d609da1..7bc7052b277 100644 --- a/components/raftstore/src/store/worker/compact.rs +++ b/components/raftstore/src/store/worker/compact.rs @@ -23,9 +23,12 @@ pub enum Task { }, CheckAndCompact { - cf_names: Vec, // Column families need to compact - ranges: Vec, // Ranges need to check - tombstones_num_threshold: u64, // The minimum RocksDB tombstones a range that need compacting has + // Column families need to compact + cf_names: Vec, + // Ranges need to check + ranges: Vec, + // The minimum RocksDB tombstones a range that need compacting has + tombstones_num_threshold: u64, tombstones_percent_threshold: u64, }, } @@ -105,7 +108,7 @@ where .start_coarse_timer(); box_try!( self.engine - .compact_range(cf_name, start_key, end_key, false, 1 /* threads */,) + .compact_range_cf(cf_name, start_key, end_key, false, 1 /* threads */,) ); compact_range_timer.observe_duration(); info!( @@ -181,7 +184,8 @@ fn need_compact( return false; } - // When the number of tombstones exceed threshold and ratio, this range need compacting. + // When the number of tombstones exceed threshold and ratio, this range need + // compacting. let estimate_num_del = num_entires - num_versions; estimate_num_del >= tombstones_num_threshold && estimate_num_del * 100 >= tombstones_percent_threshold * num_entires @@ -193,14 +197,15 @@ fn collect_ranges_need_compact( tombstones_num_threshold: u64, tombstones_percent_threshold: u64, ) -> Result, Error> { - // Check the SST properties for each range, and TiKV will compact a range if the range - // contains too many RocksDB tombstones. TiKV will merge multiple neighboring ranges - // that need compacting into a single range. + // Check the SST properties for each range, and TiKV will compact a range if the + // range contains too many RocksDB tombstones. 
TiKV will merge multiple + // neighboring ranges that need compacting into a single range. let mut ranges_need_compact = VecDeque::new(); let mut compact_start = None; let mut compact_end = None; for range in ranges.windows(2) { - // Get total entries and total versions in this range and checks if it needs to be compacted. + // Get total entries and total versions in this range and checks if it needs to + // be compacted. if let Some((num_ent, num_ver)) = box_try!(engine.get_range_entries_and_versions(CF_WRITE, &range[0], &range[1])) { @@ -220,7 +225,8 @@ fn collect_ranges_need_compact( } } - // Current range doesn't need compacting, save previous range that need compacting. + // Current range doesn't need compacting, save previous range that need + // compacting. if compact_start.is_some() { assert!(compact_end.is_some()); } @@ -247,7 +253,7 @@ mod tests { use std::{thread::sleep, time::Duration}; use engine_test::{ - ctor::{CFOptions, ColumnFamilyOptions, DBOptions}, + ctor::{CfOptions, DbOptions}, kv::{new_engine, new_engine_opt, KvTestEngine}, }; use engine_traits::{ @@ -266,7 +272,7 @@ mod tests { .prefix("compact-range-test") .tempdir() .unwrap(); - let db = new_engine(path.path().to_str().unwrap(), None, &[CF_DEFAULT], None).unwrap(); + let db = new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT]).unwrap(); let mut runner = Runner::new(db.clone()); @@ -319,14 +325,14 @@ mod tests { } fn open_db(path: &str) -> KvTestEngine { - let db_opts = DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(8); let cfs_opts = vec![ - CFOptions::new(CF_DEFAULT, ColumnFamilyOptions::new()), - CFOptions::new(CF_RAFT, ColumnFamilyOptions::new()), - CFOptions::new(CF_LOCK, ColumnFamilyOptions::new()), - CFOptions::new(CF_WRITE, cf_opts), + (CF_DEFAULT, CfOptions::new()), + (CF_RAFT, CfOptions::new()), + (CF_LOCK, CfOptions::new()), + 
(CF_WRITE, cf_opts), ]; new_engine_opt(path, db_opts, cfs_opts).unwrap() } diff --git a/components/raftstore/src/store/worker/consistency_check.rs b/components/raftstore/src/store/worker/consistency_check.rs index dfd2b527168..fef2bae332c 100644 --- a/components/raftstore/src/store/worker/consistency_check.rs +++ b/components/raftstore/src/store/worker/consistency_check.rs @@ -9,8 +9,8 @@ use tikv_util::{error, info, warn, worker::Runnable}; use super::metrics::*; use crate::{ - coprocessor::CoprocessorHost, - store::{metrics::*, CasualMessage, CasualRouter}, + coprocessor::{dispatcher::StoreHandle, CoprocessorHost}, + store::metrics::*, }; /// Consistency checking task. @@ -44,12 +44,12 @@ impl Display for Task { } } -pub struct Runner> { +pub struct Runner { router: C, coprocessor_host: CoprocessorHost, } -impl> Runner { +impl Runner { pub fn new(router: C, cop_host: CoprocessorHost) -> Runner { Runner { router, @@ -85,18 +85,8 @@ impl> Runner { for (ctx, sum) in hashes { let mut checksum = Vec::with_capacity(4); checksum.write_u32::(sum).unwrap(); - let msg = CasualMessage::ComputeHashResult { - index, - context: ctx, - hash: checksum, - }; - if let Err(e) = self.router.send(region.get_id(), msg) { - warn!( - "failed to send hash compute result"; - "region_id" => region.get_id(), - "err" => %e, - ); - } + self.router + .update_compute_hash_result(region.get_id(), index, ctx, checksum); } timer.observe_duration(); @@ -106,7 +96,7 @@ impl> Runner { impl Runnable for Runner where EK: KvEngine, - C: CasualRouter, + C: StoreHandle, { type Task = Task; @@ -124,30 +114,25 @@ where #[cfg(test)] mod tests { - use std::{sync::mpsc, time::Duration}; + use std::{assert_matches::assert_matches, sync::mpsc, time::Duration}; use byteorder::{BigEndian, WriteBytesExt}; use engine_test::kv::{new_engine, KvTestEngine}; - use engine_traits::{KvEngine, SyncMutable, CF_DEFAULT, CF_RAFT}; + use engine_traits::{KvEngine, SyncMutable, ALL_CFS}; use kvproto::metapb::*; use 
tempfile::Builder; use tikv_util::worker::Runnable; use super::*; use crate::coprocessor::{ - BoxConsistencyCheckObserver, ConsistencyCheckMethod, RawConsistencyCheckObserver, + dispatcher::SchedTask, BoxConsistencyCheckObserver, ConsistencyCheckMethod, + RawConsistencyCheckObserver, }; #[test] fn test_consistency_check() { let path = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); - let db = new_engine( - path.path().to_str().unwrap(), - None, - &[CF_DEFAULT, CF_RAFT], - None, - ) - .unwrap(); + let db = new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); let mut region = Region::default(); region.mut_peers().push(Peer::default()); @@ -183,21 +168,8 @@ mod tests { checksum_bytes.write_u32::(sum).unwrap(); let res = rx.recv_timeout(Duration::from_secs(3)).unwrap(); - match res { - ( - region_id, - CasualMessage::ComputeHashResult { - index, - hash, - context, - }, - ) => { - assert_eq!(region_id, region.get_id()); - assert_eq!(index, 10); - assert_eq!(context, vec![0]); - assert_eq!(hash, checksum_bytes); - } - e => panic!("unexpected {:?}", e), - } + assert_matches!(res, SchedTask::UpdateComputeHashResult { region_id, index, hash, context} if + region_id == region.get_id() && index == 10 && context == vec![0] && hash == checksum_bytes + ); } } diff --git a/components/raftstore/src/store/worker/metrics.rs b/components/raftstore/src/store/worker/metrics.rs index 75ffc17c72b..36a217be607 100644 --- a/components/raftstore/src/store/worker/metrics.rs +++ b/components/raftstore/src/store/worker/metrics.rs @@ -1,17 +1,27 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. +use std::{cell::RefCell, time::Duration}; + use lazy_static::lazy_static; -use prometheus::*; +use prometheus::{local::LocalIntCounter, *}; use prometheus_static_metric::*; +use tikv_util::time::Instant; make_auto_flush_static_metric! 
{ pub label_enum SnapType { generate, - apply, + apply } + // snapshot task status + // |all---------start--------------| + // | + // | + // V + // |success|abort|fail|delay|ignore| pub label_enum SnapStatus { all, + start, success, abort, fail, @@ -44,14 +54,58 @@ make_static_metric! { epoch, applied_term, channel_full, + cache_miss, safe_ts, + witness, + flashback_not_prepared, + flashback_in_progress, + wait_data, } - pub struct ReadRejectCounter : IntCounter { - "reason" => RejectReason + pub struct LocalReadRejectCounter : LocalIntCounter { + "reason" => RejectReason, } } +pub struct LocalReadMetrics { + pub local_executed_requests: LocalIntCounter, + pub local_executed_stale_read_requests: LocalIntCounter, + pub local_executed_snapshot_cache_hit: LocalIntCounter, + pub reject_reason: LocalReadRejectCounter, + pub renew_lease_advance: LocalIntCounter, + last_flush_time: Instant, +} + +thread_local! { + pub static TLS_LOCAL_READ_METRICS: RefCell = RefCell::new( + LocalReadMetrics { + local_executed_requests: LOCAL_READ_EXECUTED_REQUESTS.local(), + local_executed_stale_read_requests: LOCAL_READ_EXECUTED_STALE_READ_REQUESTS.local(), + local_executed_snapshot_cache_hit: LOCAL_READ_EXECUTED_CACHE_REQUESTS.local(), + reject_reason: LocalReadRejectCounter::from(&LOCAL_READ_REJECT_VEC), + renew_lease_advance: LOCAL_READ_RENEW_LEASE_ADVANCE_COUNTER.local(), + last_flush_time: Instant::now_coarse(), + } + ); +} + +const METRICS_FLUSH_INTERVAL: u64 = 10_000; // 10s + +pub fn maybe_tls_local_read_metrics_flush() { + TLS_LOCAL_READ_METRICS.with(|m| { + let mut m = m.borrow_mut(); + + if m.last_flush_time.saturating_elapsed() >= Duration::from_millis(METRICS_FLUSH_INTERVAL) { + m.local_executed_requests.flush(); + m.local_executed_stale_read_requests.flush(); + m.local_executed_snapshot_cache_hit.flush(); + m.reject_reason.flush(); + m.renew_lease_advance.flush(); + m.last_flush_time = Instant::now_coarse(); + } + }); +} + lazy_static! 
{ pub static ref SNAP_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( "tikv_raftstore_snapshot_total", @@ -72,7 +126,7 @@ lazy_static! { "tikv_raftstore_snapshot_duration_seconds", "Bucketed histogram of raftstore snapshot process duration", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref SNAP_HISTOGRAM: SnapHistogram = @@ -80,7 +134,7 @@ lazy_static! { pub static ref CHECK_SPILT_HISTOGRAM: Histogram = register_histogram!( "tikv_raftstore_check_split_duration_seconds", "Bucketed histogram of raftstore split check duration", - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref COMPACT_RANGE_CF: HistogramVec = register_histogram_vec!( @@ -111,8 +165,6 @@ lazy_static! { &["reason"] ) .unwrap(); - pub static ref LOCAL_READ_REJECT: ReadRejectCounter = - ReadRejectCounter::from(&LOCAL_READ_REJECT_VEC); pub static ref LOCAL_READ_EXECUTED_REQUESTS: IntCounter = register_int_counter!( "tikv_raftstore_local_read_executed_requests", "Total number of requests directly executed by local reader." @@ -139,12 +191,6 @@ lazy_static! { "Total number of seek operations from raft log gc." ) .unwrap(); - pub static ref RAFT_LOG_GC_DELETED_KEYS_HISTOGRAM: Histogram = register_histogram!( - "tikv_raftstore_raft_log_gc_deleted_keys", - "Bucket of number of deleted keys from raft log gc.", - exponential_buckets(1.0, 2.0, 20).unwrap() - ) - .unwrap(); pub static ref RAFT_LOG_GC_FAILED: IntCounter = register_int_counter!( "tikv_raftstore_raft_log_gc_failed", "Total number of failed raft log gc." 
diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index a2ac27eed38..62d27b2e88b 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -6,10 +6,8 @@ mod cleanup_snapshot; mod cleanup_sst; mod compact; mod consistency_check; -mod metrics; +pub mod metrics; mod pd; -mod query_stats; -mod raftlog_fetch; mod raftlog_gc; mod read; mod refresh_config; @@ -18,6 +16,8 @@ mod split_check; mod split_config; mod split_controller; +#[cfg(test)] +pub use self::region::tests::make_raftstore_cfg as make_region_worker_raftstore_cfg; pub use self::{ check_leader::{Runner as CheckLeaderRunner, Task as CheckLeaderTask}, cleanup::{Runner as CleanupRunner, Task as CleanupTask}, @@ -27,12 +27,15 @@ pub use self::{ consistency_check::{Runner as ConsistencyCheckRunner, Task as ConsistencyCheckTask}, pd::{ new_change_peer_v2_request, FlowStatistics, FlowStatsReporter, HeartbeatTask, - Runner as PdRunner, Task as PdTask, + Runner as PdRunner, StatsMonitor as PdStatsMonitor, StoreStatsReporter, Task as PdTask, + NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, }, - query_stats::QueryStats, - raftlog_fetch::{Runner as RaftlogFetchRunner, Task as RaftlogFetchTask}, raftlog_gc::{Runner as RaftlogGcRunner, Task as RaftlogGcTask}, - read::{LocalReader, Progress as ReadProgress, ReadDelegate, ReadExecutor, TrackVer}, + read::{ + CachedReadDelegate, LocalReadContext, LocalReader, LocalReaderCore, + Progress as ReadProgress, ReadDelegate, ReadExecutor, ReadExecutorProvider, + StoreMetaDelegate, TrackVer, + }, refresh_config::{ BatchComponent as RaftStoreBatchComponent, Runner as RefreshConfigRunner, Task as RefreshConfigTask, @@ -41,6 +44,10 @@ pub use self::{ split_check::{ Bucket, BucketRange, KeyEntry, Runner as SplitCheckRunner, Task as SplitCheckTask, }, - split_config::{SplitConfig, SplitConfigManager}, - split_controller::{AutoSplitController, ReadStats, WriteStats}, + split_config::{ + 
SplitConfig, SplitConfigManager, BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + DEFAULT_BIG_REGION_BYTE_THRESHOLD, DEFAULT_BIG_REGION_QPS_THRESHOLD, + DEFAULT_BYTE_THRESHOLD, DEFAULT_QPS_THRESHOLD, REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + }, + split_controller::{AutoSplitController, ReadStats, SplitConfigChange, SplitInfo, WriteStats}, }; diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 44954ba5e01..74fa4d046f1 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -14,10 +14,10 @@ use std::{ time::{Duration, Instant}, }; +use causal_ts::{CausalTsProvider, CausalTsProviderImpl}; use collections::{HashMap, HashSet}; use concurrency_manager::ConcurrencyManager; use engine_traits::{KvEngine, RaftEngine}; -#[cfg(feature = "failpoints")] use fail::fail_point; use futures::{compat::Future01CompatExt, FutureExt}; use grpcio_health::{HealthService, ServingStatus}; @@ -25,44 +25,53 @@ use kvproto::{ kvrpcpb::DiskFullOpt, metapb, pdpb, raft_cmdpb::{ - AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, - SplitRequest, + AdminCmdType, AdminRequest, BatchSwitchWitnessRequest, ChangePeerRequest, + ChangePeerV2Request, RaftCmdRequest, SplitRequest, SwitchWitnessRequest, }, raft_serverpb::RaftMessage, replication_modepb::{RegionReplicationStatus, StoreDrAutoSyncStatus}, }; use ordered_float::OrderedFloat; -use pd_client::{merge_bucket_stats, metrics::*, BucketStat, Error, PdClient, RegionStat}; +use pd_client::{metrics::*, BucketStat, Error, PdClient, RegionStat}; use prometheus::local::LocalHistogram; use raft::eraftpb::ConfChangeType; use resource_metering::{Collector, CollectorGuard, CollectorRegHandle, RawRecords}; use tikv_util::{ box_err, debug, error, info, metrics::ThreadInfoStatistics, + store::QueryStats, + sys::thread::StdThreadBuildWrapper, thd_name, time::{Instant as TiInstant, UnixSecs}, timer::GLOBAL_TIMER_HANDLE, topn::TopN, 
+ trend::{RequestPerSecRecorder, Trend}, warn, worker::{Runnable, RunnableWithTimer, ScheduleError, Scheduler}, }; +use txn_types::TimeStamp; use yatp::Remote; -use crate::store::{ - cmd_resp::new_error, - metrics::*, - peer::{UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryForceLeaderSyncer}, - transport::SignificantRouter, - util::{is_epoch_stale, KeysInfoFormatter, LatencyInspector, RaftstoreDuration}, - worker::{ - query_stats::QueryStats, - split_controller::{SplitInfo, TOP_N}, - AutoSplitController, ReadStats, WriteStats, +use crate::{ + coprocessor::CoprocessorHost, + router::RaftStoreRouter, + store::{ + cmd_resp::new_error, + metrics::*, + peer::{UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryForceLeaderSyncer}, + transport::SignificantRouter, + util::{is_epoch_stale, KeysInfoFormatter, LatencyInspector, RaftstoreDuration}, + worker::{ + split_controller::{SplitInfo, TOP_N}, + AutoSplitController, ReadStats, SplitConfigChange, WriteStats, + }, + Callback, CasualMessage, Config, PeerMsg, RaftCmdExtraOpts, RaftCommand, RaftRouter, + RegionReadProgressRegistry, SignificantMsg, SnapManager, StoreInfo, StoreMsg, TxnExt, }, - Callback, CasualMessage, Config, PeerMsg, RaftCmdExtraOpts, RaftCommand, RaftRouter, - RegionReadProgressRegistry, SignificantMsg, SnapManager, StoreInfo, StoreMsg, TxnExt, }; +pub const NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT: u32 = 2; + type RecordPairVec = Vec; #[derive(Default, Debug, Clone)] @@ -115,6 +124,7 @@ pub struct HeartbeatTask { pub approximate_size: Option, pub approximate_keys: Option, pub replication_status: Option, + pub wait_data_peers: Vec, } /// Uses an asynchronous thread to tell PD something. 
@@ -145,7 +155,7 @@ where Heartbeat(HeartbeatTask), StoreHeartbeat { stats: pdpb::StoreStats, - store_info: StoreInfo, + store_info: Option>, report: Option, dr_autosync_status: Option, }, @@ -182,8 +192,8 @@ where id: u64, duration: RaftstoreDuration, }, - RegionCPURecords(Arc), - ReportMinResolvedTS { + RegionCpuRecords(Arc), + ReportMinResolvedTs { store_id: u64, min_resolved_ts: u64, }, @@ -197,6 +207,9 @@ pub struct StoreStat { pub engine_last_total_bytes_read: u64, pub engine_last_total_keys_read: u64, pub engine_last_query_num: QueryStats, + pub engine_last_capacity_size: u64, + pub engine_last_used_size: u64, + pub engine_last_available_size: u64, pub last_report_ts: UnixSecs, pub region_bytes_read: LocalHistogram, @@ -222,6 +235,9 @@ impl Default for StoreStat { engine_total_keys_read: 0, engine_last_total_bytes_read: 0, engine_last_total_keys_read: 0, + engine_last_capacity_size: 0, + engine_last_used_size: 0, + engine_last_available_size: 0, engine_total_query_num: QueryStats::default(), engine_last_query_num: QueryStats::default(), @@ -253,7 +269,7 @@ pub struct PeerStat { } #[derive(Default)] -pub struct ReportBucket { +struct ReportBucket { current_stat: BucketStat, last_report_stat: Option, last_report_ts: UnixSecs, @@ -271,17 +287,9 @@ impl ReportBucket { self.last_report_ts = report_ts; match self.last_report_stat.replace(self.current_stat.clone()) { Some(last) => { - let mut delta = BucketStat::new( - self.current_stat.meta.clone(), - pd_client::new_bucket_stats(&self.current_stat.meta), - ); + let mut delta = BucketStat::from_meta(self.current_stat.meta.clone()); // Buckets may be changed, recalculate last stats according to current meta. 
- merge_bucket_stats( - &delta.meta.keys, - &mut delta.stats, - &last.meta.keys, - &last.stats, - ); + delta.merge(&last); for i in 0..delta.meta.keys.len() - 1 { delta.stats.write_bytes[i] = self.current_stat.stats.write_bytes[i] - delta.stats.write_bytes[i]; @@ -348,7 +356,7 @@ where log_wrappers::Value::key(split_key), ), Task::AutoSplit { ref split_infos } => { - write!(f, "auto split split regions, num is {}", split_infos.len(),) + write!(f, "auto split split regions, num is {}", split_infos.len()) } Task::AskBatchSplit { ref region, @@ -404,10 +412,10 @@ where Task::UpdateSlowScore { id, ref duration } => { write!(f, "compute slow score: id {}, duration {:?}", id, duration) } - Task::RegionCPURecords(ref cpu_records) => { + Task::RegionCpuRecords(ref cpu_records) => { write!(f, "get region cpu records: {:?}", cpu_records) } - Task::ReportMinResolvedTS { + Task::ReportMinResolvedTs { store_id, min_resolved_ts, } => { @@ -428,7 +436,6 @@ const DEFAULT_LOAD_BASE_SPLIT_CHECK_INTERVAL: Duration = Duration::from_secs(1); const DEFAULT_COLLECT_TICK_INTERVAL: Duration = Duration::from_secs(1); fn default_collect_tick_interval() -> Duration { - #[cfg(feature = "failpoints")] fail_point!("mock_collect_tick_interval", |_| { Duration::from_millis(1) }); @@ -436,10 +443,12 @@ fn default_collect_tick_interval() -> Duration { } fn config(interval: Duration) -> Duration { - #[cfg(feature = "failpoints")] fail_point!("mock_min_resolved_ts_interval", |_| { Duration::from_millis(50) }); + fail_point!("mock_min_resolved_ts_interval_disable", |_| { + Duration::from_millis(0) + }); interval } @@ -455,36 +464,104 @@ fn convert_record_pairs(m: HashMap) -> RecordPairVec { .collect() } -struct StatsMonitor +#[derive(Clone)] +pub struct WrappedScheduler(Scheduler>); + +impl Collector for WrappedScheduler where EK: KvEngine, ER: RaftEngine, { - scheduler: Scheduler>, + fn collect(&self, records: Arc) { + self.0.schedule(Task::RegionCpuRecords(records)).ok(); + } +} + +pub trait 
StoreStatsReporter: Send + Clone + Sync + 'static + Collector { + fn report_store_infos( + &self, + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + ); + fn report_min_resolved_ts(&self, store_id: u64, min_resolved_ts: u64); + fn auto_split(&self, split_infos: Vec); +} + +impl StoreStatsReporter for WrappedScheduler +where + EK: KvEngine, + ER: RaftEngine, +{ + fn report_store_infos( + &self, + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + ) { + let task = Task::StoreInfos { + cpu_usages, + read_io_rates, + write_io_rates, + }; + if let Err(e) = self.0.schedule(task) { + error!( + "failed to send store infos to pd worker"; + "err" => ?e, + ); + } + } + + fn report_min_resolved_ts(&self, store_id: u64, min_resolved_ts: u64) { + let task = Task::ReportMinResolvedTs { + store_id, + min_resolved_ts, + }; + if let Err(e) = self.0.schedule(task) { + error!( + "failed to send min resolved ts to pd worker"; + "err" => ?e, + ); + } + } + + fn auto_split(&self, split_infos: Vec) { + let task = Task::AutoSplit { split_infos }; + if let Err(e) = self.0.schedule(task) { + error!( + "failed to send split infos to pd worker"; + "err" => ?e, + ); + } + } +} + +pub struct StatsMonitor +where + T: StoreStatsReporter, +{ + reporter: T, handle: Option>, timer: Option>, read_stats_sender: Option>, + cpu_stats_sender: Option>>, collect_store_infos_interval: Duration, load_base_split_check_interval: Duration, collect_tick_interval: Duration, report_min_resolved_ts_interval: Duration, } -impl StatsMonitor +impl StatsMonitor where - EK: KvEngine, - ER: RaftEngine, + T: StoreStatsReporter, { - pub fn new( - interval: Duration, - report_min_resolved_ts_interval: Duration, - scheduler: Scheduler>, - ) -> Self { + pub fn new(interval: Duration, report_min_resolved_ts_interval: Duration, reporter: T) -> Self { StatsMonitor { - scheduler, + reporter, handle: None, timer: None, read_stats_sender: None, + 
cpu_stats_sender: None, collect_store_infos_interval: interval, load_base_split_check_interval: cmp::min( DEFAULT_LOAD_BASE_SPLIT_CHECK_INTERVAL, @@ -501,11 +578,10 @@ where &mut self, mut auto_split_controller: AutoSplitController, region_read_progress: RegionReadProgressRegistry, + collector_reg_handle: CollectorRegHandle, store_id: u64, ) -> Result<(), io::Error> { - if self.collect_tick_interval < default_collect_tick_interval() - || self.collect_store_infos_interval < self.collect_tick_interval - { + if self.collect_tick_interval < default_collect_tick_interval() { info!( "interval is too small, skip stats monitoring. If we are running tests, it is normal, otherwise a check is needed." ); @@ -529,7 +605,10 @@ where let (read_stats_sender, read_stats_receiver) = mpsc::channel(); self.read_stats_sender = Some(read_stats_sender); - let scheduler = self.scheduler.clone(); + let (cpu_stats_sender, cpu_stats_receiver) = mpsc::channel(); + self.cpu_stats_sender = Some(cpu_stats_sender); + + let reporter = self.reporter.clone(); let props = tikv_util::thread_group::current_properties(); fn is_enable_tick(timer_cnt: u64, interval: u64) -> bool { @@ -537,28 +616,47 @@ where } let h = Builder::new() .name(thd_name!("stats-monitor")) - .spawn(move || { + .spawn_wrapper(move || { tikv_util::thread_group::set_properties(props); tikv_alloc::add_thread_memory_accessor(); - let mut thread_stats = ThreadInfoStatistics::new(); + // Create different `ThreadInfoStatistics` for different purposes to + // make sure the record won't be disturbed. + let mut collect_store_infos_thread_stats = ThreadInfoStatistics::new(); + let mut load_base_split_thread_stats = ThreadInfoStatistics::new(); + let mut region_cpu_records_collector = None; + // Register the region CPU records collector. 
+ if auto_split_controller + .cfg + .region_cpu_overload_threshold_ratio + > 0.0 + { + region_cpu_records_collector = + Some(collector_reg_handle.register(Box::new(reporter.clone()), false)); + } while let Err(mpsc::RecvTimeoutError::Timeout) = timer_rx.recv_timeout(tick_interval) { if is_enable_tick(timer_cnt, collect_store_infos_interval) { - StatsMonitor::collect_store_infos(&mut thread_stats, &scheduler); + StatsMonitor::collect_store_infos( + &mut collect_store_infos_thread_stats, + &reporter, + ); } if is_enable_tick(timer_cnt, load_base_split_check_interval) { StatsMonitor::load_base_split( &mut auto_split_controller, &read_stats_receiver, - &scheduler, + &cpu_stats_receiver, + &mut load_base_split_thread_stats, + &reporter, + &collector_reg_handle, + &mut region_cpu_records_collector, ); } if is_enable_tick(timer_cnt, report_min_resolved_ts_interval) { - StatsMonitor::report_min_resolved_ts( - ®ion_read_progress, + reporter.report_min_resolved_ts( store_id, - &scheduler, + region_read_progress.get_min_resolved_ts(), ); } timer_cnt += 1; @@ -570,95 +668,90 @@ where Ok(()) } - pub fn collect_store_infos( - thread_stats: &mut ThreadInfoStatistics, - scheduler: &Scheduler>, - ) { + pub fn collect_store_infos(thread_stats: &mut ThreadInfoStatistics, reporter: &T) { thread_stats.record(); let cpu_usages = convert_record_pairs(thread_stats.get_cpu_usages()); let read_io_rates = convert_record_pairs(thread_stats.get_read_io_rates()); let write_io_rates = convert_record_pairs(thread_stats.get_write_io_rates()); - let task = Task::StoreInfos { - cpu_usages, - read_io_rates, - write_io_rates, - }; - if let Err(e) = scheduler.schedule(task) { - error!( - "failed to send store infos to pd worker"; - "err" => ?e, - ); - } + reporter.report_store_infos(cpu_usages, read_io_rates, write_io_rates); } pub fn load_base_split( auto_split_controller: &mut AutoSplitController, - receiver: &Receiver, - scheduler: &Scheduler>, + read_stats_receiver: &Receiver, + cpu_stats_receiver: 
&Receiver>, + thread_stats: &mut ThreadInfoStatistics, + reporter: &T, + collector_reg_handle: &CollectorRegHandle, + region_cpu_records_collector: &mut Option, ) { - auto_split_controller.refresh_cfg(); - let mut others = vec![]; - while let Ok(other) = receiver.try_recv() { - others.push(other); + let start_time = TiInstant::now(); + match auto_split_controller.refresh_and_check_cfg() { + SplitConfigChange::UpdateRegionCpuCollector(is_register) => { + // If it's a deregister task, just take and drop the original collector. + if !is_register { + region_cpu_records_collector.take(); + } else { + region_cpu_records_collector.get_or_insert( + collector_reg_handle.register(Box::new(reporter.clone()), false), + ); + } + } + SplitConfigChange::Noop => {} } - let (top, split_infos) = auto_split_controller.flush(others); - auto_split_controller.clear(); - let task = Task::AutoSplit { split_infos }; - if let Err(e) = scheduler.schedule(task) { - error!( - "failed to send split infos to pd worker"; - "err" => ?e, - ); + let mut read_stats_vec = vec![]; + while let Ok(read_stats) = read_stats_receiver.try_recv() { + read_stats_vec.push(read_stats); + } + let mut cpu_stats_vec = vec![]; + while let Ok(cpu_stats) = cpu_stats_receiver.try_recv() { + cpu_stats_vec.push(cpu_stats); } + thread_stats.record(); + let (top_qps, split_infos) = + auto_split_controller.flush(read_stats_vec, cpu_stats_vec, thread_stats); + auto_split_controller.clear(); + reporter.auto_split(split_infos); for i in 0..TOP_N { - if i < top.len() { + if i < top_qps.len() { READ_QPS_TOPN .with_label_values(&[&i.to_string()]) - .set(top[i] as f64); + .set(top_qps[i] as f64); } else { READ_QPS_TOPN.with_label_values(&[&i.to_string()]).set(0.0); } } - } - - pub fn report_min_resolved_ts( - region_read_progress: &RegionReadProgressRegistry, - store_id: u64, - scheduler: &Scheduler>, - ) { - let min_resolved_ts = region_read_progress.with(|registry| { - registry - .iter() - .map(|(_, rrp)| rrp.safe_ts()) - 
.filter(|ts| *ts != 0) // ts == 0 means the peer is uninitialized - .min() - .unwrap_or(0) - }); - let task = Task::ReportMinResolvedTS { - store_id, - min_resolved_ts, - }; - if let Err(e) = scheduler.schedule(task) { - error!( - "failed to send min resolved ts to pd worker"; - "err" => ?e, - ); - } + LOAD_BASE_SPLIT_DURATION_HISTOGRAM.observe(start_time.saturating_elapsed_secs()); } pub fn stop(&mut self) { if let Some(h) = self.handle.take() { drop(self.timer.take()); drop(self.read_stats_sender.take()); + drop(self.cpu_stats_sender.take()); if let Err(e) = h.join() { error!("join stats collector failed"; "err" => ?e); } } } - pub fn get_read_stats_sender(&self) -> &Option> { - &self.read_stats_sender + #[inline] + pub fn maybe_send_read_stats(&self, read_stats: ReadStats) { + if let Some(sender) = &self.read_stats_sender { + if sender.send(read_stats).is_err() { + warn!("send read_stats failed, are we shutting down?") + } + } + } + + #[inline] + pub fn maybe_send_cpu_stats(&self, cpu_stats: &Arc) { + if let Some(sender) = &self.cpu_stats_sender { + if sender.send(cpu_stats.clone()).is_err() { + warn!("send region cpu info failed, are we shutting down?") + } + } } } @@ -669,31 +762,32 @@ const HOTSPOT_REPORT_CAPACITY: usize = 1000; // TODO: support dynamic configure threshold in future. fn hotspot_key_report_threshold() -> u64 { - #[cfg(feature = "failpoints")] fail_point!("mock_hotspot_threshold", |_| { 0 }); HOTSPOT_KEY_RATE_THRESHOLD * 10 } fn hotspot_byte_report_threshold() -> u64 { - #[cfg(feature = "failpoints")] fail_point!("mock_hotspot_threshold", |_| { 0 }); HOTSPOT_BYTE_RATE_THRESHOLD * 10 } fn hotspot_query_num_report_threshold() -> u64 { - #[cfg(feature = "failpoints")] fail_point!("mock_hotspot_threshold", |_| { 0 }); HOTSPOT_QUERY_RATE_THRESHOLD * 10 } -// Slow score is a value that represents the speed of a store and ranges in [1, 100]. -// It is maintained in the AIMD way. 
-// If there are some inspecting requests timeout during a round, by default the score -// will be increased at most 1x when above 10% inspecting requests timeout. -// If there is not any timeout inspecting requests, the score will go back to 1 in at least 5min. +/// Max limitation of delayed store_heartbeat. +const STORE_HEARTBEAT_DELAY_LIMIT: u64 = 5 * 60; + +// Slow score is a value that represents the speed of a store and ranges in [1, +// 100]. It is maintained in the AIMD way. +// If there are some inspecting requests timeout during a round, by default the +// score will be increased at most 1x when above 10% inspecting requests +// timeout. If there is not any timeout inspecting requests, the score will go +// back to 1 in at least 5min. struct SlowScore { value: OrderedFloat, last_record_time: Instant, @@ -784,36 +878,9 @@ impl SlowScore { self.last_update_time = Instant::now(); self.value } -} - -// RegionCPUMeteringCollector is used to collect the region-related CPU info. -struct RegionCPUMeteringCollector -where - EK: KvEngine, - ER: RaftEngine, -{ - scheduler: Scheduler>, -} - -impl RegionCPUMeteringCollector -where - EK: KvEngine, - ER: RaftEngine, -{ - fn new(scheduler: Scheduler>) -> RegionCPUMeteringCollector { - RegionCPUMeteringCollector { scheduler } - } -} -impl Collector for RegionCPUMeteringCollector -where - EK: KvEngine, - ER: RaftEngine, -{ - fn collect(&self, records: Arc) { - self.scheduler - .schedule(Task::RegionCPURecords(records)) - .ok(); + fn should_force_report_slow_store(&self) -> bool { + self.value >= OrderedFloat(100.0) && (self.last_tick_id % self.round_ticks == 0) } } @@ -837,9 +904,9 @@ where // actually it is the sender connected to Runner's Worker which // calls Runner's run() on Task received. 
scheduler: Scheduler>, - stats_monitor: StatsMonitor, + stats_monitor: StatsMonitor>, + store_heartbeat_interval: Duration, - _region_cpu_records_collector: CollectorGuard, // region_id -> total_cpu_time_ms (since last region heartbeat) region_cpu_records: HashMap, @@ -847,10 +914,15 @@ where snap_mgr: SnapManager, remote: Remote, slow_score: SlowScore, + slow_trend_cause: Trend, + slow_trend_result: Trend, + slow_trend_result_recorder: RequestPerSecRecorder, // The health status of the store is updated by the slow score mechanism. health_service: Option, curr_health_status: ServingStatus, + coprocessor_host: CoprocessorHost, + causal_ts_provider: Option>, // used for rawkv apiv2 } impl Runner @@ -859,15 +931,12 @@ where ER: RaftEngine, T: PdClient + 'static, { - const INTERVAL_DIVISOR: u32 = 2; - pub fn new( cfg: &Config, store_id: u64, pd_client: Arc, router: RaftRouter, scheduler: Scheduler>, - store_heartbeat_interval: Duration, auto_split_controller: AutoSplitController, concurrency_manager: ConcurrencyManager, snap_mgr: SnapManager, @@ -875,22 +944,25 @@ where collector_reg_handle: CollectorRegHandle, region_read_progress: RegionReadProgressRegistry, health_service: Option, + coprocessor_host: CoprocessorHost, + causal_ts_provider: Option>, // used for rawkv apiv2 ) -> Runner { - let interval = store_heartbeat_interval / Self::INTERVAL_DIVISOR; + let store_heartbeat_interval = cfg.pd_store_heartbeat_tick_interval.0; + let interval = store_heartbeat_interval / NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT; let mut stats_monitor = StatsMonitor::new( interval, cfg.report_min_resolved_ts_interval.0, - scheduler.clone(), + WrappedScheduler(scheduler.clone()), ); - if let Err(e) = stats_monitor.start(auto_split_controller, region_read_progress, store_id) { + if let Err(e) = stats_monitor.start( + auto_split_controller, + region_read_progress, + collector_reg_handle, + store_id, + ) { error!("failed to start stats collector, error = {:?}", e); } - let 
_region_cpu_records_collector = collector_reg_handle.register( - Box::new(RegionCPUMeteringCollector::new(scheduler.clone())), - true, - ); - Runner { store_id, pd_client, @@ -901,15 +973,50 @@ where store_stat: StoreStat::default(), start_ts: UnixSecs::now(), scheduler, + store_heartbeat_interval, stats_monitor, - _region_cpu_records_collector, region_cpu_records: HashMap::default(), concurrency_manager, snap_mgr, remote, slow_score: SlowScore::new(cfg.inspect_interval.0), + slow_trend_cause: Trend::new( + // Disable SpikeFilter for now + Duration::from_secs(0), + STORE_SLOW_TREND_MISC_GAUGE_VEC.with_label_values(&["spike_filter_value"]), + STORE_SLOW_TREND_MISC_GAUGE_VEC.with_label_values(&["spike_filter_count"]), + Duration::from_secs(180), + Duration::from_secs(30), + Duration::from_secs(120), + Duration::from_secs(600), + 1, + tikv_util::time::duration_to_us(Duration::from_micros(500)), + STORE_SLOW_TREND_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC.with_label_values(&["L1"]), + STORE_SLOW_TREND_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC.with_label_values(&["L2"]), + cfg.slow_trend_unsensitive_cause, + ), + slow_trend_result: Trend::new( + // Disable SpikeFilter for now + Duration::from_secs(0), + STORE_SLOW_TREND_RESULT_MISC_GAUGE_VEC.with_label_values(&["spike_filter_value"]), + STORE_SLOW_TREND_RESULT_MISC_GAUGE_VEC.with_label_values(&["spike_filter_count"]), + Duration::from_secs(120), + Duration::from_secs(15), + Duration::from_secs(60), + Duration::from_secs(300), + 1, + 2000, + STORE_SLOW_TREND_RESULT_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC + .with_label_values(&["L1"]), + STORE_SLOW_TREND_RESULT_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC + .with_label_values(&["L2"]), + cfg.slow_trend_unsensitive_result, + ), + slow_trend_result_recorder: RequestPerSecRecorder::new(), health_service, curr_health_status: ServingStatus::Serving, + coprocessor_host, + causal_ts_provider, } } @@ -1013,9 +1120,10 @@ where Default::default(), ); } - // When rolling update, there might be some old version 
tikvs that don't support batch split in cluster. - // In this situation, PD version check would refuse `ask_batch_split`. - // But if update time is long, it may cause large Regions, so call `ask_split` instead. + // When rolling update, there might be some old version tikvs that don't support + // batch split in cluster. In this situation, PD version check would refuse + // `ask_batch_split`. But if update time is long, it may cause large Regions, so + // call `ask_split` instead. Err(Error::Incompatible) => { let (region_id, peer_id) = (region.id, peer.id); info!( @@ -1100,22 +1208,10 @@ where fn handle_store_heartbeat( &mut self, mut stats: pdpb::StoreStats, - store_info: StoreInfo, + store_info: Option>, store_report: Option, dr_autosync_status: Option, ) { - let disk_stats = match fs2::statvfs(store_info.kv_engine.path()) { - Err(e) => { - error!( - "get disk stat for rocksdb failed"; - "engine_path" => store_info.kv_engine.path(), - "err" => ?e - ); - return; - } - Ok(stats) => stats, - }; - let mut report_peers = HashMap::default(); for (region_id, region_peer) in &mut self.region_peers { let read_bytes = region_peer.read_bytes - region_peer.last_store_report_read_bytes; @@ -1143,34 +1239,35 @@ where } stats = collect_report_read_peer_stats(HOTSPOT_REPORT_CAPACITY, report_peers, stats); - - let disk_cap = disk_stats.total_space(); - let capacity = if store_info.capacity == 0 || disk_cap < store_info.capacity { - disk_cap + let (capacity, used_size, available) = if store_info.is_some() { + match collect_engine_size( + &self.coprocessor_host, + store_info.as_ref(), + self.snap_mgr.get_total_snap_size().unwrap(), + ) { + Some((capacity, used_size, available)) => { + // Update last reported infos on engine_size. 
+ self.store_stat.engine_last_capacity_size = capacity; + self.store_stat.engine_last_used_size = used_size; + self.store_stat.engine_last_available_size = available; + (capacity, used_size, available) + } + None => return, + } } else { - store_info.capacity + ( + self.store_stat.engine_last_capacity_size, + self.store_stat.engine_last_used_size, + self.store_stat.engine_last_available_size, + ) }; - stats.set_capacity(capacity); - let used_size = self.snap_mgr.get_total_snap_size().unwrap() - + store_info - .kv_engine - .get_engine_used_size() - .expect("kv engine used size") - + store_info - .raft_engine - .get_engine_size() - .expect("raft engine used size"); + stats.set_capacity(capacity); stats.set_used_size(used_size); - let mut available = capacity.checked_sub(used_size).unwrap_or_default(); - // We only care about rocksdb SST file size, so we should check disk available here. - available = cmp::min(available, disk_stats.available_space()); - if available == 0 { warn!("no available space"); } - stats.set_available(available); stats.set_bytes_read( self.store_stat.engine_total_bytes_read - self.store_stat.engine_last_total_bytes_read, @@ -1186,6 +1283,9 @@ where .store_stat .engine_total_query_num .sub_query_stats(&self.store_stat.engine_last_query_num); + let total_query_num = self + .slow_trend_result_recorder + .record_and_get_current_rps(res.get_all_query_num(), Instant::now()); stats.set_query_stats(res.0); stats.set_cpu_usages(self.store_stat.store_cpu_usages.clone().into()); @@ -1200,7 +1300,14 @@ where self.store_stat .engine_last_query_num .fill_query_stats(&self.store_stat.engine_total_query_num); - self.store_stat.last_report_ts = UnixSecs::now(); + self.store_stat.last_report_ts = if store_info.is_some() { + UnixSecs::now() + } else { + // If `store_info` is None, the given Task::StoreHeartbeat should be a fake + // heartbeat to PD, we won't update the last_report_ts to avoid incorrectly + // marking current TiKV node in normal state. 
+ self.store_stat.last_report_ts + }; self.store_stat.region_bytes_written.flush(); self.store_stat.region_keys_written.flush(); self.store_stat.region_bytes_read.flush(); @@ -1218,6 +1325,7 @@ where let slow_score = self.slow_score.get(); stats.set_slow_score(slow_score as u64); + self.set_slow_trend_to_store_stats(&mut stats, total_query_num); let router = self.router.clone(); let resp = self @@ -1287,6 +1395,14 @@ where } } } + // Forcely awaken all hibernated regions if there existed slow stores in this + // cluster. + if let Some(awaken_regions) = resp.awaken_regions.take() { + info!("forcely awaken hibernated regions in this store"); + let _ = router.send_store_msg(StoreMsg::AwakenRegions { + abnormal_stores: awaken_regions.get_abnormal_stores().to_vec(), + }); + } } Err(e) => { error!("store heartbeat failed"; "err" => ?e); @@ -1296,6 +1412,51 @@ where self.remote.spawn(f); } + fn set_slow_trend_to_store_stats( + &mut self, + stats: &mut pdpb::StoreStats, + total_query_num: Option, + ) { + let slow_trend_cause_rate = self.slow_trend_cause.increasing_rate(); + STORE_SLOW_TREND_GAUGE.set(slow_trend_cause_rate); + let mut slow_trend = pdpb::SlowTrend::default(); + slow_trend.set_cause_rate(slow_trend_cause_rate); + slow_trend.set_cause_value(self.slow_trend_cause.l0_avg()); + if let Some(total_query_num) = total_query_num { + self.slow_trend_result + .record(total_query_num as u64, Instant::now()); + slow_trend.set_result_value(self.slow_trend_result.l0_avg()); + let slow_trend_result_rate = self.slow_trend_result.increasing_rate(); + slow_trend.set_result_rate(slow_trend_result_rate); + STORE_SLOW_TREND_RESULT_GAUGE.set(slow_trend_result_rate); + STORE_SLOW_TREND_RESULT_VALUE_GAUGE.set(total_query_num); + } else { + // Just to mark the invalid range on the graphic + STORE_SLOW_TREND_RESULT_VALUE_GAUGE.set(-100.0); + } + stats.set_slow_trend(slow_trend); + self.write_slow_trend_metrics(); + } + + fn write_slow_trend_metrics(&mut self) { + 
STORE_SLOW_TREND_L0_GAUGE.set(self.slow_trend_cause.l0_avg()); + STORE_SLOW_TREND_L1_GAUGE.set(self.slow_trend_cause.l1_avg()); + STORE_SLOW_TREND_L2_GAUGE.set(self.slow_trend_cause.l2_avg()); + STORE_SLOW_TREND_L0_L1_GAUGE.set(self.slow_trend_cause.l0_l1_rate()); + STORE_SLOW_TREND_L1_L2_GAUGE.set(self.slow_trend_cause.l1_l2_rate()); + STORE_SLOW_TREND_L1_MARGIN_ERROR_GAUGE.set(self.slow_trend_cause.l1_margin_error_base()); + STORE_SLOW_TREND_L2_MARGIN_ERROR_GAUGE.set(self.slow_trend_cause.l2_margin_error_base()); + STORE_SLOW_TREND_RESULT_L0_GAUGE.set(self.slow_trend_result.l0_avg()); + STORE_SLOW_TREND_RESULT_L1_GAUGE.set(self.slow_trend_result.l1_avg()); + STORE_SLOW_TREND_RESULT_L2_GAUGE.set(self.slow_trend_result.l2_avg()); + STORE_SLOW_TREND_RESULT_L0_L1_GAUGE.set(self.slow_trend_result.l0_l1_rate()); + STORE_SLOW_TREND_RESULT_L1_L2_GAUGE.set(self.slow_trend_result.l1_l2_rate()); + STORE_SLOW_TREND_RESULT_L1_MARGIN_ERROR_GAUGE + .set(self.slow_trend_result.l1_margin_error_base()); + STORE_SLOW_TREND_RESULT_L2_MARGIN_ERROR_GAUGE + .set(self.slow_trend_result.l2_margin_error_base()); + } + fn handle_report_batch_split(&self, regions: Vec) { let resp = self.pd_client.report_batch_split(regions); let f = async move { @@ -1448,6 +1609,8 @@ where } else { CasualMessage::HalfSplitRegion { region_epoch: epoch, + start_key: None, + end_key: None, policy: split_region.get_policy(), source: "pd", cb: Callback::None, @@ -1466,6 +1629,18 @@ where deadline:None, disk_full_opt:DiskFullOpt::AllowedOnAlmostFull, }); + } else if resp.has_switch_witnesses() { + PD_HEARTBEAT_COUNTER_VEC + .with_label_values(&["switch witness"]) + .inc(); + + let mut switches = resp.take_switch_witnesses(); + info!("try to switch witness"; + "region_id" => region_id, + "switch witness" => ?switches + ); + let req = new_batch_switch_witness(switches.take_switch_witnesses().into()); + send_admin_request(&router, region_id, epoch, peer, req, Callback::None, Default::default()); } else { 
PD_HEARTBEAT_COUNTER_VEC.with_label_values(&["noop"]).inc(); } @@ -1506,11 +1681,7 @@ where self.merge_buckets(region_buckets); } if !read_stats.region_infos.is_empty() { - if let Some(sender) = self.stats_monitor.get_read_stats_sender() { - if sender.send(read_stats).is_err() { - warn!("send read_stats failed, are we shutting down?") - } - } + self.stats_monitor.maybe_send_read_stats(read_stats); } } @@ -1553,10 +1724,30 @@ where ) { let pd_client = self.pd_client.clone(); let concurrency_manager = self.concurrency_manager.clone(); + let causal_ts_provider = self.causal_ts_provider.clone(); + let f = async move { let mut success = false; while txn_ext.max_ts_sync_status.load(Ordering::SeqCst) == initial_status { - match pd_client.get_tso().await { + // On leader transfer / region merge, RawKV API v2 need to invoke + // causal_ts_provider.flush() to renew cached TSO, to ensure that + // the next TSO returned by causal_ts_provider.get_ts() on current + // store must be larger than the store where the leader is on before. + // + // And it won't break correctness of transaction commands, as + // causal_ts_provider.flush() is implemented as pd_client.get_tso() + renew TSO + // cached. + let res: crate::Result = + if let Some(causal_ts_provider) = &causal_ts_provider { + causal_ts_provider + .async_flush() + .await + .map_err(|e| box_err!(e)) + } else { + pd_client.get_tso().await.map_err(Into::into) + }; + + match res { Ok(ts) => { concurrency_manager.update_max_ts(ts); // Set the least significant bit to 1 to mark it as synced. @@ -1637,6 +1828,8 @@ where // which is the read load portion of the write path. // TODO: more accurate CPU consumption of a specified region. fn handle_region_cpu_records(&mut self, records: Arc) { + // Send Region CPU info to AutoSplitController inside the stats_monitor. 
+ self.stats_monitor.maybe_send_cpu_stats(&records); calculate_region_cpu_records(self.store_id, records, &mut self.region_cpu_records); } @@ -1690,13 +1883,7 @@ where if current.meta < buckets.meta { mem::swap(current, &mut buckets); } - - merge_bucket_stats( - &current.meta.keys, - &mut current.stats, - &buckets.meta.keys, - &buckets.stats, - ); + current.merge(&buckets); }) .or_insert_with(|| ReportBucket::new(buckets)); } @@ -1707,6 +1894,43 @@ where health_service.set_serving_status("", status); } } + + /// Force to send a special heartbeat to pd when current store is hung on + /// some special circumstances, i.e. disk busy, handler busy and others. + fn handle_fake_store_heartbeat(&mut self) { + let mut stats = pdpb::StoreStats::default(); + stats.set_store_id(self.store_id); + stats.set_region_count(self.region_peers.len() as u32); + + let snap_stats = self.snap_mgr.stats(); + stats.set_sending_snap_count(snap_stats.sending_count as u32); + stats.set_receiving_snap_count(snap_stats.receiving_count as u32); + STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC + .with_label_values(&["sending"]) + .set(snap_stats.sending_count as i64); + STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC + .with_label_values(&["receiving"]) + .set(snap_stats.receiving_count as i64); + + stats.set_start_time(self.start_ts.into_inner() as u32); + + // This calling means that the current node cannot report heartbeat in the normal + // scheduler. That is, the current node must be in `busy` state. + stats.set_is_busy(true); + + // We do not need to report store_info, so we just set `None` here.
+ self.handle_store_heartbeat(stats, None, None, None); + warn!("scheduling store_heartbeat timeout, force report store slow score to pd."; + "store_id" => self.store_id, + ); + } + + fn is_store_heartbeat_delayed(&self) -> bool { + let now = UnixSecs::now(); + let interval_second = now.into_inner() - self.store_stat.last_report_ts.into_inner(); + (interval_second >= self.store_heartbeat_interval.as_secs()) + && (interval_second <= STORE_HEARTBEAT_DELAY_LIMIT) + } } fn calculate_region_cpu_records( @@ -1781,21 +2005,44 @@ where let f = async move { for split_info in split_infos { - if let Ok(Some(region)) = - pd_client.get_region_by_id(split_info.region_id).await - { + let Ok(Some(region)) = + pd_client.get_region_by_id(split_info.region_id).await else { continue }; + // Try to split the region with the given split key. + if let Some(split_key) = split_info.split_key { Self::handle_ask_batch_split( router.clone(), scheduler.clone(), pd_client.clone(), region, - vec![split_info.split_key], + vec![split_key], split_info.peer, true, Callback::None, String::from("auto_split"), remote.clone(), ); + // Try to split the region on half within the given key + // range if there is no `split_key` been given. 
+ } else if split_info.start_key.is_some() && split_info.end_key.is_some() { + let start_key = split_info.start_key.unwrap(); + let end_key = split_info.end_key.unwrap(); + let region_id = region.get_id(); + let msg = CasualMessage::HalfSplitRegion { + region_epoch: region.get_region_epoch().clone(), + start_key: Some(start_key.clone()), + end_key: Some(end_key.clone()), + policy: pdpb::CheckPolicy::Scan, + source: "auto_split", + cb: Callback::None, + }; + if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) { + error!("send auto half split request failed"; + "region_id" => region_id, + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + "err" => ?e, + ); + } } } }; @@ -1863,12 +2110,12 @@ where unix_secs_now.into_inner() - last_report_ts.into_inner(); // Keep consistent with the calculation of cpu_usages in a store heartbeat. // See components/tikv_util/src/metrics/threads_linux.rs for more details. - (interval_second > 0) - .then(|| { - ((cpu_time_duration.as_secs_f64() * 100.0) / interval_second as f64) - as u64 - }) - .unwrap_or(0) + if interval_second > 0 { + ((cpu_time_duration.as_secs_f64() * 100.0) / interval_second as f64) + as u64 + } else { + 0 + } }; ( read_bytes_delta, @@ -1922,9 +2169,15 @@ where txn_ext, } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), Task::QueryRegionLeader { region_id } => self.handle_query_region_leader(region_id), - Task::UpdateSlowScore { id, duration } => self.slow_score.record(id, duration.sum()), - Task::RegionCPURecords(records) => self.handle_region_cpu_records(records), - Task::ReportMinResolvedTS { + Task::UpdateSlowScore { id, duration } => { + self.slow_score.record(id, duration.sum()); + self.slow_trend_cause.record( + tikv_util::time::duration_to_us(duration.store_wait_duration.unwrap()), + Instant::now(), + ); + } + Task::RegionCpuRecords(records) => self.handle_region_cpu_records(records), + Task::ReportMinResolvedTs { 
store_id, min_resolved_ts, } => self.handle_report_min_resolved_ts(store_id, min_resolved_ts), @@ -1946,6 +2199,9 @@ where T: PdClient + 'static, { fn on_timeout(&mut self) { + // Record a fairly great value when timeout + self.slow_trend_cause.record(500_000, Instant::now()); + // The health status is recovered to serving as long as any tick // does not timeout. if self.curr_health_status == ServingStatus::ServiceUnknown @@ -1955,6 +2211,13 @@ where } if !self.slow_score.last_tick_finished { self.slow_score.record_timeout(); + // If the last slow_score already reached abnormal state and was delayed for + // reporting by `store-heartbeat` to PD, we should report it here manually as + // a FAKE `store-heartbeat`. + if self.slow_score.should_force_report_slow_store() && self.is_store_heartbeat_delayed() + { + self.handle_fake_store_heartbeat(); + } } let scheduler = self.scheduler.clone(); let id = self.slow_score.last_tick_id + 1; @@ -1962,8 +2225,8 @@ where self.slow_score.last_tick_finished = false; if self.slow_score.last_tick_id % self.slow_score.round_ticks == 0 { - // `last_update_time` is refreshed every round. If no update happens in a whole round, - // we set the status to unknown. + // `last_update_time` is refreshed every round. If no update happens in a whole + // round, we set the status to unknown. 
if self.curr_health_status == ServingStatus::Serving && self.slow_score.last_record_time < self.slow_score.last_update_time { @@ -2087,6 +2350,24 @@ fn new_merge_request(merge: pdpb::Merge) -> AdminRequest { req } +fn new_batch_switch_witness(switches: Vec) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSwitchWitness); + let switch_reqs = switches + .into_iter() + .map(|s| { + let mut sw = SwitchWitnessRequest::default(); + sw.set_peer_id(s.get_peer_id()); + sw.set_is_witness(s.get_is_witness()); + sw + }) + .collect(); + let mut sw = BatchSwitchWitnessRequest::default(); + sw.set_switch_witnesses(switch_reqs); + req.set_switch_witnesses(sw); + req +} + fn send_admin_request( router: &RaftRouter, region_id: u64, @@ -2189,6 +2470,48 @@ fn collect_report_read_peer_stats( stats } +fn collect_engine_size( + coprocessor_host: &CoprocessorHost, + store_info: Option<&StoreInfo>, + snap_mgr_size: u64, +) -> Option<(u64, u64, u64)> { + if let Some(engine_size) = coprocessor_host.on_compute_engine_size() { + return Some((engine_size.capacity, engine_size.used, engine_size.avail)); + } + let store_info = store_info.unwrap(); + let disk_stats = match fs2::statvfs(store_info.kv_engine.path()) { + Err(e) => { + error!( + "get disk stat for rocksdb failed"; + "engine_path" => store_info.kv_engine.path(), + "err" => ?e + ); + return None; + } + Ok(stats) => stats, + }; + let disk_cap = disk_stats.total_space(); + let capacity = if store_info.capacity == 0 || disk_cap < store_info.capacity { + disk_cap + } else { + store_info.capacity + }; + let used_size = snap_mgr_size + + store_info + .kv_engine + .get_engine_used_size() + .expect("kv engine used size") + + store_info + .raft_engine + .get_engine_size() + .expect("raft engine used size"); + let mut available = capacity.checked_sub(used_size).unwrap_or_default(); + // We only care about rocksdb SST file size, so we should check disk available + // here. 
+ available = cmp::min(available, disk_stats.available_space()); + Some((capacity, used_size, available)) +} + fn get_read_query_num(stat: &pdpb::QueryStats) -> u64 { stat.get_get() + stat.get_coprocessor() + stat.get_scan() } @@ -2216,7 +2539,7 @@ mod tests { struct RunnerTest { store_stat: Arc>, - stats_monitor: StatsMonitor, + stats_monitor: StatsMonitor>, } impl RunnerTest { @@ -2228,13 +2551,16 @@ mod tests { let mut stats_monitor = StatsMonitor::new( Duration::from_secs(interval), Duration::from_secs(0), - scheduler, + WrappedScheduler(scheduler), ); let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); let region_read_progress = store_meta.lock().unwrap().region_read_progress.clone(); - if let Err(e) = - stats_monitor.start(AutoSplitController::default(), region_read_progress, 1) - { + if let Err(e) = stats_monitor.start( + AutoSplitController::default(), + region_read_progress, + CollectorRegHandle::new_for_test(), + 1, + ) { error!("failed to start stats collector, error = {:?}", e); } @@ -2379,9 +2705,12 @@ mod tests { ); } + use engine_test::{kv::KvTestEngine, raft::RaftTestEngine}; use metapb::Peer; use resource_metering::{RawRecord, TagInfos}; + use crate::coprocessor::{BoxPdTaskObserver, Coprocessor, PdTaskObserver, StoreSizeInfo}; + #[test] fn test_calculate_region_cpu_records() { // region_id -> total_cpu_time_ms @@ -2485,4 +2814,36 @@ mod tests { assert_eq!(report.stats.get_read_qps(), expected); } } + + #[derive(Debug, Clone, Default)] + struct PdObserver {} + + impl Coprocessor for PdObserver {} + + impl PdTaskObserver for PdObserver { + fn on_compute_engine_size(&self, s: &mut Option) { + let _ = s.insert(StoreSizeInfo { + capacity: 444, + used: 111, + avail: 333, + }); + } + } + + #[test] + fn test_pd_task_observer() { + let mut host = CoprocessorHost::::default(); + let obs = PdObserver::default(); + host.registry + .register_pd_task_observer(1, BoxPdTaskObserver::new(obs)); + let store_size = collect_engine_size::(&host, None, 0); + let 
(cap, used, avail) = if let Some((cap, used, avail)) = store_size { + (cap, used, avail) + } else { + panic!("store_size should not be none"); + }; + assert_eq!(cap, 444); + assert_eq!(used, 111); + assert_eq!(avail, 333); + } } diff --git a/components/raftstore/src/store/worker/raftlog_fetch.rs b/components/raftstore/src/store/worker/raftlog_fetch.rs deleted file mode 100644 index 63bccf6324a..00000000000 --- a/components/raftstore/src/store/worker/raftlog_fetch.rs +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. - -use std::fmt; - -use engine_traits::{KvEngine, RaftEngine}; -use fail::fail_point; -use raft::GetEntriesContext; -use tikv_util::worker::Runnable; - -use crate::store::{RaftlogFetchResult, SignificantMsg, SignificantRouter, MAX_INIT_ENTRY_COUNT}; - -pub enum Task { - PeerStorage { - region_id: u64, - context: GetEntriesContext, - low: u64, - high: u64, - max_size: usize, - tried_cnt: usize, - term: u64, - }, - // More to support, suck as fetch entries ayschronously when apply and schedule merge -} - -impl fmt::Display for Task { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Task::PeerStorage { - region_id, - context, - low, - high, - max_size, - tried_cnt, - term, - } => write!( - f, - "Fetch Raft Logs [region: {}, low: {}, high: {}, max_size: {}] for sending with context {:?}, tried: {}, term: {}", - region_id, low, high, max_size, context, tried_cnt, term, - ), - } - } -} - -pub struct Runner -where - EK: KvEngine, - ER: RaftEngine, - R: SignificantRouter, -{ - router: R, - raft_engine: ER, - _phantom: std::marker::PhantomData, -} - -impl> Runner { - pub fn new(router: R, raft_engine: ER) -> Runner { - Runner { - router, - raft_engine, - _phantom: std::marker::PhantomData, - } - } -} - -impl Runnable for Runner -where - EK: KvEngine, - ER: RaftEngine, - R: SignificantRouter, -{ - type Task = Task; - - fn run(&mut self, task: Task) { - match task { - Task::PeerStorage { 
- region_id, - low, - high, - max_size, - context, - tried_cnt, - term, - } => { - let mut ents = - Vec::with_capacity(std::cmp::min((high - low) as usize, MAX_INIT_ENTRY_COUNT)); - let res = self.raft_engine.fetch_entries_to( - region_id, - low, - high, - Some(max_size), - &mut ents, - ); - - let hit_size_limit = res - .as_ref() - .map(|c| (*c as u64) != high - low) - .unwrap_or(false); - fail_point!("worker_async_fetch_raft_log"); - // it may return a region not found error as the region could be merged. - let _ = self.router.significant_send( - region_id, - SignificantMsg::RaftlogFetched { - context, - res: Box::new(RaftlogFetchResult { - ents: res.map(|_| ents).map_err(|e| e.into()), - low, - max_size: max_size as u64, - hit_size_limit, - tried_cnt, - term, - }), - }, - ); - } - } - } -} diff --git a/components/raftstore/src/store/worker/raftlog_gc.rs b/components/raftstore/src/store/worker/raftlog_gc.rs index bf892743300..3edabae71a0 100644 --- a/components/raftstore/src/store/worker/raftlog_gc.rs +++ b/components/raftstore/src/store/worker/raftlog_gc.rs @@ -3,11 +3,10 @@ use std::{ error::Error as StdError, fmt::{self, Display, Formatter}, - sync::mpsc::Sender, }; -use engine_traits::{Engines, KvEngine, RaftEngine, RaftLogGCTask}; -use file_system::{IOType, WithIOType}; +use engine_traits::{Engines, KvEngine, RaftEngine}; +use file_system::{IoType, WithIoType}; use thiserror::Error; use tikv_util::{ box_try, debug, error, @@ -73,7 +72,6 @@ enum Error { pub struct Runner { tasks: Vec, engines: Engines, - gc_entries: Option>, compact_sync_interval: Duration, } @@ -82,40 +80,34 @@ impl Runner { Runner { engines, tasks: vec![], - gc_entries: None, compact_sync_interval: compact_log_interval, } } - /// Does the GC job and returns the count of logs collected. 
- fn gc_raft_log(&mut self, regions: Vec) -> Result { - fail::fail_point!("worker_gc_raft_log", |s| { - Ok(s.and_then(|s| s.parse().ok()).unwrap_or(0)) - }); - let deleted = box_try!(self.engines.raft.batch_gc(regions)); - fail::fail_point!("worker_gc_raft_log_finished", |_| { Ok(deleted) }); - Ok(deleted) - } - - fn report_collected(&self, collected: usize) { - if let Some(ref ch) = self.gc_entries { - ch.send(collected).unwrap(); - } + fn raft_log_gc(&mut self, mut batch: ER::LogBatch) -> Result<(), Error> { + fail::fail_point!("worker_gc_raft_log", |_| Ok(())); + box_try!(self.engines.raft.consume(&mut batch, false)); + fail::fail_point!("worker_gc_raft_log_finished"); + Ok(()) } fn flush(&mut self) { if self.tasks.is_empty() { return; } - // Sync wal of kv_db to make sure the data before apply_index has been persisted to disk. + fail::fail_point!("worker_gc_raft_log_flush"); + // Sync wal of kv_db to make sure the data before apply_index has been persisted + // to disk. let start = Instant::now(); self.engines.kv.sync().unwrap_or_else(|e| { panic!("failed to sync kv_engine in raft_log_gc: {:?}", e); }); RAFT_LOG_GC_KV_SYNC_DURATION_HISTOGRAM.observe(start.saturating_elapsed_secs()); + let tasks = std::mem::take(&mut self.tasks); - let mut groups = Vec::with_capacity(tasks.len()); let mut cbs = Vec::new(); + let mut batch = self.engines.raft.log_batch(tasks.len()); + let start = Instant::now(); for t in tasks { debug!("gc raft log"; "region_id" => t.region_id, "start_index" => t.start_idx, "end_index" => t.end_idx); if let Some(cb) = t.cb { @@ -135,28 +127,22 @@ impl Runner { "end_index" => t.end_idx, ); } - groups.push(RaftLogGCTask { - raft_group_id: t.region_id, - from: t.start_idx, - to: t.end_idx, - }); - } - let start = Instant::now(); - match self.gc_raft_log(groups) { - Err(e) => { + if let Err(e) = self + .engines + .raft + .gc(t.region_id, t.start_idx, t.end_idx, &mut batch) + { error!("failed to gc"; "err" => %e); - self.report_collected(0); 
RAFT_LOG_GC_FAILED.inc(); } - Ok(n) => { - debug!("gc log entries"; "entry_count" => n); - self.report_collected(n); - RAFT_LOG_GC_DELETED_KEYS_HISTOGRAM.observe(n as f64); - } + } + if let Err(e) = self.raft_log_gc(batch) { + error!("failed to write gc task"; "err" => %e); + RAFT_LOG_GC_FAILED.inc(); } RAFT_LOG_GC_WRITE_DURATION_HISTOGRAM.observe(start.saturating_elapsed_secs()); for cb in cbs { - cb() + cb(); } } } @@ -169,7 +155,7 @@ where type Task = Task; fn run(&mut self, task: Task) { - let _io_type_guard = WithIOType::new(IOType::ForegroundWrite); + let _io_type_guard = WithIoType::new(IoType::ForegroundWrite); let flush_now = task.flush; self.tasks.push(task); // TODO: maybe they should also be batched even `flush_now` is true. @@ -199,7 +185,7 @@ where #[cfg(test)] mod tests { - use std::{sync::mpsc, time::Duration}; + use std::time::Duration; use engine_traits::{RaftEngine, RaftLogBatch, ALL_CFS}; use raft::eraftpb::Entry; @@ -213,13 +199,10 @@ mod tests { let path_raft = dir.path().join("raft"); let path_kv = dir.path().join("kv"); let raft_db = engine_test::raft::new_engine(path_kv.to_str().unwrap(), None).unwrap(); - let kv_db = - engine_test::kv::new_engine(path_raft.to_str().unwrap(), None, ALL_CFS, None).unwrap(); + let kv_db = engine_test::kv::new_engine(path_raft.to_str().unwrap(), ALL_CFS).unwrap(); let engines = Engines::new(kv_db, raft_db.clone()); - let (tx, rx) = mpsc::channel(); let mut runner = Runner { - gc_entries: Some(tx), engines, tasks: vec![], compact_sync_interval: Duration::from_secs(5), @@ -231,22 +214,20 @@ mod tests { for i in 0..100 { let mut e = Entry::new(); e.set_index(i); - raft_wb.append(region_id, vec![e]).unwrap(); + raft_wb.append(region_id, None, vec![e]).unwrap(); } - raft_db.consume(&mut raft_wb, false /*sync*/).unwrap(); + raft_db.consume(&mut raft_wb, false /* sync */).unwrap(); let tbls = vec![ - (Task::gc(region_id, 0, 10), 10, (0, 10), (10, 100)), - (Task::gc(region_id, 0, 50), 40, (0, 50), (50, 100)), - 
(Task::gc(region_id, 50, 50), 0, (0, 50), (50, 100)), - (Task::gc(region_id, 50, 60), 10, (0, 60), (60, 100)), + (Task::gc(region_id, 0, 10), (0, 10), (10, 100)), + (Task::gc(region_id, 0, 50), (0, 50), (50, 100)), + (Task::gc(region_id, 50, 50), (0, 50), (50, 100)), + (Task::gc(region_id, 50, 60), (0, 60), (60, 100)), ]; - for (task, expected_collectd, not_exist_range, exist_range) in tbls { + for (task, not_exist_range, exist_range) in tbls { runner.run(task); runner.flush(); - let res = rx.recv_timeout(Duration::from_secs(3)).unwrap(); - assert_eq!(res, expected_collectd); raft_log_must_not_exist(&raft_db, 1, not_exist_range.0, not_exist_range.1); raft_log_must_exist(&raft_db, 1, exist_range.0, exist_range.1); } diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index a506ab80f17..022bd457cd5 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -4,20 +4,20 @@ use std::{ cell::Cell, fmt::{self, Display, Formatter}, + ops::Deref, sync::{ - atomic::{AtomicU64, Ordering}, + atomic::{self, AtomicU64, Ordering}, Arc, Mutex, }, - time::Duration, }; use crossbeam::{atomic::AtomicCell, channel::TrySendError}; -use engine_traits::{KvEngine, RaftEngine, Snapshot}; +use engine_traits::{KvEngine, Peekable, RaftEngine}; use fail::fail_point; use kvproto::{ errorpb, kvrpcpb::ExtraOp as TxnExtraOp, - metapb, + metapb::{self, Region}, raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, ReadIndexResponse, Request, Response}, }; use pd_client::BucketMeta; @@ -25,9 +25,11 @@ use tikv_util::{ codec::number::decode_u64, debug, error, lru::LruCache, - time::{monotonic_raw_now, Instant, ThreadReadId}, + store::find_peer_by_id, + time::{monotonic_raw_now, ThreadReadId}, }; use time::Timespec; +use txn_types::TimeStamp; use super::metrics::*; use crate::{ @@ -36,26 +38,42 @@ use crate::{ cmd_resp, fsm::store::StoreMeta, util::{self, LeaseState, RegionReadProgress, 
RemoteLease}, - Callback, CasualMessage, CasualRouter, Peer, ProposalRouter, RaftCommand, ReadResponse, - RegionSnapshot, RequestInspector, RequestPolicy, TxnExt, + Callback, CasualMessage, CasualRouter, Peer, ProposalRouter, RaftCommand, ReadCallback, + ReadResponse, RegionSnapshot, RequestInspector, RequestPolicy, TxnExt, }, Error, Result, }; -pub trait ReadExecutor { - fn get_engine(&self) -> &E; - fn get_snapshot(&mut self, ts: Option) -> Arc; +/// #[RaftstoreCommon] +pub trait ReadExecutor { + type Tablet: KvEngine; - fn get_value(&self, req: &Request, region: &metapb::Region) -> Result { + fn get_tablet(&mut self) -> &Self::Tablet; + + /// Get the snapshot of the tablet. + /// + /// If the tablet is not ready, `None` is returned. + /// Currently, only multi-rocksdb version may return `None`. + fn get_snapshot( + &mut self, + read_context: &Option>, + ) -> Arc<::Snapshot>; + + fn get_value( + &mut self, + req: &Request, + region: &metapb::Region, + read_context: &Option>, + ) -> Result { let key = req.get_get().get_key(); // region key range has no data prefix, so we must use origin key to check.
util::check_key_in_region(key, region)?; - let engine = self.get_engine(); let mut resp = Response::default(); + let snapshot = self.get_snapshot(read_context); let res = if !req.get_get().get_cf().is_empty() { let cf = req.get_get().get_cf(); - engine + snapshot .get_value_cf(cf, &keys::data_key(key)) .unwrap_or_else(|e| { panic!( @@ -67,14 +85,16 @@ pub trait ReadExecutor { ) }) } else { - engine.get_value(&keys::data_key(key)).unwrap_or_else(|e| { - panic!( - "[region {}] failed to get {}: {:?}", - region.get_id(), - log_wrappers::Value::key(key), - e - ) - }) + snapshot + .get_value(&keys::data_key(key)) + .unwrap_or_else(|e| { + panic!( + "[region {}] failed to get {}: {:?}", + region.get_id(), + log_wrappers::Value::key(key), + e + ) + }) }; if let Some(res) = res { resp.mut_get().set_value(res.to_vec()); @@ -88,8 +108,8 @@ pub trait ReadExecutor { msg: &RaftCmdRequest, region: &Arc, read_index: Option, - mut ts: Option, - ) -> ReadResponse { + local_read_ctx: Option>, + ) -> ReadResponse<::Snapshot> { let requests = msg.get_requests(); let mut response = ReadResponse { response: RaftCmdResponse::default(), @@ -100,7 +120,7 @@ pub trait ReadExecutor { for req in requests { let cmd_type = req.get_cmd_type(); let mut resp = match cmd_type { - CmdType::Get => match self.get_value(req, region.as_ref()) { + CmdType::Get => match self.get_value(req, region.as_ref(), &local_read_ctx) { Ok(resp) => resp, Err(e) => { error!(?e; @@ -112,8 +132,10 @@ pub trait ReadExecutor { } }, CmdType::Snap => { - let snapshot = - RegionSnapshot::from_snapshot(self.get_snapshot(ts.take()), region.clone()); + let snapshot = RegionSnapshot::from_snapshot( + self.get_snapshot(&local_read_ctx), + region.clone(), + ); response.snapshot = Some(snapshot); Response::default() } @@ -143,26 +165,117 @@ pub trait ReadExecutor { } } -/// A read only delegate of `Peer`. 
-#[derive(Clone, Debug)] -pub struct ReadDelegate { - pub region: Arc, - pub peer_id: u64, - pub term: u64, - pub applied_index_term: u64, - pub leader_lease: Option, - pub last_valid_ts: Timespec, +/// CachedReadDelegate is a wrapper the ReadDelegate and kv_engine. LocalReader +/// dispatch local read requests to ReadDeleage according to the region_id where +/// ReadDelegate needs kv_engine to read data or fetch snapshot. +pub struct CachedReadDelegate +where + E: KvEngine, +{ + delegate: Arc, + kv_engine: E, +} - pub tag: String, - pub bucket_meta: Option>, - pub txn_extra_op: Arc>, - pub txn_ext: Arc, - pub read_progress: Arc, - pub pending_remove: bool, +impl Deref for CachedReadDelegate +where + E: KvEngine, +{ + type Target = ReadDelegate; - // `track_ver` used to keep the local `ReadDelegate` in `LocalReader` - // up-to-date with the global `ReadDelegate` stored at `StoreMeta` - pub track_ver: TrackVer, + fn deref(&self) -> &Self::Target { + self.delegate.as_ref() + } +} + +impl Clone for CachedReadDelegate +where + E: KvEngine, +{ + fn clone(&self) -> Self { + CachedReadDelegate { + delegate: Arc::clone(&self.delegate), + kv_engine: self.kv_engine.clone(), + } + } +} + +pub struct LocalReadContext<'a, E> +where + E: KvEngine, +{ + read_id: Option, + snap_cache: &'a mut SnapCache, + + // Used when read_id is not set, duplicated definition to avoid cache invalidation in case + // stale read and local read are mixed in one batch. + snapshot: Option>, + snapshot_ts: Option, +} + +impl<'a, E> LocalReadContext<'a, E> +where + E: KvEngine, +{ + fn new(snap_cache: &'a mut SnapCache, read_id: Option) -> Self { + Self { + snap_cache, + read_id, + snapshot: None, + snapshot_ts: None, + } + } + + /// Update the snapshot in the `snap_cache` if the read_id is None or does + /// not match. 
+ fn maybe_update_snapshot(&mut self, engine: &E, delegate_last_valid_ts: Timespec) -> bool { + // When the read_id is None, it means the `snap_cache` has been cleared + // before and the `cached_read_id` of it is None because only a consecutive + // requests will have the same cache and the cache will be cleared after the + // last request of the batch. + if self.read_id.is_some() { + if self.snap_cache.cached_read_id == self.read_id + && self.read_id.as_ref().unwrap().create_time >= delegate_last_valid_ts + { + // Cache hit + return false; + } + + self.snap_cache.cached_read_id = self.read_id.clone(); + self.snap_cache.snapshot = Some(Arc::new(engine.snapshot())); + + // Ensures the snapshot is acquired before getting the time + atomic::fence(atomic::Ordering::Release); + self.snap_cache.cached_snapshot_ts = monotonic_raw_now(); + } else { + // read_id being None means the snapshot acquired will only be used in this + // request + self.snapshot = Some(Arc::new(engine.snapshot())); + + // Ensures the snapshot is acquired before getting the time + atomic::fence(atomic::Ordering::Release); + self.snapshot_ts = Some(monotonic_raw_now()); + } + + true + } + + fn snapshot_ts(&self) -> Option { + if self.read_id.is_some() { + Some(self.snap_cache.cached_snapshot_ts) + } else { + self.snapshot_ts + } + } + + // Note: must be called after `maybe_update_snapshot` + fn snapshot(&self) -> Option> { + // read_id being some means we go through cache + if self.read_id.is_some() { + self.snap_cache.snapshot.clone() + } else { + self.snapshot.clone() + } + } } impl Drop for ReadDelegate { @@ -172,6 +285,69 @@ impl Drop for ReadDelegate { } } +/// #[RaftstoreCommon] +pub trait ReadExecutorProvider: Send + Clone + 'static { + type Executor; + type StoreMeta; + + fn store_id(&self) -> Option; + + /// get the ReadDelegate with region_id and the number of delegates in the + /// StoreMeta + fn get_executor_and_len(&self, region_id: u64) -> (usize, Option); +} + +#[derive(Clone)] +pub 
struct StoreMetaDelegate +where + E: KvEngine, +{ + store_meta: Arc>, + kv_engine: E, +} + +impl StoreMetaDelegate +where + E: KvEngine, +{ + pub fn new(store_meta: Arc>, kv_engine: E) -> Self { + StoreMetaDelegate { + store_meta, + kv_engine, + } + } +} + +impl ReadExecutorProvider for StoreMetaDelegate +where + E: KvEngine, +{ + type Executor = CachedReadDelegate; + type StoreMeta = Arc>; + + fn store_id(&self) -> Option { + self.store_meta.as_ref().lock().unwrap().store_id + } + + /// get the ReadDelegate with region_id and the number of delegates in the + /// StoreMeta + fn get_executor_and_len(&self, region_id: u64) -> (usize, Option) { + let meta = self.store_meta.as_ref().lock().unwrap(); + let reader = meta.readers.get(®ion_id).cloned(); + if let Some(reader) = reader { + return ( + meta.readers.len(), + Some(CachedReadDelegate { + delegate: Arc::new(reader), + kv_engine: self.kv_engine.clone(), + }), + ); + } + (meta.readers.len(), None) + } +} + +/// #[RaftstoreCommon] #[derive(Debug)] pub struct TrackVer { version: Arc, @@ -193,14 +369,14 @@ impl TrackVer { } // Take `&mut self` to prevent calling `inc` and `clone` at the same time - fn inc(&mut self) { + pub fn inc(&mut self) { // Only the source `TrackVer` can increase version if self.source { self.version.fetch_add(1, Ordering::Relaxed); } } - fn any_new(&self) -> bool { + pub fn any_new(&self) -> bool { self.version.load(Ordering::Relaxed) > self.local_ver } } @@ -221,8 +397,32 @@ impl Clone for TrackVer { } } +/// #[RaftstoreCommon]: A read only delegate of `Peer`. +#[derive(Clone, Debug)] +pub struct ReadDelegate { + pub region: Arc, + pub peer_id: u64, + pub term: u64, + pub applied_term: u64, + pub leader_lease: Option, + pub last_valid_ts: Timespec, + + pub tag: String, + pub bucket_meta: Option>, + pub txn_extra_op: Arc>, + pub txn_ext: Arc, + pub read_progress: Arc, + pub pending_remove: bool, + /// Indicates whether the peer is waiting data. See more in `Peer`. 
+ pub wait_data: bool, + + // `track_ver` used to keep the local `ReadDelegate` in `LocalReader` + // up-to-date with the global `ReadDelegate` stored at `StoreMeta` + pub track_ver: TrackVer, +} + impl ReadDelegate { - pub fn from_peer(peer: &Peer) -> ReadDelegate { + pub fn from_peer(peer: &Peer) -> Self { let region = peer.region().clone(); let region_id = region.get_id(); let peer_id = peer.peer.get_id(); @@ -230,7 +430,7 @@ impl ReadDelegate { region: Arc::new(region), peer_id, term: peer.term(), - applied_index_term: peer.get_store().applied_index_term(), + applied_term: peer.get_store().applied_term(), leader_lease: None, last_valid_ts: Timespec::new(0, 0), tag: format!("[region {}] {}", region_id, peer_id), @@ -238,12 +438,42 @@ impl ReadDelegate { txn_ext: peer.txn_ext.clone(), read_progress: peer.read_progress.clone(), pending_remove: false, + wait_data: false, bucket_meta: peer.region_buckets.as_ref().map(|b| b.meta.clone()), track_ver: TrackVer::new(), } } - fn fresh_valid_ts(&mut self) { + pub fn new( + peer_id: u64, + term: u64, + region: Region, + applied_term: u64, + txn_extra_op: Arc>, + txn_ext: Arc, + read_progress: Arc, + bucket_meta: Option>, + ) -> Self { + let region_id = region.id; + ReadDelegate { + region: Arc::new(region), + peer_id, + term, + applied_term, + leader_lease: None, + last_valid_ts: Timespec::new(0, 0), + tag: format!("[region {}] {}", region_id, peer_id), + txn_extra_op, + txn_ext, + read_progress, + pending_remove: false, + wait_data: false, + bucket_meta, + track_ver: TrackVer::new(), + } + } + + pub fn fresh_valid_ts(&mut self) { self.last_valid_ts = monotonic_raw_now(); } @@ -262,8 +492,8 @@ impl ReadDelegate { Progress::Term(term) => { self.term = term; } - Progress::AppliedIndexTerm(applied_index_term) => { - self.applied_index_term = applied_index_term; + Progress::AppliedTerm(applied_term) => { + self.applied_term = applied_term; } Progress::LeaderLease(leader_lease) => { self.leader_lease = Some(leader_lease); @@ 
-271,26 +501,31 @@ impl ReadDelegate { Progress::RegionBuckets(bucket_meta) => { self.bucket_meta = Some(bucket_meta); } + Progress::WaitData(wait_data) => { + self.wait_data = wait_data; + } } } + pub fn need_renew_lease(&self, ts: Timespec) -> bool { + self.leader_lease + .as_ref() + .map(|lease| lease.need_renew(ts)) + .unwrap_or(false) + } + // If the remote lease will be expired in near future send message - // to `raftstore` renew it - fn maybe_renew_lease_advance( + // to `raftstore` to renew it + pub fn maybe_renew_lease_advance( &self, router: &dyn CasualRouter, ts: Timespec, - metrics: &mut ReadMetrics, ) { - if !self - .leader_lease - .as_ref() - .map(|lease| lease.need_renew(ts)) - .unwrap_or(false) - { + if !self.need_renew_lease(ts) { return; } - metrics.renew_lease_advance += 1; + + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().renew_lease_advance.inc()); let region_id = self.region.get_id(); if let Err(e) = router.send(region_id, CasualMessage::RenewLease) { debug!( @@ -301,18 +536,22 @@ impl ReadDelegate { } } - fn is_in_leader_lease(&self, ts: Timespec, metrics: &mut ReadMetrics) -> bool { + pub fn is_in_leader_lease(&self, ts: Timespec) -> bool { + fail_point!("perform_read_local", |_| true); + if let Some(ref lease) = self.leader_lease { let term = lease.term(); if term == self.term { if lease.inspect(Some(ts)) == LeaseState::Valid { + fail_point!("after_pass_lease_check"); return true; } else { - metrics.rejected_by_lease_expire += 1; + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().reject_reason.lease_expire.inc()); debug!("rejected by lease expire"; "tag" => &self.tag); } } else { - metrics.rejected_by_term_mismatch += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.term_mismatch.inc()); debug!("rejected by term mismatch"; "tag" => &self.tag); } } @@ -320,45 +559,43 @@ impl ReadDelegate { false } - fn check_stale_read_safe( - &self, - read_ts: u64, - metrics: &mut ReadMetrics, - ) -> std::result::Result<(), ReadResponse> 
{ + pub fn check_stale_read_safe(&self, read_ts: u64) -> std::result::Result<(), RaftCmdResponse> { let safe_ts = self.read_progress.safe_ts(); if safe_ts >= read_ts { return Ok(()); } + // Advancing resolved ts may be expensive, only notify if read_ts - safe_ts > + // 200ms. + if TimeStamp::from(read_ts).physical() > TimeStamp::from(safe_ts).physical() + 200 { + self.read_progress.notify_advance_resolved_ts(); + } debug!( "reject stale read by safe ts"; - "tag" => &self.tag, - "safe ts" => safe_ts, - "read ts" => read_ts + "safe_ts" => safe_ts, + "read_ts" => read_ts, + "region_id" => self.region.get_id(), + "peer_id" => self.peer_id, ); - metrics.rejected_by_safe_timestamp += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.safe_ts.inc()); let mut response = cmd_resp::new_error(Error::DataIsNotReady { region_id: self.region.get_id(), peer_id: self.peer_id, safe_ts, }); cmd_resp::bind_term(&mut response, self.term); - Err(ReadResponse { - response, - snapshot: None, - txn_extra_op: TxnExtraOp::Noop, - }) + Err(response) } /// Used in some external tests. 
pub fn mock(region_id: u64) -> Self { let mut region: metapb::Region = Default::default(); region.set_id(region_id); - let read_progress = Arc::new(RegionReadProgress::new(®ion, 0, 0, "mock".to_owned())); + let read_progress = Arc::new(RegionReadProgress::new(®ion, 0, 0, 1)); ReadDelegate { region: Arc::new(region), peer_id: 1, term: 1, - applied_index_term: 1, + applied_term: 1, leader_lease: None, last_valid_ts: Timespec::new(0, 0), tag: format!("[region {}] {}", region_id, 1), @@ -366,6 +603,7 @@ impl ReadDelegate { txn_ext: Default::default(), read_progress, pending_remove: false, + wait_data: false, track_ver: TrackVer::new(), bucket_meta: None, } @@ -377,23 +615,25 @@ impl Display for ReadDelegate { write!( f, "ReadDelegate for region {}, \ - leader {} at term {}, applied_index_term {}, has lease {}", + leader {} at term {}, applied_term {}, has lease {}", self.region.get_id(), self.peer_id, self.term, - self.applied_index_term, + self.applied_term, self.leader_lease.is_some(), ) } } +/// #[RaftstoreCommon] #[derive(Debug)] pub enum Progress { Region(metapb::Region), Term(u64), - AppliedIndexTerm(u64), + AppliedTerm(u64), LeaderLease(RemoteLease), RegionBuckets(Arc), + WaitData(bool), } impl Progress { @@ -405,8 +645,8 @@ impl Progress { Progress::Term(term) } - pub fn applied_index_term(applied_index_term: u64) -> Progress { - Progress::AppliedIndexTerm(applied_index_term) + pub fn applied_term(applied_term: u64) -> Progress { + Progress::AppliedTerm(applied_term) } pub fn leader_lease(lease: RemoteLease) -> Progress { @@ -416,131 +656,100 @@ impl Progress { pub fn region_buckets(bucket_meta: Arc) -> Progress { Progress::RegionBuckets(bucket_meta) } + + pub fn wait_data(wait_data: bool) -> Progress { + Progress::WaitData(wait_data) + } } -pub struct LocalReader +struct SnapCache where - C: ProposalRouter + CasualRouter, E: KvEngine, { - store_id: Cell>, - store_meta: Arc>, - kv_engine: E, - metrics: ReadMetrics, - // region id -> ReadDelegate - // The use of 
`Arc` here is a workaround, see the comment at `get_delegate` - delegates: LruCache>, - snap_cache: Option>, - cache_read_id: ThreadReadId, - // A channel to raftstore. - router: C, + cached_read_id: Option, + snapshot: Option>, + cached_snapshot_ts: Timespec, } -impl ReadExecutor for LocalReader +impl SnapCache where - C: ProposalRouter + CasualRouter, E: KvEngine, { - fn get_engine(&self) -> &E { - &self.kv_engine + fn new() -> Self { + SnapCache { + cached_read_id: None, + snapshot: None, + cached_snapshot_ts: Timespec::new(0, 0), + } } - fn get_snapshot(&mut self, create_time: Option) -> Arc { - self.metrics.local_executed_requests += 1; - if let Some(ts) = create_time { - if ts == self.cache_read_id { - if let Some(snap) = self.snap_cache.as_ref() { - self.metrics.local_executed_snapshot_cache_hit += 1; - return snap.clone(); - } - } - let snap = Arc::new(self.kv_engine.snapshot()); - self.cache_read_id = ts; - self.snap_cache = Some(snap.clone()); - return snap; - } - Arc::new(self.kv_engine.snapshot()) + fn clear(&mut self) { + self.cached_read_id.take(); + self.snapshot.take(); } } -impl LocalReader +impl Clone for SnapCache where - C: ProposalRouter + CasualRouter, E: KvEngine, { - pub fn new(kv_engine: E, store_meta: Arc>, router: C) -> Self { - let cache_read_id = ThreadReadId::new(); - LocalReader { + fn clone(&self) -> Self { + Self { + cached_read_id: self.cached_read_id.clone(), + snapshot: self.snapshot.clone(), + cached_snapshot_ts: self.cached_snapshot_ts, + } + } +} + +/// #[RaftstoreCommon]: LocalReader is an entry point where local read requests are dipatch to the +/// relevant regions by LocalReader so that these requests can be handled by the +/// relevant ReadDelegate respectively. 
+pub struct LocalReaderCore { + pub store_id: Cell>, + store_meta: S, + pub delegates: LruCache, +} + +impl LocalReaderCore +where + D: Deref + Clone, + S: ReadExecutorProvider, +{ + pub fn new(store_meta: S) -> Self { + LocalReaderCore { store_meta, - kv_engine, - router, - snap_cache: None, - cache_read_id, store_id: Cell::new(None), - metrics: Default::default(), delegates: LruCache::with_capacity_and_sample(0, 7), } } - fn redirect(&mut self, mut cmd: RaftCommand) { - debug!("localreader redirects command"; "command" => ?cmd); - let region_id = cmd.request.get_header().get_region_id(); - let mut err = errorpb::Error::default(); - match ProposalRouter::send(&self.router, cmd) { - Ok(()) => return, - Err(TrySendError::Full(c)) => { - self.metrics.rejected_by_channel_full += 1; - err.set_message(RAFTSTORE_IS_BUSY.to_owned()); - err.mut_server_is_busy() - .set_reason(RAFTSTORE_IS_BUSY.to_owned()); - cmd = c; - } - Err(TrySendError::Disconnected(c)) => { - self.metrics.rejected_by_no_region += 1; - err.set_message(format!("region {} is missing", region_id)); - err.mut_region_not_found().set_region_id(region_id); - cmd = c; - } - } - - let mut resp = RaftCmdResponse::default(); - resp.mut_header().set_error(err); - let read_resp = ReadResponse { - response: resp, - snapshot: None, - txn_extra_op: TxnExtraOp::Noop, - }; - - cmd.callback.invoke_read(read_resp); + pub fn store_meta(&self) -> &S { + &self.store_meta } - // Ideally `get_delegate` should return `Option<&ReadDelegate>`, but if so the lifetime of - // the returned `&ReadDelegate` will bind to `self`, and make it impossible to use `&mut self` - // while the `&ReadDelegate` is alive, a better choice is use `Rc` but `LocalReader: Send` will be - // violated, which is required by `LocalReadRouter: Send`, use `Arc` will introduce extra cost but + // Ideally `get_delegate` should return `Option<&ReadDelegate>`, but if so the + // lifetime of the returned `&ReadDelegate` will bind to `self`, and make it + // 
impossible to use `&mut self` while the `&ReadDelegate` is alive, a better + // choice is use `Rc` but `LocalReader: Send` will be violated, which is + // required by `LocalReadRouter: Send`, use `Arc` will introduce extra cost but // make the logic clear - fn get_delegate(&mut self, region_id: u64) -> Option> { + pub fn get_delegate(&mut self, region_id: u64) -> Option { let rd = match self.delegates.get(®ion_id) { // The local `ReadDelegate` is up to date - Some(d) if !d.track_ver.any_new() => Some(Arc::clone(d)), + Some(d) if !d.track_ver.any_new() => Some(d.clone()), _ => { debug!("update local read delegate"; "region_id" => region_id); - self.metrics.rejected_by_cache_miss += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.cache_miss.inc()); - let (meta_len, meta_reader) = { - let meta = self.store_meta.lock().unwrap(); - ( - meta.readers.len(), - meta.readers.get(®ion_id).cloned().map(Arc::new), - ) - }; + let (meta_len, meta_reader) = { self.store_meta.get_executor_and_len(region_id) }; // Remove the stale delegate self.delegates.remove(®ion_id); self.delegates.resize(meta_len); match meta_reader { Some(reader) => { - self.delegates.insert(region_id, Arc::clone(&reader)); + self.delegates.insert(region_id, reader.clone()); Some(reader) } None => None, @@ -551,19 +760,16 @@ where rd.filter(|r| !r.pending_remove) } - fn pre_propose_raft_command( - &mut self, - req: &RaftCmdRequest, - ) -> Result, RequestPolicy)>> { + pub fn validate_request(&mut self, req: &RaftCmdRequest) -> Result> { // Check store id. 
if self.store_id.get().is_none() { - let store_id = self.store_meta.lock().unwrap().store_id; + let store_id = self.store_meta.store_id(); self.store_id.set(store_id); } let store_id = self.store_id.get().unwrap(); - if let Err(e) = util::check_store_id(req, store_id) { - self.metrics.rejected_by_store_id_mismatch += 1; + if let Err(e) = util::check_store_id(req.get_header(), store_id) { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.store_id_mismatch.inc()); debug!("rejected by store id not match"; "err" => %e); return Err(e); } @@ -573,7 +779,7 @@ where let delegate = match self.get_delegate(region_id) { Some(d) => d, None => { - self.metrics.rejected_by_no_region += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_region.inc()); debug!("rejected by no region"; "region_id" => region_id); return Ok(None); } @@ -582,121 +788,261 @@ where fail_point!("localreader_on_find_delegate"); // Check peer id. - if let Err(e) = util::check_peer_id(req, delegate.peer_id) { - self.metrics.rejected_by_peer_id_mismatch += 1; + if let Err(e) = util::check_peer_id(req.get_header(), delegate.peer_id) { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.peer_id_mismatch.inc()); return Err(e); } // Check term. - if let Err(e) = util::check_term(req, delegate.term) { + if let Err(e) = util::check_term(req.get_header(), delegate.term) { debug!( "check term"; "delegate_term" => delegate.term, "header_term" => req.get_header().get_term(), ); - self.metrics.rejected_by_term_mismatch += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.term_mismatch.inc()); return Err(e); } // Check region epoch. - if util::check_region_epoch(req, &delegate.region, false).is_err() { - self.metrics.rejected_by_epoch += 1; + if util::check_req_region_epoch(req, &delegate.region, false).is_err() { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.epoch.inc()); // Stale epoch, redirect it to raftstore to get the latest region. 
debug!("rejected by epoch not match"; "tag" => &delegate.tag); return Ok(None); } - let mut inspector = Inspector { - delegate: &delegate, - metrics: &mut self.metrics, - }; - match inspector.inspect(req) { - Ok(RequestPolicy::ReadLocal) => Ok(Some((delegate, RequestPolicy::ReadLocal))), - Ok(RequestPolicy::StaleRead) => Ok(Some((delegate, RequestPolicy::StaleRead))), - // It can not handle other policies. - Ok(_) => Ok(None), - Err(e) => Err(e), + // Check witness + if find_peer_by_id(&delegate.region, delegate.peer_id).map_or(true, |p| p.is_witness) { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.witness.inc()); + return Err(Error::IsWitness(region_id)); + } + + // Check non-witness hasn't finish applying snapshot yet. + if delegate.wait_data { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.wait_data.inc()); + return Err(Error::IsWitness(region_id)); + } + + // Check whether the region is in the flashback state and the local read could + // be performed. + let is_in_flashback = delegate.region.is_in_flashback; + let flashback_start_ts = delegate.region.flashback_start_ts; + if let Err(e) = + util::check_flashback_state(is_in_flashback, flashback_start_ts, req, region_id, false) + { + TLS_LOCAL_READ_METRICS.with(|m| match e { + Error::FlashbackNotPrepared(_) => { + m.borrow_mut().reject_reason.flashback_not_prepared.inc() + } + Error::FlashbackInProgress(..) 
=> { + m.borrow_mut().reject_reason.flashback_in_progress.inc() + } + _ => unreachable!(), + }); + debug!("rejected by flashback state"; "is_in_flashback" => is_in_flashback, "tag" => &delegate.tag); + return Ok(None); + } + + Ok(Some(delegate)) + } +} + +impl Clone for LocalReaderCore +where + S: Clone, +{ + fn clone(&self) -> Self { + LocalReaderCore { + store_meta: self.store_meta.clone(), + store_id: self.store_id.clone(), + delegates: LruCache::with_capacity_and_sample(0, 7), } } +} + +pub struct LocalReader +where + E: KvEngine, + C: ProposalRouter + CasualRouter, +{ + local_reader: LocalReaderCore, StoreMetaDelegate>, + kv_engine: E, + snap_cache: SnapCache, + // A channel to raftstore. + router: C, +} + +impl LocalReader +where + E: KvEngine, + C: ProposalRouter + CasualRouter, +{ + pub fn new(kv_engine: E, store_meta: StoreMetaDelegate, router: C) -> Self { + Self { + local_reader: LocalReaderCore::new(store_meta), + kv_engine, + snap_cache: SnapCache::new(), + router, + } + } + + pub fn pre_propose_raft_command( + &mut self, + req: &RaftCmdRequest, + ) -> Result, RequestPolicy)>> { + if let Some(delegate) = self.local_reader.validate_request(req)? { + let mut inspector = Inspector { + delegate: &delegate, + }; + match inspector.inspect(req) { + Ok(RequestPolicy::ReadLocal) => Ok(Some((delegate, RequestPolicy::ReadLocal))), + Ok(RequestPolicy::StaleRead) => Ok(Some((delegate, RequestPolicy::StaleRead))), + // It can not handle other policies. 
+ Ok(_) => Ok(None), + Err(e) => Err(e), + } + } else { + Ok(None) + } + } + + fn redirect(&mut self, mut cmd: RaftCommand) { + debug!("localreader redirects command"; "command" => ?cmd); + let region_id = cmd.request.get_header().get_region_id(); + let mut err = errorpb::Error::default(); + match ProposalRouter::send(&self.router, cmd) { + Ok(()) => return, + Err(TrySendError::Full(c)) => { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.channel_full.inc()); + err.set_message(RAFTSTORE_IS_BUSY.to_owned()); + err.mut_server_is_busy() + .set_reason(RAFTSTORE_IS_BUSY.to_owned()); + cmd = c; + } + Err(TrySendError::Disconnected(c)) => { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_region.inc()); + err.set_message(format!("region {} is missing", region_id)); + err.mut_region_not_found().set_region_id(region_id); + cmd = c; + } + } + + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(err); + let read_resp = ReadResponse { + response: resp, + snapshot: None, + txn_extra_op: TxnExtraOp::Noop, + }; + + cmd.callback.set_result(read_resp); + } pub fn propose_raft_command( &mut self, - mut read_id: Option, + read_id: Option, req: RaftCmdRequest, cb: Callback, ) { match self.pre_propose_raft_command(&req) { - Ok(Some((delegate, policy))) => { + Ok(Some((mut delegate, policy))) => { + let snap_updated; + let last_valid_ts = delegate.last_valid_ts; let mut response = match policy { // Leader can read local if and only if it is in lease. RequestPolicy::ReadLocal => { - let snapshot_ts = match read_id.as_mut() { - // If this peer became Leader not long ago and just after the cached - // snapshot was created, this snapshot can not see all data of the peer. 
- Some(id) => { - if id.create_time <= delegate.last_valid_ts { - id.create_time = monotonic_raw_now(); - } - id.create_time - } - None => monotonic_raw_now(), - }; - if !delegate.is_in_leader_lease(snapshot_ts, &mut self.metrics) { + let mut local_read_ctx = + LocalReadContext::new(&mut self.snap_cache, read_id); + + snap_updated = local_read_ctx + .maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); + + let snapshot_ts = local_read_ctx.snapshot_ts().unwrap(); + if !delegate.is_in_leader_lease(snapshot_ts) { + fail_point!("localreader_before_redirect", |_| {}); // Forward to raftstore. self.redirect(RaftCommand::new(req, cb)); return; } - let response = self.execute(&req, &delegate.region, None, read_id); + + let region = Arc::clone(&delegate.region); + let mut response = + delegate.execute(&req, ®ion, None, Some(local_read_ctx)); + if let Some(snap) = response.snapshot.as_mut() { + snap.bucket_meta = delegate.bucket_meta.clone(); + } // Try renew lease in advance - delegate.maybe_renew_lease_advance( - &self.router, - snapshot_ts, - &mut self.metrics, - ); + delegate.maybe_renew_lease_advance(&self.router, snapshot_ts); response } // Replica can serve stale read if and only if its `safe_ts` >= `read_ts` RequestPolicy::StaleRead => { let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); - assert!(read_ts > 0); - if let Err(resp) = - delegate.check_stale_read_safe(read_ts, &mut self.metrics) - { - cb.invoke_read(resp); + if let Err(resp) = delegate.check_stale_read_safe(read_ts) { + cb.set_result(ReadResponse { + response: resp, + snapshot: None, + txn_extra_op: TxnExtraOp::Noop, + }); return; } - // Getting the snapshot - let response = self.execute(&req, &delegate.region, None, read_id); + // Stale read does not use cache, so we pass None for read_id + let mut local_read_ctx = LocalReadContext::new(&mut self.snap_cache, None); + snap_updated = local_read_ctx + .maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); - // Double 
check in case `safe_ts` change after the first check and before getting snapshot - if let Err(resp) = - delegate.check_stale_read_safe(read_ts, &mut self.metrics) - { - cb.invoke_read(resp); + let region = Arc::clone(&delegate.region); + // Getting the snapshot + let mut response = + delegate.execute(&req, ®ion, None, Some(local_read_ctx)); + if let Some(snap) = response.snapshot.as_mut() { + snap.bucket_meta = delegate.bucket_meta.clone(); + } + // Double check in case `safe_ts` change after the first check and before + // getting snapshot + if let Err(resp) = delegate.check_stale_read_safe(read_ts) { + cb.set_result(ReadResponse { + response: resp, + snapshot: None, + txn_extra_op: TxnExtraOp::Noop, + }); return; } - self.metrics.local_executed_stale_read_requests += 1; + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().local_executed_stale_read_requests.inc()); response } _ => unreachable!(), }; + + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_requests.inc()); + if !snap_updated { + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().local_executed_snapshot_cache_hit.inc()); + } + cmd_resp::bind_term(&mut response.response, delegate.term); if let Some(snap) = response.snapshot.as_mut() { snap.txn_ext = Some(delegate.txn_ext.clone()); snap.bucket_meta = delegate.bucket_meta.clone(); } response.txn_extra_op = delegate.txn_extra_op.load(); - cb.invoke_read(response); + cb.set_result(response); } // Forward to raftstore. 
Ok(None) => self.redirect(RaftCommand::new(req, cb)), Err(e) => { let mut response = cmd_resp::new_error(e); - if let Some(delegate) = self.delegates.get(&req.get_header().get_region_id()) { + if let Some(delegate) = self + .local_reader + .delegates + .get(&req.get_header().get_region_id()) + { cmd_resp::bind_term(&mut response, delegate.term); } - cb.invoke_read(ReadResponse { + cb.set_result(ReadResponse { response, snapshot: None, txn_extra_op: TxnExtraOp::Noop, @@ -705,11 +1051,12 @@ where } } - /// If read requests are received at the same RPC request, we can create one snapshot for all - /// of them and check whether the time when the snapshot was created is in lease. We use - /// ThreadReadId to figure out whether this RaftCommand comes from the same RPC request with - /// the last RaftCommand which left a snapshot cached in LocalReader. ThreadReadId is composed - /// by thread_id and a thread_local incremental sequence. + /// If read requests are received at the same RPC request, we can create one + /// snapshot for all of them and check whether the time when the snapshot + /// was created is in lease. We use ThreadReadId to figure out whether this + /// RaftCommand comes from the same RPC request with the last RaftCommand + /// which left a snapshot cached in LocalReader. ThreadReadId is composed by + /// thread_id and a thread_local incremental sequence. 
#[inline] pub fn read( &mut self, @@ -718,210 +1065,88 @@ where cb: Callback, ) { self.propose_raft_command(read_id, req, cb); - self.metrics.maybe_flush(); + maybe_tls_local_read_metrics_flush(); } pub fn release_snapshot_cache(&mut self) { - self.snap_cache.take(); + self.snap_cache.clear(); } } -impl Clone for LocalReader +impl Clone for LocalReader where - C: ProposalRouter + CasualRouter + Clone, E: KvEngine, + C: ProposalRouter + CasualRouter + Clone, { fn clone(&self) -> Self { - LocalReader { - store_meta: self.store_meta.clone(), + Self { + local_reader: self.local_reader.clone(), kv_engine: self.kv_engine.clone(), - router: self.router.clone(), - store_id: self.store_id.clone(), - metrics: Default::default(), - delegates: LruCache::with_capacity_and_sample(0, 7), snap_cache: self.snap_cache.clone(), - cache_read_id: self.cache_read_id.clone(), + router: self.router.clone(), } } } -struct Inspector<'r, 'm> { +impl ReadExecutor for CachedReadDelegate +where + E: KvEngine, +{ + type Tablet = E; + + fn get_tablet(&mut self) -> &E { + &self.kv_engine + } + + fn get_snapshot(&mut self, read_context: &Option>) -> Arc { + read_context.as_ref().unwrap().snapshot().unwrap() + } +} + +/// #[RaftstoreCommon] +struct Inspector<'r> { delegate: &'r ReadDelegate, - metrics: &'m mut ReadMetrics, } -impl<'r, 'm> RequestInspector for Inspector<'r, 'm> { +impl<'r> RequestInspector for Inspector<'r> { fn has_applied_to_current_term(&mut self) -> bool { - if self.delegate.applied_index_term == self.delegate.term { + if self.delegate.applied_term == self.delegate.term { true } else { debug!( "rejected by term check"; "tag" => &self.delegate.tag, - "applied_index_term" => self.delegate.applied_index_term, + "applied_term" => self.delegate.applied_term, "delegate_term" => ?self.delegate.term, ); // only for metric. 
- self.metrics.rejected_by_applied_term += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.applied_term.inc()); false } } - fn inspect_lease(&mut self) -> LeaseState { - // TODO: disable localreader if we did not enable raft's check_quorum. - if self.delegate.leader_lease.is_some() { - // We skip lease check, because it is postponed until `handle_read`. - LeaseState::Valid - } else { - debug!("rejected by leader lease"; "tag" => &self.delegate.tag); - self.metrics.rejected_by_no_lease += 1; - LeaseState::Expired - } - } -} - -const METRICS_FLUSH_INTERVAL: u64 = 15_000; // 15s - -#[derive(Clone)] -struct ReadMetrics { - local_executed_requests: u64, - local_executed_stale_read_requests: u64, - local_executed_snapshot_cache_hit: u64, - // TODO: record rejected_by_read_quorum. - rejected_by_store_id_mismatch: u64, - rejected_by_peer_id_mismatch: u64, - rejected_by_term_mismatch: u64, - rejected_by_lease_expire: u64, - rejected_by_no_region: u64, - rejected_by_no_lease: u64, - rejected_by_epoch: u64, - rejected_by_applied_term: u64, - rejected_by_channel_full: u64, - rejected_by_cache_miss: u64, - rejected_by_safe_timestamp: u64, - renew_lease_advance: u64, - - last_flush_time: Instant, -} - -impl Default for ReadMetrics { - fn default() -> ReadMetrics { - ReadMetrics { - local_executed_requests: 0, - local_executed_stale_read_requests: 0, - local_executed_snapshot_cache_hit: 0, - rejected_by_store_id_mismatch: 0, - rejected_by_peer_id_mismatch: 0, - rejected_by_term_mismatch: 0, - rejected_by_lease_expire: 0, - rejected_by_no_region: 0, - rejected_by_no_lease: 0, - rejected_by_epoch: 0, - rejected_by_applied_term: 0, - rejected_by_channel_full: 0, - rejected_by_cache_miss: 0, - rejected_by_safe_timestamp: 0, - renew_lease_advance: 0, - last_flush_time: Instant::now(), - } - } -} - -impl ReadMetrics { - pub fn maybe_flush(&mut self) { - if self.last_flush_time.saturating_elapsed() - >= Duration::from_millis(METRICS_FLUSH_INTERVAL) - { - self.flush(); 
- self.last_flush_time = Instant::now(); - } - } - - fn flush(&mut self) { - if self.rejected_by_store_id_mismatch > 0 { - LOCAL_READ_REJECT - .store_id_mismatch - .inc_by(self.rejected_by_store_id_mismatch); - self.rejected_by_store_id_mismatch = 0; - } - if self.rejected_by_peer_id_mismatch > 0 { - LOCAL_READ_REJECT - .peer_id_mismatch - .inc_by(self.rejected_by_peer_id_mismatch); - self.rejected_by_peer_id_mismatch = 0; - } - if self.rejected_by_term_mismatch > 0 { - LOCAL_READ_REJECT - .term_mismatch - .inc_by(self.rejected_by_term_mismatch); - self.rejected_by_term_mismatch = 0; - } - if self.rejected_by_lease_expire > 0 { - LOCAL_READ_REJECT - .lease_expire - .inc_by(self.rejected_by_lease_expire); - self.rejected_by_lease_expire = 0; - } - if self.rejected_by_no_region > 0 { - LOCAL_READ_REJECT - .no_region - .inc_by(self.rejected_by_no_region); - self.rejected_by_no_region = 0; - } - if self.rejected_by_no_lease > 0 { - LOCAL_READ_REJECT.no_lease.inc_by(self.rejected_by_no_lease); - self.rejected_by_no_lease = 0; - } - if self.rejected_by_epoch > 0 { - LOCAL_READ_REJECT.epoch.inc_by(self.rejected_by_epoch); - self.rejected_by_epoch = 0; - } - if self.rejected_by_applied_term > 0 { - LOCAL_READ_REJECT - .applied_term - .inc_by(self.rejected_by_applied_term); - self.rejected_by_applied_term = 0; - } - if self.rejected_by_channel_full > 0 { - LOCAL_READ_REJECT - .channel_full - .inc_by(self.rejected_by_channel_full); - self.rejected_by_channel_full = 0; - } - if self.rejected_by_safe_timestamp > 0 { - LOCAL_READ_REJECT - .safe_ts - .inc_by(self.rejected_by_safe_timestamp); - self.rejected_by_safe_timestamp = 0; - } - if self.local_executed_snapshot_cache_hit > 0 { - LOCAL_READ_EXECUTED_CACHE_REQUESTS.inc_by(self.local_executed_snapshot_cache_hit); - self.local_executed_snapshot_cache_hit = 0; - } - if self.local_executed_requests > 0 { - LOCAL_READ_EXECUTED_REQUESTS.inc_by(self.local_executed_requests); - self.local_executed_requests = 0; - } - if 
self.local_executed_stale_read_requests > 0 { - LOCAL_READ_EXECUTED_STALE_READ_REQUESTS.inc_by(self.local_executed_stale_read_requests); - self.local_executed_stale_read_requests = 0; - } - if self.renew_lease_advance > 0 { - LOCAL_READ_RENEW_LEASE_ADVANCE_COUNTER.inc_by(self.renew_lease_advance); - self.renew_lease_advance = 0; + fn inspect_lease(&mut self) -> LeaseState { + // TODO: disable localreader if we did not enable raft's check_quorum. + if self.delegate.leader_lease.is_some() { + // We skip lease check, because it is postponed until `handle_read`. + LeaseState::Valid + } else { + debug!("rejected by leader lease"; "tag" => &self.delegate.tag); + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_lease.inc()); + LeaseState::Expired } } } #[cfg(test)] mod tests { - use std::{sync::mpsc::*, thread}; + use std::{ops::Add, sync::mpsc::*, thread}; use crossbeam::channel::TrySendError; use engine_test::kv::{KvTestEngine, KvTestSnapshot}; - use engine_traits::ALL_CFS; - use kvproto::raft_cmdpb::*; + use engine_traits::{MiscExt, Peekable, SyncMutable, ALL_CFS}; + use kvproto::{metapb::RegionEpoch, raft_cmdpb::*}; use tempfile::{Builder, TempDir}; use tikv_util::{codec::number::NumberEncoder, time::monotonic_raw_now}; use time::Duration; @@ -977,15 +1202,14 @@ mod tests { store_meta: Arc>, ) -> ( TempDir, - LocalReader, + LocalReader, Receiver>, ) { let path = Builder::new().prefix(path).tempdir().unwrap(); - let db = engine_test::kv::new_engine(path.path().to_str().unwrap(), None, ALL_CFS, None) - .unwrap(); + let db = engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); let (ch, rx, _) = MockRouter::new(); - let mut reader = LocalReader::new(db, store_meta, ch); - reader.store_id = Cell::new(Some(store_id)); + let mut reader = LocalReader::new(db.clone(), StoreMetaDelegate::new(store_meta, db), ch); + reader.local_reader.store_id = Cell::new(Some(store_id)); (path, reader, rx) } @@ -1002,14 +1226,14 @@ mod tests { } fn 
must_redirect( - reader: &mut LocalReader, + reader: &mut LocalReader, rx: &Receiver>, cmd: RaftCmdRequest, ) { reader.propose_raft_command( None, cmd.clone(), - Callback::Read(Box::new(|resp| { + Callback::read(Box::new(|resp| { panic!("unexpected invoke, {:?}", resp); })), ); @@ -1022,11 +1246,20 @@ mod tests { } fn must_not_redirect( - reader: &mut LocalReader, + reader: &mut LocalReader, + rx: &Receiver>, + task: RaftCommand, + ) { + must_not_redirect_with_read_id(reader, rx, task, None); + } + + fn must_not_redirect_with_read_id( + reader: &mut LocalReader, rx: &Receiver>, task: RaftCommand, + read_id: Option, ) { - reader.propose_raft_command(None, task.request, task.callback); + reader.propose_raft_command(read_id, task.request, task.callback); assert_eq!(rx.try_recv().unwrap_err(), TryRecvError::Empty); } @@ -1056,7 +1289,7 @@ mod tests { region1.set_region_epoch(epoch13.clone()); let term6 = 6; let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); // 1s is long enough. - let read_progress = Arc::new(RegionReadProgress::new(®ion1, 1, 1, "".to_owned())); + let read_progress = Arc::new(RegionReadProgress::new(®ion1, 1, 1, 1)); let mut cmd = RaftCmdRequest::default(); let mut header = RaftRequestHeader::default(); @@ -1071,14 +1304,20 @@ mod tests { // The region is not register yet. must_redirect(&mut reader, &rx, cmd.clone()); - assert_eq!(reader.metrics.rejected_by_no_region, 1); - assert_eq!(reader.metrics.rejected_by_cache_miss, 1); - assert!(reader.delegates.get(&1).is_none()); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.no_region.get()), + 1 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 1 + ); + assert!(reader.local_reader.delegates.get(&1).is_none()); // Register region 1 lease.renew(monotonic_raw_now()); let remote = lease.maybe_new_remote_lease(term6).unwrap(); - // But the applied_index_term is stale. + // But the applied_term is stale. 
{ let mut meta = store_meta.lock().unwrap(); let read_delegate = ReadDelegate { @@ -1086,39 +1325,49 @@ mod tests { region: Arc::new(region1.clone()), peer_id: leader2.get_id(), term: term6, - applied_index_term: term6 - 1, + applied_term: term6 - 1, leader_lease: Some(remote), last_valid_ts: Timespec::new(0, 0), txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::default())), txn_ext: Arc::new(TxnExt::default()), read_progress: read_progress.clone(), pending_remove: false, + wait_data: false, track_ver: TrackVer::new(), bucket_meta: None, }; meta.readers.insert(1, read_delegate); } - // The applied_index_term is stale + // The applied_term is stale must_redirect(&mut reader, &rx, cmd.clone()); - assert_eq!(reader.metrics.rejected_by_cache_miss, 2); - assert_eq!(reader.metrics.rejected_by_applied_term, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 2 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.applied_term.get()), + 1 + ); - // Make the applied_index_term matches current term. - let pg = Progress::applied_index_term(term6); + // Make the applied_term matches current term. + let pg = Progress::applied_term(term6); { let mut meta = store_meta.lock().unwrap(); meta.readers.get_mut(&1).unwrap().update(pg); } let task = - RaftCommand::::new(cmd.clone(), Callback::Read(Box::new(move |_| {}))); + RaftCommand::::new(cmd.clone(), Callback::read(Box::new(move |_| {}))); must_not_redirect(&mut reader, &rx, task); - assert_eq!(reader.metrics.rejected_by_cache_miss, 3); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 3 + ); // Let's read. let task = RaftCommand::::new( cmd.clone(), - Callback::Read(Box::new(move |resp: ReadResponse| { + Callback::read(Box::new(move |resp: ReadResponse| { let snap = resp.snapshot.unwrap(); assert_eq!(snap.get_region(), ®ion1); })), @@ -1128,7 +1377,10 @@ mod tests { // Wait for expiration. 
thread::sleep(Duration::seconds(1).to_std().unwrap()); must_redirect(&mut reader, &rx, cmd.clone()); - assert_eq!(reader.metrics.rejected_by_lease_expire, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.lease_expire.get()), + 1 + ); // Renew lease. lease.renew(monotonic_raw_now()); @@ -1142,14 +1394,20 @@ mod tests { reader.propose_raft_command( None, cmd_store_id, - Callback::Read(Box::new(move |resp: ReadResponse| { + Callback::read(Box::new(move |resp: ReadResponse| { let err = resp.response.get_header().get_error(); assert!(err.has_store_not_match()); assert!(resp.snapshot.is_none()); })), ); - assert_eq!(reader.metrics.rejected_by_store_id_mismatch, 1); - assert_eq!(reader.metrics.rejected_by_cache_miss, 3); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.store_id_mismatch.get()), + 1 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 3 + ); // metapb::Peer id mismatch. let mut cmd_peer_id = cmd.clone(); @@ -1160,7 +1418,7 @@ mod tests { reader.propose_raft_command( None, cmd_peer_id, - Callback::Read(Box::new(move |resp: ReadResponse| { + Callback::read(Box::new(move |resp: ReadResponse| { assert!( resp.response.get_header().has_error(), "{:?}", @@ -1169,7 +1427,10 @@ mod tests { assert!(resp.snapshot.is_none()); })), ); - assert_eq!(reader.metrics.rejected_by_peer_id_mismatch, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.peer_id_mismatch.get()), + 1 + ); // Read quorum. 
let mut cmd_read_quorum = cmd.clone(); @@ -1182,13 +1443,16 @@ mod tests { reader.propose_raft_command( None, cmd_term, - Callback::Read(Box::new(move |resp: ReadResponse| { + Callback::read(Box::new(move |resp: ReadResponse| { let err = resp.response.get_header().get_error(); assert!(err.has_stale_command(), "{:?}", resp); assert!(resp.snapshot.is_none()); })), ); - assert_eq!(reader.metrics.rejected_by_term_mismatch, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.term_mismatch.get()), + 1 + ); // Stale epoch. let mut epoch12 = epoch13; @@ -1196,15 +1460,19 @@ mod tests { let mut cmd_epoch = cmd.clone(); cmd_epoch.mut_header().set_region_epoch(epoch12); must_redirect(&mut reader, &rx, cmd_epoch); - assert_eq!(reader.metrics.rejected_by_epoch, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.epoch.get()), + 1 + ); // Expire lease manually, and it can not be renewed. - let previous_lease_rejection = reader.metrics.rejected_by_lease_expire; + let previous_lease_rejection = + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.lease_expire.get()); lease.expire(); lease.renew(monotonic_raw_now()); must_redirect(&mut reader, &rx, cmd.clone()); assert_eq!( - reader.metrics.rejected_by_lease_expire, + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.lease_expire.get()), previous_lease_rejection + 1 ); @@ -1213,7 +1481,7 @@ mod tests { reader.propose_raft_command( None, cmd.clone(), - Callback::Read(Box::new(move |resp: ReadResponse| { + Callback::read(Box::new(move |resp: ReadResponse| { let err = resp.response.get_header().get_error(); assert!(err.has_server_is_busy(), "{:?}", resp); assert!(resp.snapshot.is_none()); @@ -1221,10 +1489,14 @@ mod tests { ); rx.try_recv().unwrap(); assert_eq!(rx.try_recv().unwrap_err(), TryRecvError::Empty); - assert_eq!(reader.metrics.rejected_by_channel_full, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.channel_full.get()), + 1 + ); // 
Reject by term mismatch in lease. - let previous_term_rejection = reader.metrics.rejected_by_term_mismatch; + let previous_term_rejection = + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.term_mismatch.get()); let mut cmd9 = cmd.clone(); cmd9.mut_header().set_term(term6 + 3); { @@ -1236,12 +1508,12 @@ mod tests { meta.readers .get_mut(&1) .unwrap() - .update(Progress::applied_index_term(term6 + 3)); + .update(Progress::applied_term(term6 + 3)); } reader.propose_raft_command( None, cmd9.clone(), - Callback::Read(Box::new(|resp| { + Callback::read(Box::new(|resp| { panic!("unexpected invoke, {:?}", resp); })), ); @@ -1252,10 +1524,13 @@ mod tests { cmd9 ); assert_eq!( - reader.metrics.rejected_by_term_mismatch, - previous_term_rejection + 1, + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.term_mismatch.get()), + previous_term_rejection + 1 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 4 ); - assert_eq!(reader.metrics.rejected_by_cache_miss, 4); // Stale local ReadDelegate cmd.mut_header().set_term(term6 + 3); @@ -1267,12 +1542,18 @@ mod tests { meta.readers.get_mut(&1).unwrap().update(pg); } let task = - RaftCommand::::new(cmd.clone(), Callback::Read(Box::new(move |_| {}))); + RaftCommand::::new(cmd.clone(), Callback::read(Box::new(move |_| {}))); must_not_redirect(&mut reader, &rx, task); - assert_eq!(reader.metrics.rejected_by_cache_miss, 5); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 5 + ); // Stale read - assert_eq!(reader.metrics.rejected_by_safe_timestamp, 0); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.safe_ts.get()), + 0 + ); read_progress.update_safe_ts(1, 1); assert_eq!(read_progress.safe_ts(), 1); @@ -1286,33 +1567,39 @@ mod tests { cmd.mut_header().set_flag_data(data.into()); let task = RaftCommand::::new( cmd.clone(), - Callback::Read(Box::new(move |resp: ReadResponse| { + Callback::read(Box::new(move 
|resp: ReadResponse| { let err = resp.response.get_header().get_error(); assert!(err.has_data_is_not_ready()); assert!(resp.snapshot.is_none()); })), ); must_not_redirect(&mut reader, &rx, task); - assert_eq!(reader.metrics.rejected_by_safe_timestamp, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.safe_ts.get()), + 1 + ); read_progress.update_safe_ts(1, 2); assert_eq!(read_progress.safe_ts(), 2); - let task = RaftCommand::::new(cmd, Callback::Read(Box::new(move |_| {}))); + let task = RaftCommand::::new(cmd, Callback::read(Box::new(move |_| {}))); must_not_redirect(&mut reader, &rx, task); - assert_eq!(reader.metrics.rejected_by_safe_timestamp, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.safe_ts.get()), + 1 + ); // Remove invalid delegate let reader_clone = store_meta.lock().unwrap().readers.get(&1).unwrap().clone(); - assert!(reader.get_delegate(1).is_some()); + assert!(reader.local_reader.get_delegate(1).is_some()); // dropping the non-source `reader` will not make other readers invalid drop(reader_clone); - assert!(reader.get_delegate(1).is_some()); + assert!(reader.local_reader.get_delegate(1).is_some()); // drop the source `reader` store_meta.lock().unwrap().readers.remove(&1).unwrap(); // the invalid delegate should be removed - assert!(reader.get_delegate(1).is_none()); + assert!(reader.local_reader.get_delegate(1).is_none()); } #[test] @@ -1329,23 +1616,24 @@ mod tests { region: Arc::new(region.clone()), peer_id: 1, term: 1, - applied_index_term: 1, + applied_term: 1, leader_lease: None, last_valid_ts: Timespec::new(0, 0), txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::default())), txn_ext: Arc::new(TxnExt::default()), track_ver: TrackVer::new(), - read_progress: Arc::new(RegionReadProgress::new(®ion, 0, 0, "".to_owned())), + read_progress: Arc::new(RegionReadProgress::new(®ion, 0, 0, 1)), pending_remove: false, + wait_data: false, bucket_meta: None, }; meta.readers.insert(1, read_delegate); 
} let d = reader.local_reader.get_delegate(1).unwrap(); assert_eq!(&*d.region, &region); assert_eq!(d.term, 1); - assert_eq!(d.applied_index_term, 1); + assert_eq!(d.applied_term, 1); assert!(d.leader_lease.is_none()); drop(d); @@ -1357,22 +1645,25 @@ mod tests { .unwrap() .update(Progress::region(region.clone())); } - assert_eq!(&*reader.get_delegate(1).unwrap().region, &region); + assert_eq!( + &*reader.local_reader.get_delegate(1).unwrap().region, + &region + ); { let mut meta = store_meta.lock().unwrap(); meta.readers.get_mut(&1).unwrap().update(Progress::term(2)); } - assert_eq!(reader.get_delegate(1).unwrap().term, 2); + assert_eq!(reader.local_reader.get_delegate(1).unwrap().term, 2); { let mut meta = store_meta.lock().unwrap(); meta.readers .get_mut(&1) .unwrap() - .update(Progress::applied_index_term(2)); + .update(Progress::applied_term(2)); } - assert_eq!(reader.get_delegate(1).unwrap().applied_index_term, 2); + assert_eq!(reader.local_reader.get_delegate(1).unwrap().applied_term, 2); { let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); // 1s is long enough.
@@ -1381,7 +1672,456 @@ mod tests { let mut meta = store_meta.lock().unwrap(); meta.readers.get_mut(&1).unwrap().update(pg); } - let d = reader.get_delegate(1).unwrap(); + let d = reader.local_reader.get_delegate(1).unwrap(); assert_eq!(d.leader_lease.clone().unwrap().term(), 3); } + + #[test] + fn test_read_executor_provider() { + let path = Builder::new() + .prefix("test-local-reader") + .tempdir() + .unwrap(); + let kv_engine = + engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); + let store_meta = + StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::new(0))), kv_engine.clone()); + + { + let mut meta = store_meta.store_meta.as_ref().lock().unwrap(); + + // Create read_delegate with region id 1 + let read_delegate = ReadDelegate::mock(1); + meta.readers.insert(1, read_delegate); + + // Create read_delegate with region id 1 + let read_delegate = ReadDelegate::mock(2); + meta.readers.insert(2, read_delegate); + } + + let (len, delegate) = store_meta.get_executor_and_len(1); + assert_eq!(2, len); + let mut delegate = delegate.unwrap(); + assert_eq!(1, delegate.region.id); + let tablet = delegate.get_tablet(); + assert_eq!(kv_engine.path(), tablet.path()); + + let (len, delegate) = store_meta.get_executor_and_len(2); + assert_eq!(2, len); + let mut delegate = delegate.unwrap(); + assert_eq!(2, delegate.region.id); + let tablet = delegate.get_tablet(); + assert_eq!(kv_engine.path(), tablet.path()); + } + + fn prepare_read_delegate( + store_id: u64, + region_id: u64, + term: u64, + pr_ids: Vec, + region_epoch: RegionEpoch, + store_meta: Arc>, + ) { + let mut region = metapb::Region::default(); + region.set_id(region_id); + let prs = new_peers(store_id, pr_ids); + region.set_peers(prs.clone().into()); + + let leader = prs[0].clone(); + region.set_region_epoch(region_epoch); + let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); // 1s is long enough. 
+ let read_progress = Arc::new(RegionReadProgress::new(®ion, 1, 1, 1)); + + // Register region + lease.renew(monotonic_raw_now()); + let remote = lease.maybe_new_remote_lease(term).unwrap(); + // But the applied_term is stale. + { + let mut meta = store_meta.lock().unwrap(); + let read_delegate = ReadDelegate { + tag: String::new(), + region: Arc::new(region.clone()), + peer_id: leader.get_id(), + term, + applied_term: term, + leader_lease: Some(remote), + last_valid_ts: Timespec::new(0, 0), + txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::default())), + txn_ext: Arc::new(TxnExt::default()), + read_progress, + pending_remove: false, + wait_data: false, + track_ver: TrackVer::new(), + bucket_meta: None, + }; + meta.readers.insert(region_id, read_delegate); + } + } + + #[test] + fn test_snap_across_regions() { + let store_id = 2; + let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); + let (_tmp, mut reader, rx) = new_reader("test-local-reader", store_id, store_meta.clone()); + + let epoch13 = { + let mut ep = metapb::RegionEpoch::default(); + ep.set_conf_ver(1); + ep.set_version(3); + ep + }; + let term6 = 6; + + // Register region1 + let pr_ids1 = vec![2, 3, 4]; + let prs1 = new_peers(store_id, pr_ids1.clone()); + prepare_read_delegate( + store_id, + 1, + term6, + pr_ids1, + epoch13.clone(), + store_meta.clone(), + ); + let leader1 = prs1[0].clone(); + + // Register region2 + let pr_ids2 = vec![22, 33, 44]; + let prs2 = new_peers(store_id, pr_ids2.clone()); + prepare_read_delegate(store_id, 2, term6, pr_ids2, epoch13.clone(), store_meta); + let leader2 = prs2[0].clone(); + + let mut cmd = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_region_id(1); + header.set_peer(leader1); + header.set_region_epoch(epoch13.clone()); + header.set_term(term6); + cmd.set_header(header); + let mut req = Request::default(); + req.set_cmd_type(CmdType::Snap); + cmd.set_requests(vec![req].into()); + + let (snap_tx, snap_rx) = 
channel(); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse| { + snap_tx.send(resp.snapshot.unwrap()).unwrap(); + })), + ); + + // First request will not hit cache + let read_id = Some(ThreadReadId::new()); + must_not_redirect_with_read_id(&mut reader, &rx, task, read_id.clone()); + let snap1 = snap_rx.recv().unwrap(); + + let mut header = RaftRequestHeader::default(); + header.set_region_id(2); + header.set_peer(leader2); + header.set_region_epoch(epoch13); + header.set_term(term6); + cmd.set_header(header); + let (snap_tx, snap_rx) = channel(); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse| { + snap_tx.send(resp.snapshot.unwrap()).unwrap(); + })), + ); + must_not_redirect_with_read_id(&mut reader, &rx, task, read_id); + let snap2 = snap_rx.recv().unwrap(); + assert!(std::ptr::eq(snap1.get_snapshot(), snap2.get_snapshot())); + + // If we use a new read id, the cache will be miss and a new snapshot will be + // generated + let read_id = Some(ThreadReadId::new()); + let (snap_tx, snap_rx) = channel(); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse| { + snap_tx.send(resp.snapshot.unwrap()).unwrap(); + })), + ); + must_not_redirect_with_read_id(&mut reader, &rx, task, read_id); + let snap2 = snap_rx.recv().unwrap(); + assert!(!std::ptr::eq(snap1.get_snapshot(), snap2.get_snapshot())); + } + + fn create_engine(path: &str) -> KvTestEngine { + let path = Builder::new().prefix(path).tempdir().unwrap(); + engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap() + } + + #[test] + fn test_snap_cache_context() { + let db = create_engine("test_snap_cache_context"); + let mut snap_cache = SnapCache::new(); + let mut read_context = LocalReadContext::new(&mut snap_cache, None); + + assert!(read_context.snapshot().is_none()); + assert!(read_context.snapshot_ts().is_none()); + + db.put(b"a1", 
b"val1").unwrap(); + + let compare_ts = monotonic_raw_now(); + // Case 1: snap_cache_context.read_id is None + assert!(read_context.maybe_update_snapshot(&db, Timespec::new(0, 0))); + assert!(read_context.snapshot_ts().unwrap() > compare_ts); + assert_eq!( + read_context + .snapshot() + .unwrap() + .get_value(b"a1") + .unwrap() + .unwrap(), + b"val1" + ); + + // snap_cache_context is *not* created with read_id, so calling + // `maybe_update_snapshot` again will update the snapshot + let compare_ts = monotonic_raw_now(); + assert!(read_context.maybe_update_snapshot(&db, Timespec::new(0, 0))); + assert!(read_context.snapshot_ts().unwrap() > compare_ts); + + let read_id = ThreadReadId::new(); + let read_id_clone = read_id.clone(); + let mut read_context = LocalReadContext::new(&mut snap_cache, Some(read_id)); + + let compare_ts = monotonic_raw_now(); + // Case 2: snap_cache_context.read_id is not None but not equals to the + // snap_cache.cached_read_id + assert!(read_context.maybe_update_snapshot(&db, Timespec::new(0, 0))); + assert!(read_context.snapshot_ts().unwrap() > compare_ts); + let snap_ts = read_context.snapshot_ts().unwrap(); + assert_eq!( + read_context + .snapshot() + .unwrap() + .get_value(b"a1") + .unwrap() + .unwrap(), + b"val1" + ); + + let db2 = create_engine("test_snap_cache_context2"); + // snap_cache_context is created with read_id, so calling + // `maybe_update_snapshot` again will *not* update the snapshot + // Case 3: snap_cache_context.read_id is not None and equals to the + // snap_cache.cached_read_id + assert!(!read_context.maybe_update_snapshot(&db2, Timespec::new(0, 0))); + assert_eq!(read_context.snapshot_ts().unwrap(), snap_ts); + assert_eq!( + read_context + .snapshot() + .unwrap() + .get_value(b"a1") + .unwrap() + .unwrap(), + b"val1" + ); + + // Case 4: delegate.last_valid_ts is larger than create_time of read_id + let mut last_valid_ts = read_id_clone.create_time; + last_valid_ts = last_valid_ts.add(Duration::nanoseconds(1)); + 
assert!(read_context.maybe_update_snapshot(&db2, last_valid_ts)); + assert!(read_context.snapshot_ts().unwrap() > snap_ts); + assert!( + read_context + .snapshot() + .unwrap() + .get_value(b"a1") + .unwrap() + .is_none(), + ); + } + + #[test] + fn test_snap_release_for_not_using_cache() { + let store_id = 2; + let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); + let (_tmp, mut reader, rx) = new_reader("test-local-reader", store_id, store_meta.clone()); + reader.kv_engine.put(b"key", b"value").unwrap(); + + let epoch13 = { + let mut ep = metapb::RegionEpoch::default(); + ep.set_conf_ver(1); + ep.set_version(3); + ep + }; + let term6 = 6; + + // Register region1 + let pr_ids1 = vec![2, 3, 4]; + let prs1 = new_peers(store_id, pr_ids1.clone()); + prepare_read_delegate(store_id, 1, term6, pr_ids1, epoch13.clone(), store_meta); + let leader1 = prs1[0].clone(); + + // Local read + let mut cmd = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_region_id(1); + header.set_peer(leader1); + header.set_region_epoch(epoch13); + header.set_term(term6); + cmd.set_header(header.clone()); + let mut req = Request::default(); + req.set_cmd_type(CmdType::Snap); + cmd.set_requests(vec![req].into()); + + // using cache and release + let read_id = ThreadReadId::new(); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |_: ReadResponse| {})), + ); + must_not_redirect_with_read_id(&mut reader, &rx, task, Some(read_id)); + assert!( + reader + .kv_engine + .get_oldest_snapshot_sequence_number() + .is_some() + ); + reader.release_snapshot_cache(); + assert!( + reader + .kv_engine + .get_oldest_snapshot_sequence_number() + .is_none() + ); + + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |_: ReadResponse| {})), + ); + + // not use cache + must_not_redirect_with_read_id(&mut reader, &rx, task, None); + assert!( + reader + .kv_engine + .get_oldest_snapshot_sequence_number() + .is_none() + ); 
+ + // Stale read + let mut data = [0u8; 8]; + (&mut data[..]).encode_u64(0).unwrap(); + header.set_flags(header.get_flags() | WriteBatchFlags::STALE_READ.bits()); + header.set_flag_data(data.into()); + + cmd.set_header(header); + let task = RaftCommand::::new( + cmd, + Callback::read(Box::new(move |_: ReadResponse| {})), + ); + let read_id = ThreadReadId::new(); + must_not_redirect_with_read_id(&mut reader, &rx, task, Some(read_id)); + // Stale read will not use snap cache + assert!(reader.snap_cache.snapshot.is_none()); + assert!( + reader + .kv_engine + .get_oldest_snapshot_sequence_number() + .is_none() + ); + } + + #[test] + fn test_stale_read_notify() { + let store_id = 2; + let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); + let (_tmp, mut reader, rx) = new_reader("test-local-reader", store_id, store_meta.clone()); + reader.kv_engine.put(b"key", b"value").unwrap(); + + let epoch13 = { + let mut ep = metapb::RegionEpoch::default(); + ep.set_conf_ver(1); + ep.set_version(3); + ep + }; + let term6 = 6; + + // Register region1 + let pr_ids1 = vec![2, 3, 4]; + let prs1 = new_peers(store_id, pr_ids1.clone()); + prepare_read_delegate( + store_id, + 1, + term6, + pr_ids1, + epoch13.clone(), + store_meta.clone(), + ); + let leader1 = prs1[0].clone(); + + // Local read + let mut cmd = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_region_id(1); + header.set_peer(leader1); + header.set_region_epoch(epoch13); + header.set_term(term6); + header.set_flags(header.get_flags() | WriteBatchFlags::STALE_READ.bits()); + cmd.set_header(header.clone()); + let mut req = Request::default(); + req.set_cmd_type(CmdType::Snap); + cmd.set_requests(vec![req].into()); + + // A peer can serve read_ts < safe_ts. 
+ let safe_ts = TimeStamp::compose(2, 0); + { + let mut meta = store_meta.lock().unwrap(); + let delegate = meta.readers.get_mut(&1).unwrap(); + delegate + .read_progress + .update_safe_ts(1, safe_ts.into_inner()); + assert_eq!(delegate.read_progress.safe_ts(), safe_ts.into_inner()); + } + let read_ts_1 = TimeStamp::compose(1, 0); + let mut data = [0u8; 8]; + (&mut data[..]).encode_u64(read_ts_1.into_inner()).unwrap(); + header.set_flag_data(data.into()); + cmd.set_header(header.clone()); + let (snap_tx, snap_rx) = channel(); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse| { + snap_tx.send(resp).unwrap(); + })), + ); + must_not_redirect(&mut reader, &rx, task); + snap_rx.recv().unwrap().snapshot.unwrap(); + + // A peer has to notify advancing resolved ts if read_ts >= safe_ts. + let notify = Arc::new(tokio::sync::Notify::new()); + { + let mut meta = store_meta.lock().unwrap(); + let delegate = meta.readers.get_mut(&1).unwrap(); + delegate + .read_progress + .update_advance_resolved_ts_notify(notify.clone()); + } + // 201ms larger than safe_ts. + let read_ts_2 = TimeStamp::compose(safe_ts.physical() + 201, 0); + let mut data = [0u8; 8]; + (&mut data[..]).encode_u64(read_ts_2.into_inner()).unwrap(); + header.set_flag_data(data.into()); + cmd.set_header(header.clone()); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |_: ReadResponse| {})), + ); + let (notify_tx, notify_rx) = channel(); + let (wait_spawn_tx, wait_spawn_rx) = channel(); + let runtime = tokio::runtime::Runtime::new().unwrap(); + let _ = runtime.spawn(async move { + wait_spawn_tx.send(()).unwrap(); + notify.notified().await; + notify_tx.send(()).unwrap(); + }); + wait_spawn_rx.recv().unwrap(); + thread::sleep(std::time::Duration::from_millis(500)); // Prevent lost notify. 
+ must_not_redirect(&mut reader, &rx, task); + notify_rx.recv().unwrap(); + } } diff --git a/components/raftstore/src/store/worker/refresh_config.rs b/components/raftstore/src/store/worker/refresh_config.rs index 4ad92d5db68..6fcbd6a93e7 100644 --- a/components/raftstore/src/store/worker/refresh_config.rs +++ b/components/raftstore/src/store/worker/refresh_config.rs @@ -6,13 +6,20 @@ use std::{ }; use batch_system::{BatchRouter, Fsm, FsmTypes, HandlerBuilder, Poller, PoolState, Priority}; -use file_system::{set_io_type, IOType}; -use tikv_util::{debug, error, info, safe_panic, thd_name, worker::Runnable}; +use file_system::{set_io_type, IoType}; +use tikv_util::{ + debug, error, info, safe_panic, sys::thread::StdThreadBuildWrapper, thd_name, worker::Runnable, +}; -use crate::store::fsm::{ - apply::{ApplyFsm, ControlFsm}, - store::StoreFsm, - PeerFsm, +use crate::store::{ + async_io::write::{StoreWriters, StoreWritersContext}, + fsm::{ + apply::{ApplyFsm, ControlFsm}, + store::{RaftRouter, StoreFsm}, + PeerFsm, + }, + transport::Transport, + PersistedNotifier, }; pub struct PoolController> { @@ -39,9 +46,9 @@ where { pub fn decrease_by(&mut self, size: usize) { for _ in 0..size { - if let Err(e) = self.state.fsm_sender.send(FsmTypes::Empty) { + if let Err(e) = self.state.fsm_sender.send(FsmTypes::Empty, None) { error!( - "failed to decrese thread pool"; + "failed to decrease thread pool"; "decrease to" => size, "err" => %e, ); @@ -70,9 +77,9 @@ where name_prefix, i + self.state.id_base, ))) - .spawn(move || { + .spawn_wrapper(move || { tikv_util::thread_group::set_properties(props); - set_io_type(IOType::ForegroundWrite); + set_io_type(IoType::ForegroundWrite); poller.poll(); }) .unwrap(); @@ -108,6 +115,38 @@ where } } +struct WriterContoller +where + EK: engine_traits::KvEngine, + ER: engine_traits::RaftEngine, + T: Transport + 'static, + N: PersistedNotifier, +{ + writer_meta: StoreWritersContext, + store_writers: StoreWriters, + expected_writers_size: usize, +} 
+ +impl WriterContoller +where + EK: engine_traits::KvEngine, + ER: engine_traits::RaftEngine, + T: Transport + 'static, + N: PersistedNotifier, +{ + pub fn new( + writer_meta: StoreWritersContext, + store_writers: StoreWriters, + ) -> Self { + let writers_size = store_writers.size(); + Self { + writer_meta, + store_writers, + expected_writers_size: writers_size, + } + } +} + #[derive(Debug, Clone, Copy)] pub enum BatchComponent { Store, @@ -131,6 +170,7 @@ impl Display for BatchComponent { pub enum Task { ScalePool(BatchComponent, usize), ScaleBatchSize(BatchComponent, usize), + ScaleWriters(usize), } impl Display for Task { @@ -142,38 +182,48 @@ impl Display for Task { Task::ScaleBatchSize(component, size) => { write!(f, "Scale max_batch_size adjusts {}: {} ", component, size) } + Task::ScaleWriters(size) => { + write!(f, "Scale store_io_pool_size adjusts {} ", size) + } } } } -pub struct Runner +pub struct Runner where EK: engine_traits::KvEngine, ER: engine_traits::RaftEngine, AH: HandlerBuilder, ControlFsm>, RH: HandlerBuilder, StoreFsm>, + T: Transport + 'static, { + writer_ctrl: WriterContoller>, apply_pool: PoolController, ControlFsm, AH>, raft_pool: PoolController, StoreFsm, RH>, } -impl Runner +impl Runner where EK: engine_traits::KvEngine, ER: engine_traits::RaftEngine, AH: HandlerBuilder, ControlFsm>, RH: HandlerBuilder, StoreFsm>, + T: Transport + 'static, { pub fn new( + writer_meta: StoreWritersContext>, + store_writers: StoreWriters, apply_router: BatchRouter, ControlFsm>, raft_router: BatchRouter, StoreFsm>, apply_pool_state: PoolState, ControlFsm, AH>, raft_pool_state: PoolState, StoreFsm, RH>, ) -> Self { + let writer_ctrl = WriterContoller::new(writer_meta, store_writers); let apply_pool = PoolController::new(apply_router, apply_pool_state); let raft_pool = PoolController::new(raft_router, raft_pool_state); Runner { + writer_ctrl, apply_pool, raft_pool, } @@ -185,7 +235,7 @@ where match current_pool_size.cmp(&size) { std::cmp::Ordering::Greater 
=> self.raft_pool.decrease_by(current_pool_size - size), std::cmp::Ordering::Less => self.raft_pool.increase_by(size - current_pool_size), - std::cmp::Ordering::Equal => (), + std::cmp::Ordering::Equal => return, } self.raft_pool.cleanup_poller_threads(); info!( @@ -201,7 +251,7 @@ where match current_pool_size.cmp(&size) { std::cmp::Ordering::Greater => self.apply_pool.decrease_by(current_pool_size - size), std::cmp::Ordering::Less => self.apply_pool.increase_by(size - current_pool_size), - std::cmp::Ordering::Equal => (), + std::cmp::Ordering::Equal => return, } self.apply_pool.cleanup_poller_threads(); info!( @@ -210,14 +260,47 @@ where "to" => self.apply_pool.state.expected_pool_size ); } + + /// Resizes the count of background threads in store_writers. + fn resize_store_writers(&mut self, size: usize) { + // The resizing of store writers will not directly update the local cached + // store writers in each poller. Each poller will timely correct its local + // cached in its next `poller.begin()` after the resize operation completed. 
+ let current_size = self.writer_ctrl.expected_writers_size; + self.writer_ctrl.expected_writers_size = size; + match current_size.cmp(&size) { + std::cmp::Ordering::Greater => { + if let Err(e) = self.writer_ctrl.store_writers.decrease_to(size) { + error!("failed to decrease store writers size"; "err_msg" => ?e); + } + } + std::cmp::Ordering::Less => { + let writer_meta = self.writer_ctrl.writer_meta.clone(); + if let Err(e) = self + .writer_ctrl + .store_writers + .increase_to(size, writer_meta) + { + error!("failed to increase store writers size"; "err_msg" => ?e); + } + } + std::cmp::Ordering::Equal => return, + } + info!( + "resize store writers pool"; + "from" => current_size, + "to" => size + ); + } } -impl Runnable for Runner +impl Runnable for Runner where EK: engine_traits::KvEngine, ER: engine_traits::RaftEngine, AH: HandlerBuilder, ControlFsm> + std::marker::Send, RH: HandlerBuilder, StoreFsm> + std::marker::Send, + T: Transport + 'static, { type Task = Task; @@ -235,6 +318,7 @@ where self.apply_pool.state.max_batch_size = size; } }, + Task::ScaleWriters(size) => self.resize_store_writers(size), } } } diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index 0ac92103129..0696e70b766 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -4,7 +4,7 @@ use std::{ collections::{ BTreeMap, Bound::{Excluded, Included, Unbounded}, - HashMap, VecDeque, + VecDeque, }, fmt::{self, Display, Formatter}, sync::{ @@ -16,15 +16,18 @@ use std::{ u64, }; +use collections::HashMap; use engine_traits::{DeleteStrategy, KvEngine, Mutable, Range, WriteBatch, CF_LOCK, CF_RAFT}; use fail::fail_point; -use file_system::{IOType, WithIOType}; +use file_system::{IoType, WithIoType}; use kvproto::raft_serverpb::{PeerState, RaftApplyState, RegionLocalState}; use pd_client::PdClient; use raft::eraftpb::Snapshot as RaftSnapshot; use tikv_util::{ - box_err, box_try, 
defer, error, info, thd_name, - time::Instant, + box_err, box_try, + config::VersionTrack, + defer, error, info, thd_name, + time::{Instant, UnixSecs}, warn, worker::{Runnable, RunnableWithTimer}, }; @@ -44,24 +47,10 @@ use crate::{ }, snap::{plain_file_used, Error, Result, SNAPSHOT_CFS}, transport::CasualRouter, - ApplyOptions, CasualMessage, SnapEntry, SnapKey, SnapManager, + ApplyOptions, CasualMessage, Config, SnapEntry, SnapKey, SnapManager, }, }; -// used to periodically check whether we should delete a stale peer's range in region runner - -#[cfg(test)] -pub const STALE_PEER_CHECK_TICK: usize = 1; // 1000 milliseconds - -#[cfg(not(test))] -pub const STALE_PEER_CHECK_TICK: usize = 10; // 10000 milliseconds - -// used to periodically check whether schedule pending applies in region runner -#[cfg(not(test))] -pub const PENDING_APPLY_CHECK_INTERVAL: u64 = 1_000; // 1000 milliseconds -#[cfg(test)] -pub const PENDING_APPLY_CHECK_INTERVAL: u64 = 200; // 200 milliseconds - const CLEANUP_MAX_REGION_COUNT: usize = 64; const TIFLASH: &str = "tiflash"; @@ -72,7 +61,7 @@ const ENGINE: &str = "engine"; pub enum Task { Gen { region_id: u64, - last_applied_index_term: u64, + last_applied_term: u64, last_applied_state: RaftApplyState, kv_snap: S, canceled: Arc, @@ -83,10 +72,12 @@ pub enum Task { Apply { region_id: u64, status: Arc, + peer_id: u64, }, /// Destroy data between [start_key, end_key). /// - /// The deletion may and may not succeed. + /// The actual deletion may be delayed if the engine is overloaded or a + /// reader is still referencing the data. Destroy { region_id: u64, start_key: Vec, @@ -131,13 +122,14 @@ struct StalePeerInfo { pub region_id: u64, pub end_key: Vec, // Once the oldest snapshot sequence exceeds this, it ensures that no one is - // reading on this peer anymore. So we can safely call `delete_files_in_range` - // , which may break the consistency of snapshot, of this peer range. + // reading on this peer anymore. 
So we can safely call `delete_files_in_range`, + // which may break the consistency of snapshot, of this peer range. pub stale_sequence: u64, } /// A structure records all ranges to be deleted with some delay. -/// The delay is because there may be some coprocessor requests related to these ranges. +/// The delay is because there may be some coprocessor requests related to these +/// ranges. #[derive(Clone, Default)] struct PendingDeleteRanges { ranges: BTreeMap, StalePeerInfo>, // start_key -> StalePeerInfo @@ -202,22 +194,29 @@ impl PendingDeleteRanges { /// Inserts a new range waiting to be deleted. /// - /// Before an insert is called, it must call drain_overlap_ranges to clean the overlapping range. - fn insert(&mut self, region_id: u64, start_key: &[u8], end_key: &[u8], stale_sequence: u64) { - if !self.find_overlap_ranges(start_key, end_key).is_empty() { + /// Before an insert is called, it must call drain_overlap_ranges to clean + /// the overlapping range. + fn insert( + &mut self, + region_id: u64, + start_key: Vec, + end_key: Vec, + stale_sequence: u64, + ) { + if !self.find_overlap_ranges(&start_key, &end_key).is_empty() { panic!( "[region {}] register deleting data in [{}, {}) failed due to overlap", region_id, - log_wrappers::Value::key(start_key), - log_wrappers::Value::key(end_key), + log_wrappers::Value::key(&start_key), + log_wrappers::Value::key(&end_key), ); } let info = StalePeerInfo { region_id, - end_key: end_key.to_owned(), + end_key, stale_sequence, }; - self.ranges.insert(start_key.to_owned(), info); + self.ranges.insert(start_key, info); } /// Gets all stale ranges info. 
@@ -239,21 +238,14 @@ impl PendingDeleteRanges { } } -#[derive(Clone)] -struct SnapContext -where - EK: KvEngine, -{ +struct SnapGenContext { engine: EK, - batch_size: usize, mgr: SnapManager, - use_delete_range: bool, - pending_delete_ranges: PendingDeleteRanges, - coprocessor_host: CoprocessorHost, router: R, + start: UnixSecs, } -impl SnapContext +impl SnapGenContext where EK: KvEngine, R: CasualRouter, @@ -262,7 +254,7 @@ where fn generate_snap( &self, region_id: u64, - last_applied_index_term: u64, + last_applied_term: u64, last_applied_state: RaftApplyState, kv_snap: EK::Snapshot, notifier: SyncSender, @@ -275,10 +267,11 @@ where &self.engine, kv_snap, region_id, - last_applied_index_term, + last_applied_term, last_applied_state, for_balance, allow_multi_files_snapshot, + self.start )); // Only enable the fail point when the region id is equal to 1, which is // the id of bootstrapped region in tests. @@ -290,18 +283,20 @@ where "err" => %e, ); } - // The error can be ignored as snapshot will be sent in next heartbeat in the end. + // The error can be ignored as snapshot will be sent in next heartbeat in the + // end. let _ = self .router .send(region_id, CasualMessage::SnapshotGenerated); Ok(()) } - /// Handles the task of generating snapshot of the Region. It calls `generate_snap` to do the actual work. + /// Handles the task of generating snapshot of the Region. It calls + /// `generate_snap` to do the actual work. 
fn handle_gen( &self, region_id: u64, - last_applied_index_term: u64, + last_applied_term: u64, last_applied_state: RaftApplyState, kv_snap: EK::Snapshot, canceled: Arc, @@ -310,22 +305,23 @@ where allow_multi_files_snapshot: bool, ) { fail_point!("before_region_gen_snap", |_| ()); - SNAP_COUNTER.generate.all.inc(); + SNAP_COUNTER.generate.start.inc(); if canceled.load(Ordering::Relaxed) { info!("generate snap is canceled"; "region_id" => region_id); + SNAP_COUNTER.generate.abort.inc(); return; } let start = Instant::now(); - let _io_type_guard = WithIOType::new(if for_balance { - IOType::LoadBalance + let _io_type_guard = WithIoType::new(if for_balance { + IoType::LoadBalance } else { - IOType::Replication + IoType::Replication }); if let Err(e) = self.generate_snap( region_id, - last_applied_index_term, + last_applied_term, last_applied_state, kv_snap, notifier, @@ -333,6 +329,7 @@ where allow_multi_files_snapshot, ) { error!(%e; "failed to generate snap!!!"; "region_id" => region_id,); + SNAP_COUNTER.generate.fail.inc(); return; } @@ -341,14 +338,80 @@ where .generate .observe(start.saturating_elapsed_secs()); } +} - /// Applies snapshot data of the Region. - fn apply_snap(&mut self, region_id: u64, abort: Arc) -> Result<()> { - info!("begin apply snap data"; "region_id" => region_id); - fail_point!("region_apply_snap", |_| { Ok(()) }); - check_abort(&abort)?; +pub struct Runner +where + EK: KvEngine, + T: PdClient + 'static, +{ + batch_size: usize, + use_delete_range: bool, + clean_stale_tick: usize, + clean_stale_check_interval: Duration, + clean_stale_ranges_tick: usize, + + tiflash_stores: HashMap, + // we may delay some apply tasks if level 0 files to write stall threshold, + // pending_applies records all delayed apply task, and will check again later + pending_applies: VecDeque>, + // Ranges that have been logically destroyed at a specific sequence number. We can + // assume there will be no reader (engine snapshot) newer than that sequence number. 
Therefore, + // they can be physically deleted with `DeleteFiles` when we're sure there is no older + // reader as well. + // To protect this assumption, before a new snapshot is applied, the overlapping pending ranges + // must first be removed. + // The sole purpose of maintaining this list is to optimize deletion with `DeleteFiles` + // whenever we can. Errors while processing them can be ignored. + pending_delete_ranges: PendingDeleteRanges, + + engine: EK, + mgr: SnapManager, + coprocessor_host: CoprocessorHost, + router: R, + pd_client: Option>, + pool: ThreadPool, +} + +impl Runner +where + EK: KvEngine, + R: CasualRouter, + T: PdClient + 'static, +{ + pub fn new( + engine: EK, + mgr: SnapManager, + cfg: Arc>, + coprocessor_host: CoprocessorHost, + router: R, + pd_client: Option>, + ) -> Runner { + Runner { + batch_size: cfg.value().snap_apply_batch_size.0 as usize, + use_delete_range: cfg.value().use_delete_range, + clean_stale_tick: 0, + clean_stale_check_interval: Duration::from_millis( + cfg.value().region_worker_tick_interval.as_millis(), + ), + clean_stale_ranges_tick: cfg.value().clean_stale_ranges_tick, + tiflash_stores: HashMap::default(), + pending_applies: VecDeque::new(), + pending_delete_ranges: PendingDeleteRanges::default(), + engine, + mgr, + coprocessor_host, + router, + pd_client, + pool: Builder::new(thd_name!("snap-generator")) + .max_thread_count(cfg.value().snap_generator_pool_size) + .build_future_pool(), + } + } + + fn region_state(&self, region_id: u64) -> Result { let region_key = keys::region_state_key(region_id); - let mut region_state: RegionLocalState = + let region_state: RegionLocalState = match box_try!(self.engine.get_msg_cf(CF_RAFT, ®ion_key)) { Some(state) => state, None => { @@ -358,36 +421,41 @@ where )); } }; + Ok(region_state) + } - // clear up origin data. 
- let region = region_state.get_region().clone(); - let start_key = keys::enc_start_key(®ion); - let end_key = keys::enc_end_key(®ion); - check_abort(&abort)?; - let overlap_ranges = self - .pending_delete_ranges - .drain_overlap_ranges(&start_key, &end_key); - if !overlap_ranges.is_empty() { - CLEAN_COUNTER_VEC - .with_label_values(&["overlap-with-apply"]) - .inc(); - self.cleanup_overlap_regions(overlap_ranges)?; - } - self.delete_all_in_range(&[Range::new(&start_key, &end_key)])?; - check_abort(&abort)?; - fail_point!("apply_snap_cleanup_range"); - + fn apply_state(&self, region_id: u64) -> Result { let state_key = keys::apply_state_key(region_id); let apply_state: RaftApplyState = match box_try!(self.engine.get_msg_cf(CF_RAFT, &state_key)) { Some(state) => state, None => { return Err(box_err!( - "failed to get raftstate from {}", + "failed to get apply_state from {}", log_wrappers::Value::key(&state_key) )); } }; + Ok(apply_state) + } + + /// Applies snapshot data of the Region. + fn apply_snap(&mut self, region_id: u64, peer_id: u64, abort: Arc) -> Result<()> { + info!("begin apply snap data"; "region_id" => region_id, "peer_id" => peer_id); + fail_point!("region_apply_snap", |_| { Ok(()) }); + check_abort(&abort)?; + + let mut region_state = self.region_state(region_id)?; + let region = region_state.get_region().clone(); + let start_key = keys::enc_start_key(®ion); + let end_key = keys::enc_end_key(®ion); + check_abort(&abort)?; + self.clean_overlap_ranges(start_key, end_key)?; + check_abort(&abort)?; + fail_point!("apply_snap_cleanup_range"); + + // apply snapshot + let apply_state = self.apply_state(region_id)?; let term = apply_state.get_truncated_state().get_term(); let idx = apply_state.get_truncated_state().get_index(); let snap_key = SnapKey::new(region_id, term, idx); @@ -403,16 +471,19 @@ where let timer = Instant::now(); let options = ApplyOptions { db: self.engine.clone(), - region, + region: region.clone(), abort: Arc::clone(&abort), 
write_batch_size: self.batch_size, coprocessor_host: self.coprocessor_host.clone(), }; s.apply(options)?; + self.coprocessor_host + .post_apply_snapshot(®ion, peer_id, &snap_key, Some(&s)); + // delete snapshot state. let mut wb = self.engine.write_batch(); region_state.set_state(PeerState::Normal); - box_try!(wb.put_msg_cf(CF_RAFT, ®ion_key, ®ion_state)); + box_try!(wb.put_msg_cf(CF_RAFT, &keys::region_state_key(region_id), ®ion_state)); box_try!(wb.delete_cf(CF_RAFT, &keys::snapshot_raft_state_key(region_id))); wb.write().unwrap_or_else(|e| { panic!("{} failed to save apply_snap result: {:?}", region_id, e); @@ -425,20 +496,20 @@ where Ok(()) } - /// Tries to apply the snapshot of the specified Region. It calls `apply_snap` to do the actual work. - fn handle_apply(&mut self, region_id: u64, status: Arc) { + /// Tries to apply the snapshot of the specified Region. It calls + /// `apply_snap` to do the actual work. + fn handle_apply(&mut self, region_id: u64, peer_id: u64, status: Arc) { let _ = status.compare_exchange( JOB_STATUS_PENDING, JOB_STATUS_RUNNING, Ordering::SeqCst, Ordering::SeqCst, ); - SNAP_COUNTER.apply.all.inc(); - // let apply_histogram = SNAP_HISTOGRAM.with_label_values(&["apply"]); - // let timer = apply_histogram.start_coarse_timer(); + SNAP_COUNTER.apply.start.inc(); + let start = Instant::now(); - match self.apply_snap(region_id, Arc::clone(&status)) { + match self.apply_snap(region_id, peer_id, Arc::clone(&status)) { Ok(()) => { status.swap(JOB_STATUS_FINISHED, Ordering::SeqCst); SNAP_COUNTER.apply.success.inc(); @@ -464,79 +535,77 @@ where let _ = self.router.send(region_id, CasualMessage::SnapshotApplied); } - /// Cleans up the data within the range. 
- fn cleanup_range(&self, ranges: &[Range<'_>]) -> Result<()> { - self.engine - .delete_all_in_range(DeleteStrategy::DeleteFiles, ranges) - .unwrap_or_else(|e| { - error!("failed to delete files in range"; "err" => %e); - }); - self.delete_all_in_range(ranges)?; - self.engine - .delete_all_in_range(DeleteStrategy::DeleteBlobs, ranges) - .unwrap_or_else(|e| { - error!("failed to delete files in range"; "err" => %e); - }); - Ok(()) - } - - /// Gets the overlapping ranges and cleans them up. - fn cleanup_overlap_regions( + /// Tries to clean up files in pending ranges overlapping with the given + /// bounds. These pending ranges will be removed. Returns an updated range + /// that also includes these ranges. Caller must ensure the remaining keys + /// in the returning range will be deleted properly. + fn clean_overlap_ranges_roughly( &mut self, - overlap_ranges: Vec<(u64, Vec, Vec, u64)>, - ) -> Result<()> { + mut start_key: Vec, + mut end_key: Vec, + ) -> (Vec, Vec) { + let overlap_ranges = self + .pending_delete_ranges + .drain_overlap_ranges(&start_key, &end_key); + if overlap_ranges.is_empty() { + return (start_key, end_key); + } + CLEAN_COUNTER_VEC.with_label_values(&["overlap"]).inc(); let oldest_sequence = self .engine .get_oldest_snapshot_sequence_number() .unwrap_or(u64::MAX); - let mut ranges = Vec::with_capacity(overlap_ranges.len()); - let mut df_ranges = Vec::with_capacity(overlap_ranges.len()); - for (region_id, start_key, end_key, stale_sequence) in overlap_ranges.iter() { - // `DeleteFiles` may break current rocksdb snapshots consistency, - // so do not use it unless we can make sure there is no reader of the destroyed peer anymore. 
- if *stale_sequence < oldest_sequence { - df_ranges.push(Range::new(start_key, end_key)); - } else { - SNAP_COUNTER_VEC - .with_label_values(&["overlap", "not_delete_files"]) - .inc(); - } - info!("delete data in range because of overlap"; "region_id" => region_id, - "start_key" => log_wrappers::Value::key(start_key), - "end_key" => log_wrappers::Value::key(end_key)); - ranges.push(Range::new(start_key, end_key)); - } + let df_ranges: Vec<_> = overlap_ranges + .iter() + .filter_map(|(region_id, cur_start, cur_end, stale_sequence)| { + info!( + "delete data in range because of overlap"; "region_id" => region_id, + "start_key" => log_wrappers::Value::key(cur_start), + "end_key" => log_wrappers::Value::key(cur_end) + ); + if &start_key > cur_start { + start_key = cur_start.clone(); + } + if &end_key < cur_end { + end_key = cur_end.clone(); + } + if *stale_sequence < oldest_sequence { + Some(Range::new(cur_start, cur_end)) + } else { + SNAP_COUNTER_VEC + .with_label_values(&["overlap", "not_delete_files"]) + .inc(); + None + } + }) + .collect(); self.engine - .delete_all_in_range(DeleteStrategy::DeleteFiles, &df_ranges) + .delete_ranges_cfs(DeleteStrategy::DeleteFiles, &df_ranges) .unwrap_or_else(|e| { error!("failed to delete files in range"; "err" => %e); }); + (start_key, end_key) + } - self.delete_all_in_range(&ranges) + /// Cleans up data in the given range and all pending ranges overlapping + /// with it. + fn clean_overlap_ranges(&mut self, start_key: Vec, end_key: Vec) -> Result<()> { + let (start_key, end_key) = self.clean_overlap_ranges_roughly(start_key, end_key); + self.delete_all_in_range(&[Range::new(&start_key, &end_key)]) } /// Inserts a new pending range, and it will be cleaned up with some delay. 
- fn insert_pending_delete_range(&mut self, region_id: u64, start_key: &[u8], end_key: &[u8]) { - let overlap_ranges = self - .pending_delete_ranges - .drain_overlap_ranges(start_key, end_key); - if !overlap_ranges.is_empty() { - CLEAN_COUNTER_VEC - .with_label_values(&["overlap-with-destroy"]) - .inc(); - if let Err(e) = self.cleanup_overlap_regions(overlap_ranges) { - warn!("cleanup_overlap_ranges failed"; - "region_id" => region_id, - "start_key" => log_wrappers::Value::key(start_key), - "end_key" => log_wrappers::Value::key(end_key), - "err" => %e, - ); - } - } + fn insert_pending_delete_range( + &mut self, + region_id: u64, + start_key: Vec, + end_key: Vec, + ) { + let (start_key, end_key) = self.clean_overlap_ranges_roughly(start_key, end_key); info!("register deleting data in range"; "region_id" => region_id, - "start_key" => log_wrappers::Value::key(start_key), - "end_key" => log_wrappers::Value::key(end_key), + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), ); let seq = self.engine.get_latest_sequence_number(); self.pending_delete_ranges @@ -553,33 +622,43 @@ where .engine .get_oldest_snapshot_sequence_number() .unwrap_or(u64::MAX); - let mut cleanup_ranges: Vec<(u64, Vec, Vec)> = self + let mut region_ranges: Vec<(u64, Vec, Vec)> = self .pending_delete_ranges .stale_ranges(oldest_sequence) .map(|(region_id, s, e)| (region_id, s.to_vec(), e.to_vec())) .collect(); - if cleanup_ranges.is_empty() { + if region_ranges.is_empty() { return; } CLEAN_COUNTER_VEC.with_label_values(&["destroy"]).inc_by(1); - cleanup_ranges.sort_by(|a, b| a.1.cmp(&b.1)); - while cleanup_ranges.len() > CLEANUP_MAX_REGION_COUNT { - cleanup_ranges.pop(); - } - let ranges: Vec> = cleanup_ranges + region_ranges.sort_by(|a, b| a.1.cmp(&b.1)); + region_ranges.truncate(CLEANUP_MAX_REGION_COUNT); + let ranges: Vec<_> = region_ranges .iter() .map(|(region_id, start, end)| { info!("delete data in range because of stale"; "region_id" => 
region_id, - "start_key" => log_wrappers::Value::key(start), - "end_key" => log_wrappers::Value::key(end)); + "start_key" => log_wrappers::Value::key(start), + "end_key" => log_wrappers::Value::key(end)); Range::new(start, end) }) .collect(); - if let Err(e) = self.cleanup_range(&ranges) { + + self.engine + .delete_ranges_cfs(DeleteStrategy::DeleteFiles, &ranges) + .unwrap_or_else(|e| { + error!("failed to delete files in range"; "err" => %e); + }); + if let Err(e) = self.delete_all_in_range(&ranges) { error!("failed to cleanup stale range"; "err" => %e); return; } - for (_, key, _) in cleanup_ranges { + self.engine + .delete_ranges_cfs(DeleteStrategy::DeleteBlobs, &ranges) + .unwrap_or_else(|e| { + error!("failed to delete blobs in range"; "err" => %e); + }); + + for (_, key, _) in region_ranges { assert!( self.pending_delete_ranges.remove(&key).is_some(), "cleanup pending_delete_ranges {} should exist", @@ -588,8 +667,8 @@ where } } - /// Checks the number of files at level 0 to avoid write stall after ingesting sst. - /// Returns true if the ingestion causes write stall. + /// Checks the number of files at level 0 to avoid write stall after + /// ingesting sst. Returns true if the ingestion causes write stall. fn ingest_maybe_stall(&self) -> bool { for cf in SNAPSHOT_CFS { // no need to check lock cf @@ -620,72 +699,76 @@ where Ok(()) } -} -pub struct Runner -where - EK: KvEngine, - T: PdClient + 'static, -{ - pool: ThreadPool, - ctx: SnapContext, - // we may delay some apply tasks if level 0 files to write stall threshold, - // pending_applies records all delayed apply task, and will check again later - pending_applies: VecDeque>, - clean_stale_tick: usize, - clean_stale_check_interval: Duration, - tiflash_stores: HashMap, - pd_client: Option>, -} + /// Calls observer `pre_apply_snapshot` for every task. + /// Multiple task can be `pre_apply_snapshot` at the same time. 
+ fn pre_apply_snapshot(&self, task: &Task) -> Result<()> { + let (region_id, abort, peer_id) = match task { + Task::Apply { + region_id, + status, + peer_id, + } => (region_id, status.clone(), peer_id), + _ => panic!("invalid apply snapshot task"), + }; -impl Runner -where - EK: KvEngine, - R: CasualRouter, - T: PdClient + 'static, -{ - pub fn new( - engine: EK, - mgr: SnapManager, - batch_size: usize, - use_delete_range: bool, - snap_generator_pool_size: usize, - coprocessor_host: CoprocessorHost, - router: R, - pd_client: Option>, - ) -> Runner { - Runner { - pool: Builder::new(thd_name!("snap-generator")) - .max_thread_count(snap_generator_pool_size) - .build_future_pool(), - ctx: SnapContext { - engine, - mgr, - batch_size, - use_delete_range, - pending_delete_ranges: PendingDeleteRanges::default(), - coprocessor_host, - router, - }, - pending_applies: VecDeque::new(), - clean_stale_tick: 0, - clean_stale_check_interval: Duration::from_millis(PENDING_APPLY_CHECK_INTERVAL), - tiflash_stores: HashMap::default(), - pd_client, + let region_state = self.region_state(*region_id)?; + let apply_state = self.apply_state(*region_id)?; + + check_abort(&abort)?; + + let term = apply_state.get_truncated_state().get_term(); + let idx = apply_state.get_truncated_state().get_index(); + let snap_key = SnapKey::new(*region_id, term, idx); + let s = box_try!(self.mgr.get_snapshot_for_applying(&snap_key)); + if !s.exists() { + self.coprocessor_host.pre_apply_snapshot( + region_state.get_region(), + *peer_id, + &snap_key, + None, + ); + return Err(box_err!("missing snapshot file {}", s.path())); } + check_abort(&abort)?; + self.coprocessor_host.pre_apply_snapshot( + region_state.get_region(), + *peer_id, + &snap_key, + Some(&s), + ); + Ok(()) } /// Tries to apply pending tasks if there is some. 
- fn handle_pending_applies(&mut self) { + fn handle_pending_applies(&mut self, is_timeout: bool) { fail_point!("apply_pending_snapshot", |_| {}); + let mut new_batch = true; while !self.pending_applies.is_empty() { - // should not handle too many applies than the number of files that can be ingested. - // check level 0 every time because we can not make sure how does the number of level 0 files change. - if self.ctx.ingest_maybe_stall() { + // should not handle too many applies than the number of files that can be + // ingested. check level 0 every time because we can not make sure + // how does the number of level 0 files change. + if self.ingest_maybe_stall() { break; } - if let Some(Task::Apply { region_id, status }) = self.pending_applies.pop_front() { - self.ctx.handle_apply(region_id, status); + if let Some(Task::Apply { region_id, .. }) = self.pending_applies.front() { + fail_point!("handle_new_pending_applies", |_| {}); + if !self + .engine + .can_apply_snapshot(is_timeout, new_batch, *region_id) + { + // KvEngine can't apply snapshot for other reasons. + break; + } + if let Some(Task::Apply { + region_id, + status, + peer_id, + }) = self.pending_applies.pop_front() + { + new_batch = false; + self.handle_apply(region_id, peer_id, status); + } } } } @@ -703,7 +786,7 @@ where match task { Task::Gen { region_id, - last_applied_index_term, + last_applied_term, last_applied_state, kv_snap, canceled, @@ -713,7 +796,6 @@ where } => { // It is safe for now to handle generating and applying snapshot concurrently, // but it may not when merge is implemented. 
- let ctx = self.ctx.clone(); let mut allow_multi_files_snapshot = false; // if to_store_id is 0, it means the to_store_id cannot be found if to_store_id != 0 { @@ -722,14 +804,10 @@ where } else { let is_tiflash = self.pd_client.as_ref().map_or(false, |pd_client| { if let Ok(s) = pd_client.get_store(to_store_id) { - if let Some(_l) = s.get_labels().iter().find(|l| { - l.key.to_lowercase() == ENGINE - && l.value.to_lowercase() == TIFLASH - }) { - return true; - } else { - return false; - } + return s.get_labels().iter().any(|label| { + label.get_key().to_lowercase() == ENGINE + && label.get_value().to_lowercase() == TIFLASH + }); } true }); @@ -737,12 +815,18 @@ where allow_multi_files_snapshot = !is_tiflash; } } - + SNAP_COUNTER.generate.all.inc(); + let ctx = SnapGenContext { + engine: self.engine.clone(), + mgr: self.mgr.clone(), + router: self.router.clone(), + start: UnixSecs::now(), + }; self.pool.spawn(async move { tikv_alloc::add_thread_memory_accessor(); ctx.handle_gen( region_id, - last_applied_index_term, + last_applied_term, last_applied_state, kv_snap, canceled, @@ -755,9 +839,13 @@ where } task @ Task::Apply { .. } => { fail_point!("on_region_worker_apply", true, |_| {}); + if self.coprocessor_host.should_pre_apply_snapshot() { + let _ = self.pre_apply_snapshot(&task); + } + SNAP_COUNTER.apply.all.inc(); // to makes sure applying snapshots in order. 
self.pending_applies.push_back(task); - self.handle_pending_applies(); + self.handle_pending_applies(false); if !self.pending_applies.is_empty() { // delay the apply and retry later SNAP_COUNTER.apply.delay.inc() @@ -771,9 +859,8 @@ where fail_point!("on_region_worker_destroy", true, |_| {}); // try to delay the range deletion because // there might be a coprocessor request related to this range - self.ctx - .insert_pending_delete_range(region_id, &start_key, &end_key); - self.ctx.clean_stale_ranges(); + self.insert_pending_delete_range(region_id, start_key, end_key); + self.clean_stale_ranges(); } } } @@ -790,10 +877,10 @@ where T: PdClient + 'static, { fn on_timeout(&mut self) { - self.handle_pending_applies(); + self.handle_pending_applies(true); self.clean_stale_tick += 1; - if self.clean_stale_tick >= STALE_PEER_CHECK_TICK { - self.ctx.clean_stale_ranges(); + if self.clean_stale_tick >= self.clean_stale_ranges_tick { + self.clean_stale_ranges(); self.clean_stale_tick = 0; } } @@ -804,7 +891,7 @@ where } #[cfg(test)] -mod tests { +pub(crate) mod tests { use std::{ io, sync::{atomic::AtomicUsize, mpsc, Arc}, @@ -813,28 +900,48 @@ mod tests { }; use engine_test::{ - ctor::{CFOptions, ColumnFamilyOptions}, + ctor::CfOptions, kv::{KvTestEngine, KvTestSnapshot}, }; use engine_traits::{ CompactExt, FlowControlFactorsExt, KvEngine, MiscExt, Mutable, Peekable, - RaftEngineReadOnly, SyncMutable, WriteBatch, WriteBatchExt, CF_DEFAULT, + RaftEngineReadOnly, SyncMutable, WriteBatch, WriteBatchExt, CF_DEFAULT, CF_WRITE, }; use keys::data_key; - use kvproto::raft_serverpb::{PeerState, RaftApplyState, RegionLocalState}; + use kvproto::raft_serverpb::{PeerState, RaftApplyState, RaftSnapshotData, RegionLocalState}; use pd_client::RpcClient; + use protobuf::Message; use tempfile::Builder; - use tikv_util::worker::{LazyWorker, Worker}; + use tikv_util::{ + config::{ReadableDuration, ReadableSize}, + worker::{LazyWorker, Worker}, + }; use super::*; use crate::{ - 
coprocessor::CoprocessorHost, + coprocessor::{ + ApplySnapshotObserver, BoxApplySnapshotObserver, Coprocessor, CoprocessorHost, + ObserverContext, + }, store::{ peer_storage::JOB_STATUS_PENDING, snap::tests::get_test_db_for_regions, worker::RegionRunner, CasualMessage, SnapKey, SnapManager, }, }; + const PENDING_APPLY_CHECK_INTERVAL: Duration = Duration::from_millis(200); + const STALE_PEER_CHECK_TICK: usize = 1; + + pub fn make_raftstore_cfg(use_delete_range: bool) -> Arc> { + let mut store_cfg = Config::default(); + store_cfg.snap_apply_batch_size = ReadableSize(0); + store_cfg.region_worker_tick_interval = ReadableDuration(PENDING_APPLY_CHECK_INTERVAL); + store_cfg.clean_stale_ranges_tick = STALE_PEER_CHECK_TICK; + store_cfg.use_delete_range = use_delete_range; + store_cfg.snap_generator_pool_size = 2; + Arc::new(VersionTrack::new(store_cfg)) + } + fn insert_range( pending_delete_ranges: &mut PendingDeleteRanges, id: u64, @@ -842,7 +949,12 @@ mod tests { e: &str, stale_sequence: u64, ) { - pending_delete_ranges.insert(id, s.as_bytes(), e.as_bytes(), stale_sequence); + pending_delete_ranges.insert( + id, + s.as_bytes().to_owned(), + e.as_bytes().to_owned(), + stale_sequence, + ); } #[test] @@ -925,12 +1037,11 @@ mod tests { let mut worker: LazyWorker> = bg_worker.lazy_build("region-worker"); let sched = worker.scheduler(); let (router, _) = mpsc::sync_channel(11); + let cfg = make_raftstore_cfg(false); let mut runner = RegionRunner::new( engine.kv.clone(), mgr, - 0, - false, - 2, + cfg, CoprocessorHost::::default(), router, Option::>::None, @@ -981,15 +1092,19 @@ mod tests { .prefix("test_pending_applies") .tempdir() .unwrap(); + let obs = MockApplySnapshotObserver::default(); + let mut host = CoprocessorHost::::default(); + host.registry + .register_apply_snapshot_observer(1, BoxApplySnapshotObserver::new(obs.clone())); - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = CfOptions::new(); cf_opts.set_level_zero_slowdown_writes_trigger(5); 
cf_opts.set_disable_auto_compactions(true); let kv_cfs_opts = vec![ - CFOptions::new("default", cf_opts.clone()), - CFOptions::new("write", cf_opts.clone()), - CFOptions::new("lock", cf_opts.clone()), - CFOptions::new("raft", cf_opts.clone()), + (CF_DEFAULT, cf_opts.clone()), + (CF_WRITE, cf_opts.clone()), + (CF_LOCK, cf_opts.clone()), + (CF_RAFT, cf_opts.clone()), ]; let engine = get_test_db_for_regions( &temp_dir, @@ -1025,17 +1140,17 @@ mod tests { let snap_dir = Builder::new().prefix("snap_dir").tempdir().unwrap(); let mgr = SnapManager::new(snap_dir.path().to_str().unwrap()); + mgr.init().unwrap(); let bg_worker = Worker::new("snap-manager"); let mut worker = bg_worker.lazy_build("snap-manager"); let sched = worker.scheduler(); let (router, receiver) = mpsc::sync_channel(1); + let cfg = make_raftstore_cfg(true); let runner = RegionRunner::new( engine.kv.clone(), mgr, - 0, - true, - 2, - CoprocessorHost::::default(), + cfg, + host, router, Option::>::None, ); @@ -1055,7 +1170,7 @@ mod tests { .schedule(Task::Gen { region_id: id, kv_snap: engine.kv.snapshot(), - last_applied_index_term: entry.get_term(), + last_applied_term: entry.get_term(), last_applied_state: apply_state, canceled: Arc::new(AtomicBool::new(false)), notifier: tx, @@ -1070,11 +1185,14 @@ mod tests { } msg => panic!("expected SnapshotGenerated, but got {:?}", msg), } - let data = s1.get_data(); + let mut data = RaftSnapshotData::default(); + data.merge_from_bytes(s1.get_data()).unwrap(); let key = SnapKey::from_snap(&s1).unwrap(); let mgr = SnapManager::new(snap_dir.path().to_str().unwrap()); let mut s2 = mgr.get_snapshot_for_sending(&key).unwrap(); - let mut s3 = mgr.get_snapshot_for_receiving(&key, data).unwrap(); + let mut s3 = mgr + .get_snapshot_for_receiving(&key, data.take_meta()) + .unwrap(); io::copy(&mut s2, &mut s3).unwrap(); s3.save().unwrap(); @@ -1096,6 +1214,7 @@ mod tests { .schedule(Task::Apply { region_id: id, status, + peer_id: 1, }) .unwrap(); }; @@ -1139,6 +1258,22 @@ mod 
tests { } }; + #[allow(dead_code)] + let must_not_finish = |ids: &[u64]| { + for id in ids { + let region_key = keys::region_state_key(*id); + assert_eq!( + engine + .kv + .get_msg_cf::(CF_RAFT, ®ion_key) + .unwrap() + .unwrap() + .get_state(), + PeerState::Applying + ) + } + }; + // snapshot will not ingest cause already write stall gen_and_apply_snap(1); assert_eq!( @@ -1162,6 +1297,12 @@ mod tests { ); wait_apply_finish(&[1]); + assert_eq!(obs.pre_apply_count.load(Ordering::SeqCst), 1); + assert_eq!(obs.post_apply_count.load(Ordering::SeqCst), 1); + assert_eq!( + obs.pre_apply_hash.load(Ordering::SeqCst), + obs.post_apply_hash.load(Ordering::SeqCst) + ); // the pending apply task should be finished and snapshots are ingested. // note that when ingest sst, it may flush memtable if overlap, @@ -1208,7 +1349,7 @@ mod tests { ); gen_and_apply_snap(5); destroy_region(6); - thread::sleep(Duration::from_millis(PENDING_APPLY_CHECK_INTERVAL * 2)); + thread::sleep(PENDING_APPLY_CHECK_INTERVAL * 2); assert!(check_region_exist(6)); assert_eq!( engine @@ -1265,7 +1406,67 @@ mod tests { .unwrap(), 2 ); - thread::sleep(Duration::from_millis(PENDING_APPLY_CHECK_INTERVAL * 2)); + thread::sleep(PENDING_APPLY_CHECK_INTERVAL * 2); assert!(!check_region_exist(6)); + + #[cfg(feature = "failpoints")] + { + engine.kv.compact_files_in_range(None, None, None).unwrap(); + fail::cfg("handle_new_pending_applies", "return").unwrap(); + gen_and_apply_snap(7); + thread::sleep(PENDING_APPLY_CHECK_INTERVAL * 2); + must_not_finish(&[7]); + fail::remove("handle_new_pending_applies"); + thread::sleep(PENDING_APPLY_CHECK_INTERVAL * 2); + wait_apply_finish(&[7]); + } + bg_worker.stop(); + // Wait the timer fired. Otherwise deletion of directory may race with timer + // task. 
+ thread::sleep(PENDING_APPLY_CHECK_INTERVAL * 2); + } + + #[derive(Clone, Default)] + struct MockApplySnapshotObserver { + pub pre_apply_count: Arc, + pub post_apply_count: Arc, + pub pre_apply_hash: Arc, + pub post_apply_hash: Arc, + } + + impl Coprocessor for MockApplySnapshotObserver {} + + impl ApplySnapshotObserver for MockApplySnapshotObserver { + fn pre_apply_snapshot( + &self, + _: &mut ObserverContext<'_>, + peer_id: u64, + key: &crate::store::SnapKey, + snapshot: Option<&crate::store::Snapshot>, + ) { + let code = + snapshot.unwrap().total_size() + key.term + key.region_id + key.idx + peer_id; + self.pre_apply_count.fetch_add(1, Ordering::SeqCst); + self.pre_apply_hash + .fetch_add(code as usize, Ordering::SeqCst); + } + + fn post_apply_snapshot( + &self, + _: &mut ObserverContext<'_>, + peer_id: u64, + key: &crate::store::SnapKey, + snapshot: Option<&crate::store::Snapshot>, + ) { + let code = + snapshot.unwrap().total_size() + key.term + key.region_id + key.idx + peer_id; + self.post_apply_count.fetch_add(1, Ordering::SeqCst); + self.post_apply_hash + .fetch_add(code as usize, Ordering::SeqCst); + } + + fn should_pre_apply_snapshot(&self) -> bool { + true + } } } diff --git a/components/raftstore/src/store/worker/split_check.rs b/components/raftstore/src/store/worker/split_check.rs index ecb2d43f566..1335ed5d5e8 100644 --- a/components/raftstore/src/store/worker/split_check.rs +++ b/components/raftstore/src/store/worker/split_check.rs @@ -7,25 +7,27 @@ use std::{ mem, }; -use engine_traits::{CfName, IterOptions, Iterable, Iterator, KvEngine, CF_WRITE, LARGE_CFS}; -use file_system::{IOType, WithIOType}; -use itertools::Itertools; -use kvproto::{ - metapb::{Region, RegionEpoch}, - pdpb::CheckPolicy, +use engine_traits::{ + CfName, IterOptions, Iterable, Iterator, KvEngine, TabletRegistry, CF_WRITE, LARGE_CFS, }; +use file_system::{IoType, WithIoType}; +use itertools::Itertools; +use kvproto::{metapb::Region, pdpb::CheckPolicy}; use 
online_config::{ConfigChange, OnlineConfig}; -use tikv_util::{box_err, debug, error, info, keybuilder::KeyBuilder, warn, worker::Runnable}; +use tikv_util::{ + box_err, debug, error, info, keybuilder::KeyBuilder, warn, worker::Runnable, Either, +}; +use txn_types::Key; use super::metrics::*; #[cfg(any(test, feature = "testexport"))] use crate::coprocessor::Config; use crate::{ coprocessor::{ + dispatcher::StoreHandle, split_observer::{is_valid_split_key, strip_timestamp_if_exists}, CoprocessorHost, SplitCheckerHost, }, - store::{Callback, CasualMessage, CasualRouter}, Result, }; @@ -97,14 +99,14 @@ where Some(KeyBuilder::from_slice(end_key, 0, 0)), fill_cache, ); - let mut iter = db.iterator_cf_opt(cf, iter_opt)?; - let found: Result = iter.seek(start_key.into()).map_err(|e| box_err!(e)); + let mut iter = db.iterator_opt(cf, iter_opt)?; + let found: Result = iter.seek(start_key).map_err(|e| box_err!(e)); if found? { heap.push(KeyEntry::new( iter.key().to_vec(), pos, iter.value().len(), - *cf, + cf, )); } iters.push((*cf, iter)); @@ -130,10 +132,10 @@ where } } -#[derive(Default, Clone, Debug)] +#[derive(Default, Clone, Debug, PartialEq)] pub struct BucketRange(pub Vec, pub Vec); -#[derive(Default, Clone, Debug)] +#[derive(Default, Clone, Debug, PartialEq)] pub struct Bucket { // new proposed split keys under the bucket for split // if it does not need split, it's empty @@ -145,6 +147,8 @@ pub struct Bucket { pub enum Task { SplitCheckTask { region: Region, + start_key: Option>, + end_key: Option>, auto_split: bool, policy: CheckPolicy, bucket_ranges: Option>, @@ -164,6 +168,26 @@ impl Task { ) -> Task { Task::SplitCheckTask { region, + start_key: None, + end_key: None, + auto_split, + policy, + bucket_ranges, + } + } + + pub fn split_check_key_range( + region: Region, + start_key: Option>, + end_key: Option>, + auto_split: bool, + policy: CheckPolicy, + bucket_ranges: Option>, + ) -> Task { + Task::SplitCheckTask { + region, + start_key, + end_key, auto_split, 
policy, bucket_ranges, @@ -175,11 +199,17 @@ impl Display for Task { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { Task::SplitCheckTask { - region, auto_split, .. + region, + start_key, + end_key, + auto_split, + .. } => write!( f, - "[split check worker] Split Check Task for {}, auto_split: {:?}", + "[split check worker] Split Check Task for {}, start_key: {:?}, end_key: {:?}, auto_split: {:?}", region.get_id(), + start_key, + end_key, auto_split ), Task::ChangeConfig(_) => write!(f, "[split check worker] Change Config Task"), @@ -190,23 +220,30 @@ impl Display for Task { } } -pub struct Runner -where - E: KvEngine, -{ - engine: E, +pub struct Runner { + // We can't just use `TabletRegistry` here, otherwise v1 may create many + // invalid records and cause other problems. + engine: Either>, router: S, - coprocessor: CoprocessorHost, + coprocessor: CoprocessorHost, } -impl Runner -where - E: KvEngine, - S: CasualRouter, -{ - pub fn new(engine: E, router: S, coprocessor: CoprocessorHost) -> Runner { +impl Runner { + pub fn new(engine: EK, router: S, coprocessor: CoprocessorHost) -> Runner { Runner { - engine, + engine: Either::Left(engine), + router, + coprocessor, + } + } + + pub fn with_registry( + registry: TabletRegistry, + router: S, + coprocessor: CoprocessorHost, + ) -> Runner { + Runner { + engine: Either::Right(registry), router, coprocessor, } @@ -214,8 +251,9 @@ where fn approximate_check_bucket( &self, + tablet: &EK, region: &Region, - host: &mut SplitCheckerHost<'_, E>, + host: &mut SplitCheckerHost<'_, EK>, bucket_ranges: Option>, ) -> Result<()> { let ranges = bucket_ranges.clone().unwrap_or_else(|| { @@ -229,7 +267,7 @@ where let mut bucket = region.clone(); bucket.set_start_key(range.0.clone()); bucket.set_end_key(range.1.clone()); - let bucket_entry = host.approximate_bucket_keys(&bucket, &self.engine)?; + let bucket_entry = host.approximate_bucket_keys(&bucket, tablet)?; debug!( "bucket_entry size {} keys count {}", 
bucket_entry.size, @@ -299,64 +337,110 @@ where region: &Region, bucket_ranges: Option>, ) { - let _ = self.router.send( + self.router.refresh_region_buckets( region.get_id(), - CasualMessage::RefreshRegionBuckets { - region_epoch: region.get_region_epoch().clone(), - buckets, - bucket_ranges, - cb: Callback::None, - }, + region.get_region_epoch().clone(), + buckets, + bucket_ranges, ); } - /// Checks a Region with split and bucket checkers to produce split keys and buckets keys and generates split admin command. + /// Checks a Region with split and bucket checkers to produce split keys and + /// buckets keys and generates split admin command. fn check_split_and_bucket( &mut self, region: &Region, + start_key: Option>, + end_key: Option>, auto_split: bool, policy: CheckPolicy, bucket_ranges: Option>, ) { + let mut cached; + let tablet = match &self.engine { + Either::Left(e) => e, + Either::Right(r) => match r.get(region.get_id()) { + Some(c) => { + cached = Some(c); + match cached.as_mut().unwrap().latest() { + Some(t) => t, + None => return, + } + } + None => return, + }, + }; let region_id = region.get_id(); - let start_key = keys::enc_start_key(region); - let end_key = keys::enc_end_key(region); + let is_key_range = start_key.is_some() && end_key.is_some(); + let start_key = if is_key_range { + // This key is usually from a request, which should be encoded first. 
+ keys::data_key(Key::from_raw(&start_key.unwrap()).as_encoded().as_slice()) + } else { + keys::enc_start_key(region) + }; + let end_key = if is_key_range { + keys::data_end_key(Key::from_raw(&end_key.unwrap()).as_encoded().as_slice()) + } else { + keys::enc_end_key(region) + }; debug!( "executing task"; "region_id" => region_id, + "is_key_range" => is_key_range, "start_key" => log_wrappers::Value::key(&start_key), "end_key" => log_wrappers::Value::key(&end_key), "policy" => ?policy, ); CHECK_SPILT_COUNTER.all.inc(); - let mut host = - self.coprocessor - .new_split_checker_host(region, &self.engine, auto_split, policy); + let mut host = self + .coprocessor + .new_split_checker_host(region, tablet, auto_split, policy); if host.skip() { - debug!("skip split check"; "region_id" => region.get_id()); + debug!("skip split check"; + "region_id" => region.get_id(), + "is_key_range" => is_key_range, + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + ); return; } let split_keys = match host.policy() { CheckPolicy::Scan => { - match self.scan_split_keys(&mut host, region, &start_key, &end_key, bucket_ranges) { + match self.scan_split_keys( + &mut host, + tablet, + region, + is_key_range, + &start_key, + &end_key, + bucket_ranges, + ) { Ok(keys) => keys, Err(e) => { - error!(%e; "failed to scan split key"; "region_id" => region_id,); + error!(%e; "failed to scan split key"; + "region_id" => region_id, + "is_key_range" => is_key_range, + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + ); return; } } } - CheckPolicy::Approximate => match host.approximate_split_keys(region, &self.engine) { + CheckPolicy::Approximate => match host.approximate_split_keys(region, tablet) { Ok(keys) => { if host.enable_region_bucket() { if let Err(e) = - self.approximate_check_bucket(region, &mut host, bucket_ranges) + self.approximate_check_bucket(tablet, region, &mut host, 
bucket_ranges) { error!(%e; "approximate_check_bucket failed"; "region_id" => region_id, + "is_key_range" => is_key_range, + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), ); } } @@ -368,17 +452,27 @@ where error!(%e; "failed to get approximate split key, try scan way"; "region_id" => region_id, + "is_key_range" => is_key_range, + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), ); match self.scan_split_keys( &mut host, + tablet, region, + is_key_range, &start_key, &end_key, bucket_ranges, ) { Ok(keys) => keys, Err(e) => { - error!(%e; "failed to scan split key"; "region_id" => region_id,); + error!(%e; "failed to scan split key"; + "region_id" => region_id, + "is_key_range" => is_key_range, + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + ); return; } } @@ -389,12 +483,8 @@ where if !split_keys.is_empty() { let region_epoch = region.get_region_epoch().clone(); - let msg = new_split_region(region_epoch, split_keys, "split checker"); - let res = self.router.send(region_id, msg); - if let Err(e) = res { - warn!("failed to send check result"; "region_id" => region_id, "err" => %e); - } - + self.router + .ask_split(region_id, region_epoch, split_keys, "split checker".into()); CHECK_SPILT_COUNTER.success.inc(); } else { debug!( @@ -408,12 +498,14 @@ where /// Gets the split keys by scanning the range. /// bucket_ranges: specify the ranges to generate buckets. - /// If none, gengerate buckets for the whole region. + /// If none, generate buckets for the whole region. /// If it's Some(vec![]), skip generating buckets. 
fn scan_split_keys( &self, - host: &mut SplitCheckerHost<'_, E>, + host: &mut SplitCheckerHost<'_, EK>, + tablet: &EK, region: &Region, + is_key_range: bool, start_key: &[u8], end_key: &[u8], bucket_ranges: Option>, @@ -432,12 +524,8 @@ where (!host.enable_region_bucket(), &empty_bucket) }; - MergedIterator::<::Iterator>::new( - &self.engine, - LARGE_CFS, - start_key, - end_key, - false, + MergedIterator::<::Iterator>::new( + tablet, LARGE_CFS, start_key, end_key, false, ) .map(|mut iter| { let mut size = 0; @@ -481,7 +569,8 @@ where if bucket_range_idx == bucket_range_list.len() { skip_check_bucket = true; } else if origin_key >= bucket_range_list[bucket_range_idx].0.as_slice() { - // e.key() is between bucket_range_list[bucket_range_idx].0, bucket_range_list[bucket_range_idx].1 + // e.key() is between bucket_range_list[bucket_range_idx].0, + // bucket_range_list[bucket_range_idx].1 bucket_size += e.entry_size() as u64; if bucket_size >= host.region_bucket_size() { bucket.keys.push(origin_key.to_vec()); @@ -508,7 +597,11 @@ where } } - // if we scan the whole range, we can update approximate size and keys with accurate value. + // if we scan the whole range, we can update approximate size and keys with + // accurate value. 
+ if is_key_range { + return; + } info!( "update approximate size and keys with accurate value"; "region_id" => region.get_id(), @@ -517,14 +610,8 @@ where "bucket_count" => buckets.len(), "bucket_size" => bucket_size, ); - let _ = self.router.send( - region.get_id(), - CasualMessage::RegionApproximateSize { size }, - ); - let _ = self.router.send( - region.get_id(), - CasualMessage::RegionApproximateKeys { keys }, - ); + self.router.update_approximate_size(region.get_id(), size); + self.router.update_approximate_keys(region.get_id(), keys); })?; if host.enable_region_bucket() { @@ -543,39 +630,66 @@ where } fn change_cfg(&mut self, change: ConfigChange) { + if let Err(e) = self.coprocessor.cfg.update(change.clone()) { + error!("update split check config failed"; "err" => ?e); + return; + }; info!( "split check config updated"; "change" => ?change ); - self.coprocessor.cfg.update(change); } } -impl Runnable for Runner +impl Runnable for Runner where - E: KvEngine, - S: CasualRouter, + EK: KvEngine, + S: StoreHandle, { type Task = Task; fn run(&mut self, task: Task) { - let _io_type_guard = WithIOType::new(IOType::LoadBalance); + let _io_type_guard = WithIoType::new(IoType::LoadBalance); match task { Task::SplitCheckTask { region, + start_key, + end_key, + auto_split, + policy, + bucket_ranges, + } => self.check_split_and_bucket( + ®ion, + start_key, + end_key, auto_split, policy, bucket_ranges, - } => self.check_split_and_bucket(®ion, auto_split, policy, bucket_ranges), + ), Task::ChangeConfig(c) => self.change_cfg(c), Task::ApproximateBuckets(region) => { - if self.coprocessor.cfg.enable_region_bucket { + if self.coprocessor.cfg.enable_region_bucket() { + let mut cached; + let tablet = match &self.engine { + Either::Left(e) => e, + Either::Right(r) => match r.get(region.get_id()) { + Some(c) => { + cached = Some(c); + match cached.as_mut().unwrap().latest() { + Some(t) => t, + None => return, + } + } + None => return, + }, + }; let mut host = 
self.coprocessor.new_split_checker_host( ®ion, - &self.engine, + tablet, false, CheckPolicy::Approximate, ); - if let Err(e) = self.approximate_check_bucket(®ion, &mut host, None) { + if let Err(e) = self.approximate_check_bucket(tablet, ®ion, &mut host, None) + { error!(%e; "approximate_check_bucket failed"; "region_id" => region.get_id(), @@ -588,19 +702,3 @@ where } } } - -fn new_split_region( - region_epoch: RegionEpoch, - split_keys: Vec>, - source: &'static str, -) -> CasualMessage -where - E: KvEngine, -{ - CasualMessage::SplitRegion { - region_epoch, - split_keys, - callback: Callback::None, - source: source.into(), - } -} diff --git a/components/raftstore/src/store/worker/split_config.rs b/components/raftstore/src/store/worker/split_config.rs index da7f137765a..8fec853bb00 100644 --- a/components/raftstore/src/store/worker/split_config.rs +++ b/components/raftstore/src/store/worker/split_config.rs @@ -6,19 +6,51 @@ use lazy_static::lazy_static; use online_config::{ConfigChange, ConfigManager, OnlineConfig}; use parking_lot::Mutex; use serde::{Deserialize, Serialize}; -use tikv_util::{config::VersionTrack, info}; +use tikv_util::{ + config::{ReadableSize, VersionTrack}, + info, +}; const DEFAULT_DETECT_TIMES: u64 = 10; const DEFAULT_SAMPLE_THRESHOLD: u64 = 100; pub(crate) const DEFAULT_SAMPLE_NUM: usize = 20; -const DEFAULT_QPS_THRESHOLD: usize = 3000; -const DEFAULT_BYTE_THRESHOLD: usize = 30 * 1024 * 1024; - -// We get balance score by abs(sample.left-sample.right)/(sample.right+sample.left). It will be used to measure left and right balance +pub const DEFAULT_QPS_THRESHOLD: usize = 3000; +pub const DEFAULT_BIG_REGION_QPS_THRESHOLD: usize = 7000; +pub const DEFAULT_BYTE_THRESHOLD: usize = 30 * 1024 * 1024; +pub const DEFAULT_BIG_REGION_BYTE_THRESHOLD: usize = 100 * 1024 * 1024; + +// We get balance score by +// abs(sample.left-sample.right)/(sample.right+sample.left). 
It will be used to +// measure left and right balance const DEFAULT_SPLIT_BALANCE_SCORE: f64 = 0.25; -// We get contained score by sample.contained/(sample.right+sample.left+sample.contained). It will be used to avoid to split regions requested by range. +// We get contained score by +// sample.contained/(sample.right+sample.left+sample.contained). It will be used +// to avoid to split regions requested by range. const DEFAULT_SPLIT_CONTAINED_SCORE: f64 = 0.5; +// If the `split_balance_score` and `split_contained_score` above could not be +// satisfied, we will try to split the region according to its CPU load, +// then these parameters below will start to work. +// When the gRPC poll thread CPU usage (over the past `detect_times` seconds by +// default) is higher than gRPC poll thread count * +// `DEFAULT_GRPC_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO`, the CPU-based split won't +// be triggered no matter if the +// `DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO` and +// `REGION_CPU_OVERLOAD_THRESHOLD_RATIO` are exceeded to prevent from increasing +// the gRPC poll CPU usage. +const DEFAULT_GRPC_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.5; +// When the Unified Read Poll thread CPU usage is higher than Unified Read Poll +// thread count * +// `DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO`, +// the CPU-based split will try to check and record the top hot CPU region. +const DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.8; +// When the Unified Read Poll is hot and the region's CPU usage reaches +// `REGION_CPU_OVERLOAD_THRESHOLD_RATIO` as a percentage of the Unified Read +// Poll, it will be added into the hot region list and may be split later as the +// top hot CPU region. +pub const REGION_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.25; +pub const BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.75; + lazy_static! 
{ static ref SPLIT_CONFIG: Mutex>>> = Mutex::new(None); } @@ -43,6 +75,11 @@ pub struct SplitConfig { pub sample_num: usize, pub sample_threshold: u64, pub byte_threshold: usize, + #[doc(hidden)] + pub grpc_thread_cpu_overload_threshold_ratio: f64, + #[doc(hidden)] + pub unified_read_pool_thread_cpu_overload_threshold_ratio: f64, + pub region_cpu_overload_threshold_ratio: f64, // deprecated. #[online_config(skip)] #[doc(hidden)] @@ -65,6 +102,11 @@ impl Default for SplitConfig { sample_num: DEFAULT_SAMPLE_NUM, sample_threshold: DEFAULT_SAMPLE_THRESHOLD, byte_threshold: DEFAULT_BYTE_THRESHOLD, + grpc_thread_cpu_overload_threshold_ratio: + DEFAULT_GRPC_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO, + unified_read_pool_thread_cpu_overload_threshold_ratio: + DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO, + region_cpu_overload_threshold_ratio: REGION_CPU_OVERLOAD_THRESHOLD_RATIO, size_threshold: None, // deprecated. key_threshold: None, // deprecated. } @@ -87,8 +129,26 @@ impl SplitConfig { ("sample_num should be less than qps_threshold for load-base-split.").into(), ); } + if self.grpc_thread_cpu_overload_threshold_ratio > 1.0 + || self.grpc_thread_cpu_overload_threshold_ratio < 0.0 + || self.unified_read_pool_thread_cpu_overload_threshold_ratio > 1.0 + || self.unified_read_pool_thread_cpu_overload_threshold_ratio < 0.0 + || self.region_cpu_overload_threshold_ratio > 1.0 + || self.region_cpu_overload_threshold_ratio < 0.0 + { + return Err(("threshold ratio should be between 0 and 1.").into()); + } Ok(()) } + + pub fn optimize_for(&mut self, region_size: ReadableSize) { + const LARGE_REGION_SIZE_IN_MB: u64 = 4096; + if region_size.as_mb() >= LARGE_REGION_SIZE_IN_MB { + self.qps_threshold = DEFAULT_BIG_REGION_QPS_THRESHOLD; + self.region_cpu_overload_threshold_ratio = BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO; + self.byte_threshold = DEFAULT_BIG_REGION_BYTE_THRESHOLD; + } + } } #[derive(Clone)] @@ -117,7 +177,7 @@ impl ConfigManager for SplitConfigManager { { let 
change = change.clone(); self.0 - .update(move |cfg: &mut SplitConfig| cfg.update(change)); + .update(move |cfg: &mut SplitConfig| cfg.update(change))?; } info!( "load base split config changed"; diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index d21c97285d0..6d556d1c283 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -4,7 +4,7 @@ use std::{ cmp::{min, Ordering}, collections::{BinaryHeap, HashMap, HashSet}, slice::{Iter, IterMut}, - sync::Arc, + sync::{mpsc::Receiver, Arc}, time::{Duration, SystemTime}, }; @@ -13,40 +13,26 @@ use kvproto::{ metapb::{self, Peer}, pdpb::QueryKind, }; -use pd_client::{merge_bucket_stats, new_bucket_stats, BucketMeta, BucketStat}; +use pd_client::{BucketMeta, BucketStat}; use rand::Rng; -use tikv_util::{config::Tracker, debug, info, warn}; +use resource_metering::RawRecords; +use tikv_util::{ + config::Tracker, + debug, info, + metrics::ThreadInfoStatistics, + store::{is_read_query, QueryStats}, + warn, +}; use crate::store::{ metrics::*, - worker::{ - query_stats::{is_read_query, QueryStats}, - split_config::get_sample_num, - FlowStatistics, SplitConfig, SplitConfigManager, - }, + util::build_key_range, + worker::{split_config::get_sample_num, FlowStatistics, SplitConfig, SplitConfigManager}, }; const DEFAULT_MAX_SAMPLE_LOOP_COUNT: usize = 10000; pub const TOP_N: usize = 10; -// LOAD_BASE_SPLIT_EVENT metrics label definitions. -// Workload fits the QPS threshold or byte threshold. -const LOAD_FIT: &str = "load_fit"; -// The statistical key is empty. -const EMPTY_STATISTICAL_KEY: &str = "empty_statistical_key"; -// Split info has been collected, ready to split. -const READY_TO_SPLIT: &str = "ready_to_split"; -// Split info has not been collected yet, not ready to split. 
-const NOT_READY_TO_SPLIT: &str = "not_ready_to_split"; -// The number of sampled keys does not meet the threshold. -const NO_ENOUGH_SAMPLED_KEY: &str = "no_enough_sampled_key"; -// The number of sampled keys located on left and right does not meet the threshold. -const NO_ENOUGH_LR_KEY: &str = "no_enough_lr_key"; -// The number of balanced keys does not meet the score. -const NO_BALANCE_KEY: &str = "no_balance_key"; -// The number of contained keys does not meet the score. -const NO_UNCROSS_KEY: &str = "no_uncross_key"; - // It will return prefix sum of the given iter, // `read` is a function to process the item from the iter. #[inline(always)] @@ -76,7 +62,8 @@ where } // This function uses the distributed/parallel reservoir sampling algorithm. -// It will sample min(sample_num, all_key_ranges_num) key ranges from multiple `key_ranges_provider` with the same possibility. +// It will sample min(sample_num, all_key_ranges_num) key ranges from multiple +// `key_ranges_provider` with the same possibility. fn sample( sample_num: usize, mut key_ranges_providers: Vec, @@ -88,7 +75,8 @@ where let mut sampled_key_ranges = vec![]; // Retain the non-empty key ranges. // `key_ranges_provider` may return an empty key ranges vector, which will cause - // the later sampling to fall into a dead loop. So we need to filter it out here. + // the later sampling to fall into a dead loop. So we need to filter it out + // here. key_ranges_providers .retain_mut(|key_ranges_provider| !key_ranges_getter(key_ranges_provider).is_empty()); if key_ranges_providers.is_empty() { @@ -125,8 +113,9 @@ where // Generate a random number in [1, all_key_ranges_num]. // Starting from 1 is to achieve equal probability. // For example, for a `prefix_sum` like [1, 2, 3, 4], - // if we generate a random number in [0, 4], the probability of choosing the first index is 0.4 - // rather than 0.25 due to that 0 and 1 will both make `binary_search` get the same result. 
+ // if we generate a random number in [0, 4], the probability of choosing the + // first index is 0.4 rather than 0.25 due to that 0 and 1 will both + // make `binary_search` get the same result. let i = prefix_sum .binary_search(&rng.gen_range(1..=all_key_ranges_num)) .unwrap_or_else(|i| i); @@ -186,7 +175,8 @@ impl From> for Samples { } impl Samples { - // evaluate the samples according to the given key range, it will update the sample's left, right and contained counter. + // evaluate the samples according to the given key range, it will update the + // sample's left, right and contained counter. fn evaluate(&mut self, key_range: &KeyRange) { for mut sample in self.0.iter_mut() { let order_start = if key_range.start_key.is_empty() { @@ -221,42 +211,39 @@ impl Samples { } let evaluated_key_num_lr = sample.left + sample.right; if evaluated_key_num_lr == 0 { - LOAD_BASE_SPLIT_EVENT - .with_label_values(&[NO_ENOUGH_LR_KEY]) - .inc(); + LOAD_BASE_SPLIT_EVENT.no_enough_lr_key.inc(); continue; } let evaluated_key_num = (sample.contained + evaluated_key_num_lr) as f64; - // The balance score is the difference in the number of requested keys between the left and right of a sample key. - // The smaller the balance score, the more balanced the load will be after this splitting. + // The balance score is the difference in the number of requested keys between + // the left and right of a sample key. The smaller the balance + // score, the more balanced the load will be after this splitting. let balance_score = (sample.left as f64 - sample.right as f64).abs() / evaluated_key_num_lr as f64; LOAD_BASE_SPLIT_SAMPLE_VEC .with_label_values(&["balance_score"]) .observe(balance_score); if balance_score >= split_balance_score { - LOAD_BASE_SPLIT_EVENT - .with_label_values(&[NO_BALANCE_KEY]) - .inc(); + LOAD_BASE_SPLIT_EVENT.no_balance_key.inc(); continue; } - // The contained score is the ratio of a sample key that are contained in the requested key. 
- // The larger the contained score, the more RPCs the cluster will receive after this splitting. + // The contained score is the ratio of a sample key that are contained in the + // requested key. The larger the contained score, the more RPCs the + // cluster will receive after this splitting. let contained_score = sample.contained as f64 / evaluated_key_num; LOAD_BASE_SPLIT_SAMPLE_VEC .with_label_values(&["contained_score"]) .observe(contained_score); if contained_score >= split_contained_score { - LOAD_BASE_SPLIT_EVENT - .with_label_values(&[NO_UNCROSS_KEY]) - .inc(); + LOAD_BASE_SPLIT_EVENT.no_uncross_key.inc(); continue; } - // We try to find a split key that has the smallest balance score and the smallest contained score - // to make the splitting keep the load balanced while not increasing too many RPCs. + // We try to find a split key that has the smallest balance score and the + // smallest contained score to make the splitting keep the load + // balanced while not increasing too many RPCs. let final_score = balance_score + contained_score; if final_score < best_score { best_index = index as i32; @@ -266,7 +253,7 @@ impl Samples { if best_index >= 0 { return self.0[best_index as usize].key.clone(); } - return vec![]; + vec![] } } @@ -277,6 +264,8 @@ pub struct Recorder { pub peer: Peer, pub key_ranges: Vec>, pub create_time: SystemTime, + pub cpu_usage: f64, + pub hottest_key_range: Option, } impl Recorder { @@ -286,6 +275,8 @@ impl Recorder { peer: Peer::default(), key_ranges: vec![], create_time: SystemTime::now(), + cpu_usage: 0.0, + hottest_key_range: None, } } @@ -299,22 +290,31 @@ impl Recorder { } } + fn update_cpu_usage(&mut self, cpu_usage: f64) { + self.cpu_usage = cpu_usage; + } + + fn update_hottest_key_range(&mut self, key_range: KeyRange) { + self.hottest_key_range = Some(key_range); + } + fn is_ready(&self) -> bool { self.key_ranges.len() >= self.detect_times } // collect the split keys from the recorded key_ranges. 
// This will start a second-level sampling on the previous sampled key ranges, - // evaluate the samples according to the given key range, and compute the split keys finally. + // evaluate the samples according to the given key range, and compute the split + // keys finally. fn collect(&self, config: &SplitConfig) -> Vec { let sampled_key_ranges = sample(config.sample_num, self.key_ranges.clone(), |x| x); let mut samples = Samples::from(sampled_key_ranges); let recorded_key_ranges: Vec<&KeyRange> = self.key_ranges.iter().flatten().collect(); - // Because we need to observe the number of `no_enough_key` of all the actual keys, - // so we do this check after the samples are calculated. + // Because we need to observe the number of `no_enough_key` of all the actual + // keys, so we do this check after the samples are calculated. if (recorded_key_ranges.len() as u64) < config.sample_threshold { LOAD_BASE_SPLIT_EVENT - .with_label_values(&[NO_ENOUGH_SAMPLED_KEY]) + .no_enough_sampled_key .inc_by(samples.0.len() as u64); return vec![]; } @@ -325,8 +325,8 @@ impl Recorder { } } -// RegionInfo will maintain key_ranges with sample_num length by reservoir sampling. -// And it will save qps num and peer. +// RegionInfo will maintain key_ranges with sample_num length by reservoir +// sampling. And it will save qps num and peer. #[derive(Debug, Clone)] pub struct RegionInfo { pub sample_num: usize, @@ -361,7 +361,7 @@ impl RegionInfo { if n == 0 || self.key_ranges.len() < self.sample_num { self.key_ranges.push(key_range); } else { - let j = rand::thread_rng().gen_range(0..n) as usize; + let j = rand::thread_rng().gen_range(0..n); if j < self.sample_num { self.key_ranges[j] = key_range; } @@ -388,7 +388,8 @@ pub struct ReadStats { // 2. add_query_num_batch // 3. add_flow // Among these three methods, `add_flow` will not update `key_ranges` of `RegionInfo`, - // and due to this, an `RegionInfo` without `key_ranges` may occur. The caller should be aware of this. 
+ // and due to this, an `RegionInfo` without `key_ranges` may occur. The caller should be aware + // of this. pub region_infos: HashMap, pub sample_num: usize, pub region_buckets: HashMap, @@ -450,30 +451,22 @@ impl ReadStats { region_info.flow.add(write); region_info.flow.add(data); if let Some(buckets) = buckets { - let bucket_stat = self.region_buckets.entry(region_id).or_insert_with(|| { - let stats = new_bucket_stats(buckets); - BucketStat::new(buckets.clone(), stats) - }); - if bucket_stat.meta < *buckets { - let stats = new_bucket_stats(buckets); - let mut new = BucketStat::new(buckets.clone(), stats); - merge_bucket_stats( - &new.meta.keys, - &mut new.stats, - &bucket_stat.meta.keys, - &bucket_stat.stats, - ); - *bucket_stat = new; - } + let bucket_stat = self + .region_buckets + .entry(region_id) + .and_modify(|current| { + if current.meta < *buckets { + let mut new = BucketStat::from_meta(buckets.clone()); + std::mem::swap(current, &mut new); + current.merge(&new); + } + }) + .or_insert_with(|| BucketStat::from_meta(buckets.clone())); let mut delta = metapb::BucketStats::default(); delta.set_read_bytes(vec![(write.read_bytes + data.read_bytes) as u64]); delta.set_read_keys(vec![(write.read_keys + data.read_keys) as u64]); - let start = start.unwrap_or_default(); - let end = end.unwrap_or_default(); - merge_bucket_stats( - &bucket_stat.meta.keys, - &mut bucket_stat.stats, - &[start, end], + bucket_stat.add_flows( + &[start.unwrap_or_default(), end.unwrap_or_default()], &delta, ); } @@ -515,34 +508,147 @@ impl WriteStats { pub struct SplitInfo { pub region_id: u64, - pub split_key: Vec, pub peer: Peer, + pub split_key: Option>, + pub start_key: Option>, + pub end_key: Option>, +} + +impl SplitInfo { + // Create a SplitInfo with the given region_id, peer and split_key. + // This is used to split the region with this specified split key later. 
+ fn with_split_key(region_id: u64, peer: Peer, split_key: Vec) -> Self { + SplitInfo { + region_id, + peer, + split_key: Some(split_key), + start_key: None, + end_key: None, + } + } + + // Create a SplitInfo with the given region_id, peer, start_key and end_key. + // This is used to split the region on half within the specified start and end + // keys later. + fn with_start_end_key( + region_id: u64, + peer: Peer, + start_key: Vec, + end_key: Vec, + ) -> Self { + SplitInfo { + region_id, + peer, + split_key: None, + start_key: Some(start_key), + end_key: Some(end_key), + } + } +} + +#[derive(PartialEq, Debug)] +pub enum SplitConfigChange { + Noop, + UpdateRegionCpuCollector(bool), } pub struct AutoSplitController { // RegionID -> Recorder pub recorders: HashMap, - cfg: SplitConfig, + pub cfg: SplitConfig, cfg_tracker: Tracker, + // Thread-related info + max_grpc_thread_count: usize, + max_unified_read_pool_thread_count: usize, + unified_read_pool_scale_receiver: Option>, + grpc_thread_usage_vec: Vec, } impl AutoSplitController { - pub fn new(config_manager: SplitConfigManager) -> AutoSplitController { + pub fn new( + config_manager: SplitConfigManager, + max_grpc_thread_count: usize, + max_unified_read_pool_thread_count: usize, + unified_read_pool_scale_receiver: Option>, + ) -> AutoSplitController { AutoSplitController { recorders: HashMap::default(), cfg: config_manager.value().clone(), cfg_tracker: config_manager.0.clone().tracker("split_hub".to_owned()), + max_grpc_thread_count, + max_unified_read_pool_thread_count, + unified_read_pool_scale_receiver, + grpc_thread_usage_vec: vec![], } } pub fn default() -> AutoSplitController { - AutoSplitController::new(SplitConfigManager::default()) + AutoSplitController::new(SplitConfigManager::default(), 0, 0, None) + } + + fn update_grpc_thread_usage(&mut self, grpc_thread_usage: f64) { + self.grpc_thread_usage_vec.push(grpc_thread_usage); + let length = self.grpc_thread_usage_vec.len(); + let detect_times = 
self.cfg.detect_times as usize; + // Only keep the last `self.cfg.detect_times` elements. + if length > detect_times { + self.grpc_thread_usage_vec.drain(..length - detect_times); + } + } + + fn get_avg_grpc_thread_usage(&self) -> f64 { + let length = self.grpc_thread_usage_vec.len(); + if length == 0 { + return 0.0; + } + let sum = self.grpc_thread_usage_vec.iter().sum::(); + sum / length as f64 + } + + fn should_check_region_cpu(&self) -> bool { + self.cfg.region_cpu_overload_threshold_ratio > 0.0 + } + + fn is_grpc_poll_busy(&self, avg_grpc_thread_usage: f64) -> bool { + fail::fail_point!("mock_grpc_poll_is_not_busy", |_| { false }); + if self.max_grpc_thread_count == 0 { + return false; + } + if self.cfg.grpc_thread_cpu_overload_threshold_ratio <= 0.0 { + return true; + } + avg_grpc_thread_usage + >= self.max_grpc_thread_count as f64 * self.cfg.grpc_thread_cpu_overload_threshold_ratio } - // collect the read stats from read_stats_vec and dispatch them to a region hashmap. + fn is_unified_read_pool_busy(&self, unified_read_pool_thread_usage: f64) -> bool { + fail::fail_point!("mock_unified_read_pool_is_busy", |_| { true }); + if self.max_unified_read_pool_thread_count == 0 { + return false; + } + let unified_read_pool_cpu_overload_threshold = self.max_unified_read_pool_thread_count + as f64 + * self + .cfg + .unified_read_pool_thread_cpu_overload_threshold_ratio; + unified_read_pool_thread_usage > 0.0 + && unified_read_pool_thread_usage >= unified_read_pool_cpu_overload_threshold + } + + fn is_region_busy(&self, unified_read_pool_thread_usage: f64, region_cpu_usage: f64) -> bool { + fail::fail_point!("mock_region_is_busy", |_| { true }); + if unified_read_pool_thread_usage <= 0.0 || !self.should_check_region_cpu() { + return false; + } + region_cpu_usage / unified_read_pool_thread_usage + >= self.cfg.region_cpu_overload_threshold_ratio + } + + // collect the read stats from read_stats_vec and dispatch them to a Region + // HashMap. 
fn collect_read_stats(read_stats_vec: Vec) -> HashMap> { - // collect from different thread - let mut region_infos_map = HashMap::default(); // regionID-regionInfos + // RegionID -> Vec, collect the RegionInfo from different threads. + let mut region_infos_map = HashMap::default(); let capacity = read_stats_vec.len(); for read_stats in read_stats_vec { for (region_id, region_info) in read_stats.region_infos { @@ -555,13 +661,115 @@ impl AutoSplitController { region_infos_map } - // flush the read stats info into the recorder and check if the region needs to be split - // according to all the stats info the recorder has collected before. - pub fn flush(&mut self, read_stats_vec: Vec) -> (Vec, Vec) { - let mut split_infos = vec![]; + // collect the CPU stats from cpu_stats_vec and dispatch them to a Region + // HashMap. + fn collect_cpu_stats( + &self, + cpu_stats_vec: Vec>, + ) -> HashMap)> { + // RegionID -> (CPU usage, Hottest Key Range), calculate the CPU usage and its + // hottest key range. + let mut region_cpu_map = HashMap::default(); + if !self.should_check_region_cpu() { + return region_cpu_map; + } + // Calculate the Region CPU usage. + let mut collect_interval_ms = 0; + let mut region_key_range_cpu_time_map = HashMap::new(); + cpu_stats_vec.iter().for_each(|cpu_stats| { + cpu_stats.records.iter().for_each(|(tag, record)| { + // Calculate the Region ID -> CPU Time. + region_cpu_map + .entry(tag.region_id) + .and_modify(|(cpu_time, _)| *cpu_time += record.cpu_time as f64) + .or_insert_with(|| (record.cpu_time as f64, None)); + // Calculate the (Region ID, Key Range) -> CPU Time. + tag.key_ranges.iter().for_each(|key_range| { + region_key_range_cpu_time_map + .entry((tag.region_id, key_range)) + .and_modify(|cpu_time| *cpu_time += record.cpu_time) + .or_insert_with(|| record.cpu_time); + }) + }); + collect_interval_ms += cpu_stats.duration.as_millis(); + }); + // Calculate the Region CPU usage. 
+ region_cpu_map.iter_mut().for_each(|(_, (cpu_time, _))| { + if collect_interval_ms == 0 { + *cpu_time = 0.0; + } else { + *cpu_time /= collect_interval_ms as f64; + } + }); + // Choose the hottest key range for each Region. + let mut hottest_key_range_cpu_time_map = HashMap::with_capacity(region_cpu_map.len()); + region_key_range_cpu_time_map + .iter() + .for_each(|((region_id, key_range), cpu_time)| { + let hottest_key_range_cpu_time = hottest_key_range_cpu_time_map + .entry(*region_id) + .or_insert_with(|| 0); + if cpu_time > hottest_key_range_cpu_time { + region_cpu_map + .entry(*region_id) + .and_modify(|(_, old_key_range)| { + *old_key_range = + Some(build_key_range(&key_range.0, &key_range.1, false)); + }); + *hottest_key_range_cpu_time = *cpu_time; + } + }); + region_cpu_map + } + + fn collect_thread_usage(thread_stats: &ThreadInfoStatistics, name: &str) -> f64 { + thread_stats + .get_cpu_usages() + .iter() + .filter(|(thread_name, _)| thread_name.contains(name)) + .fold(0, |cpu_usage_sum, (_, cpu_usage)| { + // `cpu_usage` is in [0, 100]. + cpu_usage_sum + cpu_usage + }) as f64 + / 100.0 + } + + // flush the read stats info into the recorder and check if the region needs to + // be split according to all the stats info the recorder has collected before. + pub fn flush( + &mut self, + read_stats_vec: Vec, + cpu_stats_vec: Vec>, + thread_stats: &ThreadInfoStatistics, + ) -> (Vec, Vec) { + let mut top_cpu_usage = vec![]; let mut top_qps = BinaryHeap::with_capacity(TOP_N); let region_infos_map = Self::collect_read_stats(read_stats_vec); + let region_cpu_map = self.collect_cpu_stats(cpu_stats_vec); + // Prepare some diagnostic info. + let (grpc_thread_usage, unified_read_pool_thread_usage) = ( + Self::collect_thread_usage(thread_stats, "grpc-server"), + Self::collect_thread_usage(thread_stats, "unified-read-po"), + ); + // Update first before calculating the latest average gRPC poll CPU usage. 
+ self.update_grpc_thread_usage(grpc_thread_usage); + let avg_grpc_thread_usage = self.get_avg_grpc_thread_usage(); + let (is_grpc_poll_busy, is_unified_read_pool_busy) = ( + self.is_grpc_poll_busy(avg_grpc_thread_usage), + self.is_unified_read_pool_busy(unified_read_pool_thread_usage), + ); + debug!("flush to load base split"; + "max_grpc_thread_count" => self.max_grpc_thread_count, + "grpc_thread_usage" => grpc_thread_usage, + "avg_grpc_thread_usage" => avg_grpc_thread_usage, + "max_unified_read_pool_thread_count" => self.max_unified_read_pool_thread_count, + "unified_read_pool_thread_usage" => unified_read_pool_thread_usage, + "is_grpc_poll_busy" => is_grpc_poll_busy, + "is_unified_read_pool_busy" => is_unified_read_pool_busy, + ); + // Start to record the read stats info. + let mut split_infos = vec![]; for (region_id, region_infos) in region_infos_map { let qps_prefix_sum = prefix_sum(region_infos.iter(), RegionInfo::get_read_qps); // region_infos is not empty, so it's safe to unwrap here. @@ -569,24 +777,36 @@ impl AutoSplitController { let byte = region_infos .iter() .fold(0, |flow, region_info| flow + region_info.flow.read_bytes); + let (cpu_usage, hottest_key_range) = region_cpu_map + .get(®ion_id) + .map(|(cpu_usage, key_range)| (*cpu_usage, key_range.clone())) + .unwrap_or((0.0, None)); + let is_region_busy = self.is_region_busy(unified_read_pool_thread_usage, cpu_usage); debug!("load base split params"; "region_id" => region_id, "qps" => qps, "qps_threshold" => self.cfg.qps_threshold, "byte" => byte, "byte_threshold" => self.cfg.byte_threshold, + "cpu_usage" => cpu_usage, + "is_region_busy" => is_region_busy, ); QUERY_REGION_VEC .with_label_values(&["read"]) .observe(qps as f64); - if qps < self.cfg.qps_threshold && byte < self.cfg.byte_threshold { + // 1. If the QPS or the byte does not meet the threshold, skip. + // 2. If the Unified Read Pool or the region is not hot enough, skip. 
+ if qps < self.cfg.qps_threshold + && byte < self.cfg.byte_threshold + && (!is_unified_read_pool_busy || !is_region_busy) + { self.recorders.remove_entry(®ion_id); continue; } - LOAD_BASE_SPLIT_EVENT.with_label_values(&[LOAD_FIT]).inc(); + LOAD_BASE_SPLIT_EVENT.load_fit.inc(); let detect_times = self.cfg.detect_times; let recorder = self @@ -594,6 +814,10 @@ impl AutoSplitController { .entry(region_id) .or_insert_with(|| Recorder::new(detect_times)); recorder.update_peer(®ion_infos[0].peer); + recorder.update_cpu_usage(cpu_usage); + if let Some(hottest_key_range) = hottest_key_range { + recorder.update_hottest_key_range(hottest_key_range); + } let key_ranges = sample( self.cfg.sample_num, @@ -601,38 +825,81 @@ impl AutoSplitController { RegionInfo::get_key_ranges_mut, ); if key_ranges.is_empty() { - LOAD_BASE_SPLIT_EVENT - .with_label_values(&[EMPTY_STATISTICAL_KEY]) - .inc(); + LOAD_BASE_SPLIT_EVENT.empty_statistical_key.inc(); continue; } recorder.record(key_ranges); if recorder.is_ready() { let key = recorder.collect(&self.cfg); if !key.is_empty() { - split_infos.push(SplitInfo { + split_infos.push(SplitInfo::with_split_key( region_id, - split_key: key, - peer: recorder.peer.clone(), - }); - LOAD_BASE_SPLIT_EVENT - .with_label_values(&[READY_TO_SPLIT]) - .inc(); + recorder.peer.clone(), + key, + )); + LOAD_BASE_SPLIT_EVENT.ready_to_split.inc(); info!("load base split region"; "region_id" => region_id, "qps" => qps, + "byte" => byte, + "cpu_usage" => cpu_usage, ); + self.recorders.remove(®ion_id); + } else if is_unified_read_pool_busy && is_region_busy { + LOAD_BASE_SPLIT_EVENT.cpu_load_fit.inc(); + top_cpu_usage.push(region_id); } - self.recorders.remove(®ion_id); } else { - LOAD_BASE_SPLIT_EVENT - .with_label_values(&[NOT_READY_TO_SPLIT]) - .inc(); + LOAD_BASE_SPLIT_EVENT.not_ready_to_split.inc(); } top_qps.push(qps); } + // Check if the top CPU usage region could be split. + // TODO: avoid unnecessary split by introducing the feedback mechanism from PD. 
+ if !top_cpu_usage.is_empty() { + // Only split the top CPU region when the gRPC poll is not busy. + if !is_grpc_poll_busy { + // Calculate by using the latest CPU usage. + top_cpu_usage.sort_unstable_by(|a, b| { + let cpu_usage_a = self.recorders.get(a).unwrap().cpu_usage; + let cpu_usage_b = self.recorders.get(b).unwrap().cpu_usage; + cpu_usage_b.partial_cmp(&cpu_usage_a).unwrap() + }); + let region_id = top_cpu_usage[0]; + let recorder = self.recorders.get_mut(®ion_id).unwrap(); + if recorder.hottest_key_range.is_some() { + split_infos.push(SplitInfo::with_start_end_key( + region_id, + recorder.peer.clone(), + recorder + .hottest_key_range + .as_ref() + .unwrap() + .start_key + .clone(), + recorder.hottest_key_range.as_ref().unwrap().end_key.clone(), + )); + LOAD_BASE_SPLIT_EVENT.ready_to_split_cpu_top.inc(); + info!("load base split region"; + "region_id" => region_id, + "start_key" => log_wrappers::Value::key(&recorder.hottest_key_range.as_ref().unwrap().start_key), + "end_key" => log_wrappers::Value::key(&recorder.hottest_key_range.as_ref().unwrap().end_key), + "cpu_usage" => recorder.cpu_usage, + ); + } else { + LOAD_BASE_SPLIT_EVENT.empty_hottest_key_range.inc(); + } + } else { + LOAD_BASE_SPLIT_EVENT.unable_to_split_cpu_top.inc(); + } + // Clean up the rest top CPU usage recorders. 
+ for region_id in top_cpu_usage { + self.recorders.remove(®ion_id); + } + } + (top_qps.into_vec(), split_infos) } @@ -645,19 +912,42 @@ impl AutoSplitController { }); } - pub fn refresh_cfg(&mut self) { + pub fn refresh_and_check_cfg(&mut self) -> SplitConfigChange { + let mut cfg_change = SplitConfigChange::Noop; if let Some(incoming) = self.cfg_tracker.any_new() { + if self.cfg.region_cpu_overload_threshold_ratio <= 0.0 + && incoming.region_cpu_overload_threshold_ratio > 0.0 + { + cfg_change = SplitConfigChange::UpdateRegionCpuCollector(true); + } + if self.cfg.region_cpu_overload_threshold_ratio > 0.0 + && incoming.region_cpu_overload_threshold_ratio <= 0.0 + { + cfg_change = SplitConfigChange::UpdateRegionCpuCollector(false); + } self.cfg = incoming.clone(); } + // Adjust with the size change of the Unified Read Pool. + if let Some(rx) = &self.unified_read_pool_scale_receiver { + if let Ok(max_thread_count) = rx.try_recv() { + self.max_unified_read_pool_thread_count = max_thread_count; + } + } + cfg_change } } #[cfg(test)] mod tests { + use online_config::{ConfigChange, ConfigManager, ConfigValue}; + use resource_metering::{RawRecord, TagInfos}; + use tikv_util::config::VersionTrack; use txn_types::Key; use super::*; - use crate::store::{util::build_key_range, worker::split_config::DEFAULT_SAMPLE_NUM}; + use crate::store::worker::split_config::{ + DEFAULT_SAMPLE_NUM, REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + }; enum Position { Left, @@ -769,7 +1059,7 @@ mod tests { build_key_range(b"a", b"b", false), build_key_range(b"b", b"c", false), ]; - check_split( + check_split_key( b"raw key", vec![gen_read_stats(1, raw_key_ranges.clone())], vec![b"b"], @@ -783,14 +1073,14 @@ mod tests { build_key_range(key_a.as_encoded(), key_b.as_encoded(), false), build_key_range(key_b.as_encoded(), key_c.as_encoded(), false), ]; - check_split( + check_split_key( b"encoded key", vec![gen_read_stats(1, encoded_key_ranges.clone())], vec![key_b.as_encoded()], ); // mix mode - check_split( 
+ check_split_key( b"mix key", vec![ gen_read_stats(1, raw_key_ranges), @@ -800,7 +1090,7 @@ mod tests { ); // test distribution with contained key - for _i in 0..100 { + for _ in 0..100 { let key_ranges = vec![ build_key_range(b"a", b"k", false), build_key_range(b"b", b"j", false), @@ -809,7 +1099,7 @@ mod tests { build_key_range(b"e", b"g", false), build_key_range(b"f", b"f", false), ]; - check_split( + check_split_key( b"isosceles triangle", vec![gen_read_stats(1, key_ranges)], vec![], @@ -823,7 +1113,7 @@ mod tests { build_key_range(b"e", b"j", false), build_key_range(b"f", b"k", false), ]; - check_split( + check_split_key( b"parallelogram", vec![gen_read_stats(1, key_ranges)], vec![], @@ -833,7 +1123,7 @@ mod tests { build_key_range(b"a", b"l", false), build_key_range(b"a", b"m", false), ]; - check_split( + check_split_key( b"right-angle trapezoid", vec![gen_read_stats(1, key_ranges)], vec![], @@ -843,46 +1133,159 @@ mod tests { build_key_range(b"a", b"l", false), build_key_range(b"b", b"l", false), ]; - check_split( + check_split_key( b"right-angle trapezoid", vec![gen_read_stats(1, key_ranges)], vec![], ); } + + // test high CPU usage + fail::cfg("mock_grpc_poll_is_not_busy", "return(0)").unwrap(); + fail::cfg("mock_unified_read_pool_is_busy", "return(0)").unwrap(); + fail::cfg("mock_region_is_busy", "return(0)").unwrap(); + for _ in 0..100 { + let key_ranges = vec![ + build_key_range(b"a", b"l", false), + build_key_range(b"a", b"m", false), + ]; + check_split_key_range( + b"right-angle trapezoid with high CPU usage", + vec![gen_read_stats(1, key_ranges.clone())], + vec![gen_cpu_stats(1, key_ranges.clone(), vec![100, 200])], + b"a", + b"m", + ); + check_split_key_range( + b"right-angle trapezoid with high CPU usage", + vec![gen_read_stats(1, key_ranges.clone())], + vec![gen_cpu_stats(1, key_ranges, vec![200, 100])], + b"a", + b"l", + ); + + let key_ranges = vec![ + build_key_range(b"a", b"l", false), + build_key_range(b"b", b"l", false), + ]; + 
check_split_key_range( + b"right-angle trapezoid with high CPU usage", + vec![gen_read_stats(1, key_ranges.clone())], + vec![gen_cpu_stats(1, key_ranges.clone(), vec![100, 200])], + b"b", + b"l", + ); + check_split_key_range( + b"right-angle trapezoid with high CPU usage", + vec![gen_read_stats(1, key_ranges.clone())], + vec![gen_cpu_stats(1, key_ranges, vec![200, 100])], + b"a", + b"l", + ); + } + fail::remove("mock_grpc_poll_is_not_busy"); + fail::remove("mock_unified_read_pool_is_busy"); + fail::remove("mock_region_is_busy"); } - fn check_split(mode: &[u8], qps_stats: Vec, split_keys: Vec<&[u8]>) { + fn check_split_key(mode: &[u8], qps_stats: Vec, split_keys: Vec<&[u8]>) { + let mode = String::from_utf8(Vec::from(mode)).unwrap(); let mut hub = AutoSplitController::default(); hub.cfg.qps_threshold = 1; hub.cfg.sample_threshold = 0; for i in 0..10 { - let (_, split_infos) = hub.flush(qps_stats.clone()); - if (i + 1) % hub.cfg.detect_times == 0 { - assert_eq!( - split_infos.len(), - split_keys.len(), - "mode: {:?}", - String::from_utf8(Vec::from(mode)).unwrap() - ); - for obtain in &split_infos { - let mut equal = false; - for expect in &split_keys { - if obtain.split_key.cmp(&expect.to_vec()) == Ordering::Equal { - equal = true; - break; - } + let (_, split_infos) = + hub.flush(qps_stats.clone(), vec![], &ThreadInfoStatistics::default()); + if (i + 1) % hub.cfg.detect_times != 0 { + continue; + } + // Check the split key. 
+ assert_eq!(split_infos.len(), split_keys.len(), "mode: {:?}", mode); + for obtain in &split_infos { + let mut equal = false; + for expect in &split_keys { + if obtain.split_key.as_ref().unwrap().cmp(&expect.to_vec()) == Ordering::Equal { + equal = true; + break; } - assert!( - equal, - "mode: {:?}", - String::from_utf8(Vec::from(mode)).unwrap() - ); } + assert!(equal, "mode: {:?}", mode); } } } + fn check_split_key_range( + mode: &[u8], + qps_stats: Vec, + cpu_stats: Vec>, + start_key: &[u8], + end_key: &[u8], + ) { + let mode = String::from_utf8(Vec::from(mode)).unwrap(); + let mut hub = AutoSplitController::default(); + hub.cfg.qps_threshold = 1; + hub.cfg.sample_threshold = 0; + + for i in 0..10 { + let (_, split_infos) = hub.flush( + qps_stats.clone(), + cpu_stats.clone(), + &ThreadInfoStatistics::default(), + ); + if (i + 1) % hub.cfg.detect_times != 0 { + continue; + } + assert_eq!(split_infos.len(), 1, "mode: {:?}", mode); + // Check the split key range. + let split_info = &split_infos[0]; + assert!(split_info.split_key.is_none(), "mode: {:?}", mode); + assert_eq!( + split_info + .start_key + .as_ref() + .unwrap() + .cmp(&start_key.to_vec()), + Ordering::Equal, + "mode: {:?}", + mode + ); + assert_eq!( + split_info.end_key.as_ref().unwrap().cmp(&end_key.to_vec()), + Ordering::Equal, + "mode: {:?}", + mode + ); + } + } + + fn gen_cpu_stats( + region_id: u64, + key_ranges: Vec, + cpu_times: Vec, + ) -> Arc { + let mut raw_records = RawRecords::default(); + raw_records.duration = Duration::from_millis(100); + for (idx, key_range) in key_ranges.iter().enumerate() { + let key_range_tag = Arc::new(TagInfos { + store_id: 0, + region_id, + peer_id: 0, + key_ranges: vec![(key_range.start_key.clone(), key_range.end_key.clone())], + extra_attachment: vec![], + }); + raw_records.records.insert( + key_range_tag.clone(), + RawRecord { + cpu_time: cpu_times[idx], + read_keys: 0, + write_keys: 0, + }, + ); + } + Arc::new(raw_records) + } + #[test] fn test_sample_key_num() 
{ let mut hub = AutoSplitController::default(); @@ -913,7 +1316,7 @@ mod tests { ); } qps_stats_vec.push(qps_stats); - hub.flush(qps_stats_vec); + hub.flush(qps_stats_vec, vec![], &ThreadInfoStatistics::default()); } // Test the empty key ranges. @@ -926,7 +1329,7 @@ mod tests { qps_stats.add_query_num(1, &Peer::default(), KeyRange::default(), QueryKind::Get); } qps_stats_vec.push(qps_stats); - hub.flush(qps_stats_vec); + hub.flush(qps_stats_vec, vec![], &ThreadInfoStatistics::default()); } fn check_sample_length(key_ranges: Vec>) { @@ -1201,6 +1604,253 @@ mod tests { qps_stats } + #[test] + fn test_refresh_and_check_cfg() { + let split_config = SplitConfig::default(); + let mut split_cfg_manager = + SplitConfigManager::new(Arc::new(VersionTrack::new(split_config))); + let mut auto_split_controller = + AutoSplitController::new(split_cfg_manager.clone(), 0, 0, None); + assert_eq!( + auto_split_controller.refresh_and_check_cfg(), + SplitConfigChange::Noop, + ); + assert_eq!( + auto_split_controller + .cfg + .region_cpu_overload_threshold_ratio, + REGION_CPU_OVERLOAD_THRESHOLD_RATIO + ); + // Set to zero. + dispatch_split_cfg_change( + &mut split_cfg_manager, + "region_cpu_overload_threshold_ratio", + ConfigValue::F64(0.0), + ); + assert_eq!( + auto_split_controller.refresh_and_check_cfg(), + SplitConfigChange::UpdateRegionCpuCollector(false), + ); + assert_eq!( + auto_split_controller + .cfg + .region_cpu_overload_threshold_ratio, + 0.0 + ); + assert_eq!( + auto_split_controller.refresh_and_check_cfg(), + SplitConfigChange::Noop, + ); + // Set to non-zero. 
+ dispatch_split_cfg_change( + &mut split_cfg_manager, + "region_cpu_overload_threshold_ratio", + ConfigValue::F64(REGION_CPU_OVERLOAD_THRESHOLD_RATIO), + ); + assert_eq!( + auto_split_controller.refresh_and_check_cfg(), + SplitConfigChange::UpdateRegionCpuCollector(true), + ); + assert_eq!( + auto_split_controller + .cfg + .region_cpu_overload_threshold_ratio, + REGION_CPU_OVERLOAD_THRESHOLD_RATIO + ); + assert_eq!( + auto_split_controller.refresh_and_check_cfg(), + SplitConfigChange::Noop, + ); + } + + fn dispatch_split_cfg_change( + split_cfg_manager: &mut SplitConfigManager, + cfg_name: &str, + cfg_value: ConfigValue, + ) { + let mut config_change = ConfigChange::new(); + config_change.insert(String::from(cfg_name), cfg_value); + split_cfg_manager.dispatch(config_change).unwrap(); + } + + #[test] + fn test_collect_cpu_stats() { + let auto_split_controller = AutoSplitController::default(); + let region_cpu_map = auto_split_controller.collect_cpu_stats(vec![]); + assert!(region_cpu_map.is_empty()); + + let ab_key_range_tag = Arc::new(TagInfos { + store_id: 0, + region_id: 1, + peer_id: 0, + key_ranges: vec![(b"a".to_vec(), b"b".to_vec())], + extra_attachment: vec![], + }); + let cd_key_range_tag = Arc::new(TagInfos { + store_id: 0, + region_id: 1, + peer_id: 0, + key_ranges: vec![(b"c".to_vec(), b"d".to_vec())], + extra_attachment: vec![], + }); + let multiple_key_ranges_tag = Arc::new(TagInfos { + store_id: 0, + region_id: 1, + peer_id: 0, + key_ranges: vec![ + (b"a".to_vec(), b"b".to_vec()), + (b"c".to_vec(), b"d".to_vec()), + ], + extra_attachment: vec![], + }); + let empty_key_range_tag = Arc::new(TagInfos { + store_id: 0, + region_id: 1, + peer_id: 0, + key_ranges: vec![], + extra_attachment: vec![], + }); + + let test_cases = vec![ + (300, 150, 50, 50, Some(build_key_range(b"a", b"b", false))), + (150, 300, 50, 50, Some(build_key_range(b"c", b"d", false))), + (150, 50, 300, 50, Some(build_key_range(b"a", b"b", false))), + (50, 150, 300, 50, 
Some(build_key_range(b"c", b"d", false))), + (150, 50, 50, 300, Some(build_key_range(b"a", b"b", false))), + (100, 0, 0, 0, Some(build_key_range(b"a", b"b", false))), + (50, 0, 0, 50, Some(build_key_range(b"a", b"b", false))), + (50, 0, 0, 100, Some(build_key_range(b"a", b"b", false))), + (50, 0, 50, 0, Some(build_key_range(b"a", b"b", false))), + (0, 50, 50, 0, Some(build_key_range(b"c", b"d", false))), + (0, 0, 0, 100, None), + (0, 0, 0, 0, None), + ]; + for (i, test_case) in test_cases.iter().enumerate() { + let mut raw_records = RawRecords::default(); + raw_records.duration = Duration::from_millis(100); + // ["a", "b"] with (test_case.0)ms CPU time. + raw_records.records.insert( + ab_key_range_tag.clone(), + RawRecord { + cpu_time: test_case.0, + read_keys: 0, + write_keys: 0, + }, + ); + // ["c", "d"] with (test_case.1)ms CPU time. + raw_records.records.insert( + cd_key_range_tag.clone(), + RawRecord { + cpu_time: test_case.1, + read_keys: 0, + write_keys: 0, + }, + ); + // Multiple key ranges with (test_case.2)ms CPU time. + raw_records.records.insert( + multiple_key_ranges_tag.clone(), + RawRecord { + cpu_time: test_case.2, + read_keys: 0, + write_keys: 0, + }, + ); + // Empty key range with (test_case.3)ms CPU time. 
+ raw_records.records.insert( + empty_key_range_tag.clone(), + RawRecord { + cpu_time: test_case.3, + read_keys: 0, + write_keys: 0, + }, + ); + let region_cpu_map = + auto_split_controller.collect_cpu_stats(vec![Arc::new(raw_records)]); + assert_eq!( + region_cpu_map.len(), + 1, + "test_collect_cpu_stats case: {}", + i + ); + assert_eq!( + region_cpu_map.get(&1).unwrap().0, + (test_case.0 + test_case.1 + test_case.2 + test_case.3) as f64 / 100.0, + "test_collect_cpu_stats case: {}", + i + ); + assert_eq!( + region_cpu_map.get(&1).unwrap().1, + test_case.4, + "test_collect_cpu_stats case: {}", + i + ); + } + } + + #[test] + fn test_avg_grpc_thread_cpu_usage_calculation() { + let mut auto_split_controller = AutoSplitController::default(); + let detect_times = auto_split_controller.cfg.detect_times as f64; + for grpc_thread_usage in 1..=5 { + auto_split_controller.update_grpc_thread_usage(grpc_thread_usage as f64); + } + assert_eq!( + auto_split_controller.get_avg_grpc_thread_usage(), + [1.0, 2.0, 3.0, 4.0, 5.0].iter().sum::() / 5.0, + ); + for grpc_thread_usage in 6..=10 { + auto_split_controller.update_grpc_thread_usage(grpc_thread_usage as f64); + } + assert_eq!( + auto_split_controller.get_avg_grpc_thread_usage(), + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] + .iter() + .sum::() + / detect_times, + ); + for grpc_thread_usage in 11..=15 { + auto_split_controller.update_grpc_thread_usage(grpc_thread_usage as f64); + } + assert_eq!( + auto_split_controller.get_avg_grpc_thread_usage(), + [6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0] + .iter() + .sum::() + / detect_times, + ); + for grpc_thread_usage in 1..=10 { + auto_split_controller.update_grpc_thread_usage(grpc_thread_usage as f64); + } + assert_eq!( + auto_split_controller.get_avg_grpc_thread_usage(), + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] + .iter() + .sum::() + / detect_times, + ); + // Change the `detect_times` to a smaller value. 
+ auto_split_controller.cfg.detect_times = 5; + let detect_times = auto_split_controller.cfg.detect_times as f64; + auto_split_controller.update_grpc_thread_usage(11.0); + assert_eq!( + auto_split_controller.get_avg_grpc_thread_usage(), + [7.0, 8.0, 9.0, 10.0, 11.0].iter().sum::() / detect_times, + ); + // Change the `detect_times` to a bigger value. + auto_split_controller.cfg.detect_times = 6; + let detect_times = auto_split_controller.cfg.detect_times as f64; + auto_split_controller.update_grpc_thread_usage(12.0); + assert_eq!( + auto_split_controller.get_avg_grpc_thread_usage(), + [7.0, 8.0, 9.0, 10.0, 11.0, 12.0].iter().sum::() / detect_times, + ); + auto_split_controller.update_grpc_thread_usage(13.0); + assert_eq!( + auto_split_controller.get_avg_grpc_thread_usage(), + [8.0, 9.0, 10.0, 11.0, 12.0, 13.0].iter().sum::() / detect_times, + ); + } + #[bench] fn samples_evaluate(b: &mut test::Bencher) { let mut samples = Samples(vec![Sample::new(b"c")]); @@ -1218,7 +1868,11 @@ mod tests { } b.iter(|| { let mut hub = AutoSplitController::default(); - hub.flush(other_qps_stats.clone()); + hub.flush( + other_qps_stats.clone(), + vec![], + &ThreadInfoStatistics::default(), + ); }); } diff --git a/components/resolved_ts/Cargo.toml b/components/resolved_ts/Cargo.toml index e781fbc1f75..db3c0643cb7 100644 --- a/components/resolved_ts/Cargo.toml +++ b/components/resolved_ts/Cargo.toml @@ -23,40 +23,41 @@ test-engines-rocksdb = ["tikv/test-engines-rocksdb"] test-engines-panic = ["tikv/test-engines-panic"] [dependencies] -collections = { path = "../collections" } -concurrency_manager = { path = "../concurrency_manager", default-features = false } +collections = { workspace = true } +concurrency_manager = { workspace = true } crossbeam = "0.8" -engine_traits = { path = "../engine_traits", default-features = false } +engine_traits = { workspace = true } fail = "0.5" futures = "0.3" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored"] } 
+grpcio = { workspace = true } hex = "0.4" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" -log_wrappers = { path = "../log_wrappers" } -online_config = { path = "../online_config" } -pd_client = { path = "../pd_client", default-features = false } +log_wrappers = { workspace = true } +online_config = { workspace = true } +pd_client = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raftstore = { path = "../raftstore", default-features = false } -security = { path = "../security", default-features = false } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +raftstore = { workspace = true } +security = { workspace = true } +slog = { workspace = true } +slog-global = { workspace = true } thiserror = "1.0" -tikv = { path = "../../", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } +tikv = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread", "time"] } -txn_types = { path = "../txn_types", default-features = false } +txn_types = { workspace = true } [dev-dependencies] -engine_rocks = { path = "../engine_rocks", default-features = false } -panic_hook = { path = "../panic_hook" } +engine_rocks = { workspace = true } +panic_hook = { workspace = true } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } tempfile = "3.0" -test_raftstore = { path = "../test_raftstore", default-features = false } -test_util = { path = "../test_util", default-features = false } -tikv_kv = { path = "../tikv_kv" } +test_raftstore = { 
workspace = true } +test_sst_importer = { workspace = true } +test_util = { workspace = true } +tikv_kv = { workspace = true } [[test]] name = "integrations" diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index ddc52443cec..65d1c1139c6 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -1,10 +1,11 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + cmp, ffi::CString, sync::{ atomic::{AtomicI32, Ordering}, - Arc, Mutex as StdMutex, + Arc, }, time::Duration, }; @@ -14,96 +15,99 @@ use concurrency_manager::ConcurrencyManager; use engine_traits::KvEngine; use fail::fail_point; use futures::{compat::Future01CompatExt, future::select_all, FutureExt, TryFutureExt}; -use grpcio::{ChannelBuilder, Environment}; +use grpcio::{ChannelBuilder, Environment, Error as GrpcError, RpcStatusCode}; use kvproto::{ - kvrpcpb::{CheckLeaderRequest, LeaderInfo}, + kvrpcpb::{CheckLeaderRequest, CheckLeaderResponse}, metapb::{Peer, PeerRole}, tikvpb::TikvClient, }; use pd_client::PdClient; use protobuf::Message; -use raftstore::store::{fsm::StoreMeta, util::RegionReadProgressRegistry}; +use raftstore::{ + router::CdcHandle, + store::{msg::Callback, util::RegionReadProgressRegistry}, +}; use security::SecurityManager; -use tikv_util::{info, time::Instant, timer::SteadyTimer, worker::Scheduler}; +use tikv_util::{ + info, + sys::thread::ThreadBuildWrapper, + time::{Instant, SlowTimer}, + timer::SteadyTimer, + worker::Scheduler, +}; use tokio::{ runtime::{Builder, Runtime}, - sync::Mutex, + sync::{Mutex, Notify}, }; use txn_types::TimeStamp; -use crate::{endpoint::Task, metrics::*, util}; +use crate::{endpoint::Task, metrics::*}; -const DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS: u64 = 5_000; // 5s +const DEFAULT_CHECK_LEADER_TIMEOUT_DURATION: Duration = Duration::from_secs(5); // 5s -pub struct AdvanceTsWorker { - store_meta: Arc>, - region_read_progress: 
RegionReadProgressRegistry, +pub struct AdvanceTsWorker { pd_client: Arc, + advance_ts_interval: Duration, timer: SteadyTimer, worker: Runtime, - scheduler: Scheduler>, - /// The concurrency manager for transactions. It's needed for CDC to check locks when - /// calculating resolved_ts. + scheduler: Scheduler, + /// The concurrency manager for transactions. It's needed for CDC to check + /// locks when calculating resolved_ts. concurrency_manager: ConcurrencyManager, - // store_id -> client - tikv_clients: Arc>>, - env: Arc, - security_mgr: Arc, } -impl AdvanceTsWorker { +impl AdvanceTsWorker { pub fn new( + advance_ts_interval: Duration, pd_client: Arc, - scheduler: Scheduler>, - store_meta: Arc>, - region_read_progress: RegionReadProgressRegistry, + scheduler: Scheduler, concurrency_manager: ConcurrencyManager, - env: Arc, - security_mgr: Arc, ) -> Self { let worker = Builder::new_multi_thread() .thread_name("advance-ts") .worker_threads(1) .enable_time() + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) .build() .unwrap(); Self { - env, - security_mgr, scheduler, pd_client, worker, + advance_ts_interval, timer: SteadyTimer::default(), - store_meta, - region_read_progress, concurrency_manager, - tikv_clients: Arc::new(Mutex::new(HashMap::default())), } } } -impl AdvanceTsWorker { - pub fn advance_ts_for_regions(&self, regions: Vec) { - if regions.is_empty() { - return; - } +impl AdvanceTsWorker { + // Advance ts asynchronously and register RegisterAdvanceEvent when its done. 
+ pub fn advance_ts_for_regions( + &self, + regions: Vec, + mut leader_resolver: LeadershipResolver, + advance_ts_interval: Duration, + advance_notify: Arc, + ) { + let cm = self.concurrency_manager.clone(); let pd_client = self.pd_client.clone(); let scheduler = self.scheduler.clone(); - let cm: ConcurrencyManager = self.concurrency_manager.clone(); - let env = self.env.clone(); - let security_mgr = self.security_mgr.clone(); - let store_meta = self.store_meta.clone(); - let tikv_clients = self.tikv_clients.clone(); - let region_read_progress = self.region_read_progress.clone(); + let timeout = self.timer.delay(advance_ts_interval); + let min_timeout = self.timer.delay(cmp::min( + DEFAULT_CHECK_LEADER_TIMEOUT_DURATION, + self.advance_ts_interval, + )); let fut = async move { - // Ignore get tso errors since we will retry every `advance_ts_interval`. + // Ignore get tso errors since we will retry every `advdance_ts_interval`. let mut min_ts = pd_client.get_tso().await.unwrap_or_default(); - // Sync with concurrency manager so that it can work correctly when optimizations - // like async commit is enabled. - // Note: This step must be done before scheduling `Task::MinTS` task, and the - // resolver must be checked in or after `Task::MinTS`' execution. + // Sync with concurrency manager so that it can work correctly when + // optimizations like async commit is enabled. + // Note: This step must be done before scheduling `Task::MinTs` task, and the + // resolver must be checked in or after `Task::MinTs`' execution. 
cm.update_max_ts(min_ts); if let Some(min_mem_lock_ts) = cm.global_min_lock_ts() { if min_mem_lock_ts < min_ts { @@ -111,202 +115,338 @@ impl AdvanceTsWorker { } } - let regions = region_resolved_ts_store( - regions, - store_meta, - region_read_progress, - pd_client, - security_mgr, - env, - tikv_clients, - min_ts, - ) - .await; - + let regions = leader_resolver.resolve(regions, min_ts).await; if !regions.is_empty() { - if let Err(e) = scheduler.schedule(Task::AdvanceResolvedTs { + if let Err(e) = scheduler.schedule(Task::ResolvedTsAdvanced { regions, ts: min_ts, }) { info!("failed to schedule advance event"; "err" => ?e); } } - }; - self.worker.spawn(fut); - } - pub fn register_next_event(&self, advance_ts_interval: Duration, cfg_version: usize) { - let scheduler = self.scheduler.clone(); - let timeout = self.timer.delay(advance_ts_interval); - let fut = async move { - let _ = timeout.compat().await; - if let Err(e) = scheduler.schedule(Task::RegisterAdvanceEvent { cfg_version }) { - info!("failed to schedule register advance event"; "err" => ?e); + futures::select! { + _ = timeout.compat().fuse() => (), + // Skip wait timeout if a notify is arrived. + _ = advance_notify.notified().fuse() => (), + }; + // Wait min timeout to prevent from overloading advancing resolved ts. + let _ = min_timeout.compat().await; + + // NB: We must schedule the leader resolver even if there is no region, + // otherwise we can not advance resolved ts next time. + if let Err(e) = scheduler.schedule(Task::AdvanceResolvedTs { leader_resolver }) { + error!("failed to schedule register advance event"; "err" => ?e); } }; self.worker.spawn(fut); } } -// Confirms leadership of region peer before trying to advance resolved ts. -// This function broadcasts a special message to all stores, gets the leader id of them to confirm whether -// current peer has a quorum which accepts its leadership. 
-pub async fn region_resolved_ts_store( - regions: Vec, - store_meta: Arc>, - region_read_progress: RegionReadProgressRegistry, +pub struct LeadershipResolver { + tikv_clients: Mutex>, pd_client: Arc, - security_mgr: Arc, env: Arc, - tikv_clients: Arc>>, - min_ts: TimeStamp, -) -> Vec { - PENDING_RTS_COUNT.inc(); - defer!(PENDING_RTS_COUNT.dec()); - fail_point!("before_sync_replica_read_state", |_| regions.clone()); - - let store_id = match store_meta.lock().unwrap().store_id { - Some(id) => id, - None => return vec![], - }; + security_mgr: Arc, + region_read_progress: RegionReadProgressRegistry, + store_id: u64, - // store_id -> leaders info, record the request to each stores - let mut store_map: HashMap> = HashMap::default(); - // region_id -> region, cache the information of regions - let mut region_map: HashMap> = HashMap::default(); - // region_id -> peers id, record the responses - let mut resp_map: HashMap> = HashMap::default(); - // region_id -> `(Vec, LeaderInfo)` - let info_map = region_read_progress.dump_leader_infos(®ions); - let mut valid_regions = HashSet::default(); - - for (region_id, (peer_list, leader_info)) in info_map { - let leader_id = leader_info.get_peer_id(); - // Check if the leader in this store - if util::find_store_id(&peer_list, leader_id) != Some(store_id) { - continue; + // store_id -> check leader request, record the request to each stores. + store_req_map: HashMap, + // region_id -> region, cache the information of regions. + region_map: HashMap>, + // region_id -> peers id, record the responses. 
+ resp_map: HashMap>, + checking_regions: HashSet, + valid_regions: HashSet, + + gc_interval: Duration, + last_gc_time: Instant, +} + +impl LeadershipResolver { + pub fn new( + store_id: u64, + pd_client: Arc, + env: Arc, + security_mgr: Arc, + region_read_progress: RegionReadProgressRegistry, + gc_interval: Duration, + ) -> LeadershipResolver { + LeadershipResolver { + tikv_clients: Mutex::default(), + store_id, + pd_client, + env, + security_mgr, + region_read_progress, + + store_req_map: HashMap::default(), + region_map: HashMap::default(), + resp_map: HashMap::default(), + valid_regions: HashSet::default(), + checking_regions: HashSet::default(), + last_gc_time: Instant::now_coarse(), + gc_interval, } - let mut unvotes = 0; - for peer in &peer_list { - if peer.store_id == store_id && peer.id == leader_id { - resp_map.entry(region_id).or_default().push(store_id); - } else { - // It's still necessary to check leader on learners even if they don't vote - // because performing stale read on learners require it. - store_map - .entry(peer.store_id) - .or_default() - .push(leader_info.clone()); - if peer.get_role() != PeerRole::Learner { - unvotes += 1; - } - } + } + + fn gc(&mut self) { + let now = Instant::now_coarse(); + if now - self.last_gc_time > self.gc_interval { + self.store_req_map = HashMap::default(); + self.region_map = HashMap::default(); + self.resp_map = HashMap::default(); + self.valid_regions = HashSet::default(); + self.checking_regions = HashSet::default(); + self.last_gc_time = now; + } + } + + fn clear(&mut self) { + for v in self.store_req_map.values_mut() { + v.regions.clear(); + v.ts = 0; + } + for v in self.region_map.values_mut() { + v.clear(); } - // Check `region_has_quorum` here because `store_map` can be empty, - // in which case `region_has_quorum` won't be called any more. 
- if unvotes == 0 && region_has_quorum(&peer_list, &resp_map[®ion_id]) { - valid_regions.insert(region_id); - } else { - region_map.insert(region_id, peer_list); + for v in self.resp_map.values_mut() { + v.clear(); } + self.checking_regions.clear(); + self.valid_regions.clear(); } - // Approximate `LeaderInfo` size - let leader_info_size = store_map - .values() - .next() - .map_or(0, |regions| regions[0].compute_size()); - let store_count = store_map.len(); - let mut stores: Vec<_> = store_map - .into_iter() - .map(|(to_store, regions)| { - let tikv_clients = tikv_clients.clone(); + + // Confirms leadership of region peer before trying to advance resolved ts. + // This function broadcasts a special message to all stores, gets the leader id + // of them to confirm whether current peer has a quorum which accepts its + // leadership. + pub async fn resolve(&mut self, regions: Vec, min_ts: TimeStamp) -> Vec { + if regions.is_empty() { + return regions; + } + + // Clear previous result before resolving. + self.clear(); + // GC when necessary to prevent memory leak. 
+ self.gc(); + + PENDING_RTS_COUNT.inc(); + defer!(PENDING_RTS_COUNT.dec()); + fail_point!("before_sync_replica_read_state", |_| regions.clone()); + + let store_id = self.store_id; + let valid_regions = &mut self.valid_regions; + let region_map = &mut self.region_map; + let resp_map = &mut self.resp_map; + let store_req_map = &mut self.store_req_map; + let checking_regions = &mut self.checking_regions; + for region_id in ®ions { + checking_regions.insert(*region_id); + } + self.region_read_progress.with(|registry| { + for (region_id, read_progress) in registry { + if !checking_regions.contains(region_id) { + continue; + } + let core = read_progress.get_core(); + let local_leader_info = core.get_local_leader_info(); + let leader_id = local_leader_info.get_leader_id(); + let leader_store_id = local_leader_info.get_leader_store_id(); + let peer_list = local_leader_info.get_peers(); + // Check if the leader in this store + if leader_store_id != Some(store_id) { + continue; + } + let leader_info = core.get_leader_info(); + + let mut unvotes = 0; + for peer in peer_list { + if peer.store_id == store_id && peer.id == leader_id { + resp_map + .entry(*region_id) + .or_insert_with(|| Vec::with_capacity(peer_list.len())) + .push(store_id); + } else { + // It's still necessary to check leader on learners even if they don't vote + // because performing stale read on learners require it. + store_req_map + .entry(peer.store_id) + .or_insert_with(|| { + let mut req = CheckLeaderRequest::default(); + req.regions = Vec::with_capacity(registry.len()).into(); + req + }) + .regions + .push(leader_info.clone()); + if peer.get_role() != PeerRole::Learner { + unvotes += 1; + } + } + } + // Check `region_has_quorum` here because `store_map` can be empty, + // in which case `region_has_quorum` won't be called any more. 
+ if unvotes == 0 && region_has_quorum(peer_list, &resp_map[region_id]) { + valid_regions.insert(*region_id); + } else { + region_map + .entry(*region_id) + .or_insert_with(|| Vec::with_capacity(peer_list.len())) + .extend_from_slice(peer_list); + } + } + }); + + let env = &self.env; + let pd_client = &self.pd_client; + let security_mgr = &self.security_mgr; + let tikv_clients = &self.tikv_clients; + // Approximate `LeaderInfo` size + let leader_info_size = store_req_map + .values() + .find(|req| !req.regions.is_empty()) + .map_or(0, |req| req.regions[0].compute_size()); + let store_count = store_req_map.len(); + let mut check_leader_rpcs = Vec::with_capacity(store_req_map.len()); + for (store_id, req) in store_req_map { + if req.regions.is_empty() { + continue; + } let env = env.clone(); - let pd_client = pd_client.clone(); - let security_mgr = security_mgr.clone(); - let region_num = regions.len() as u32; + let to_store = *store_id; + let region_num = req.regions.len() as u32; CHECK_LEADER_REQ_SIZE_HISTOGRAM.observe((leader_info_size * region_num) as f64); CHECK_LEADER_REQ_ITEM_COUNT_HISTOGRAM.observe(region_num as f64); // Check leadership for `regions` on `to_store`. - async move { + let rpc = async move { PENDING_CHECK_LEADER_REQ_COUNT.inc(); defer!(PENDING_CHECK_LEADER_REQ_COUNT.dec()); - let client = - get_tikv_client(to_store, pd_client, security_mgr, env, tikv_clients.clone()) - .await - .map_err(|e| { - (to_store, e.retryable(), format!("[get tikv client] {}", e)) - })?; - - let mut req = CheckLeaderRequest::default(); - req.set_regions(regions.into()); + let client = get_tikv_client(to_store, pd_client, security_mgr, env, tikv_clients) + .await + .map_err(|e| (to_store, e.retryable(), format!("[get tikv client] {}", e)))?; + + // Set min_ts in the request. 
req.set_ts(min_ts.into_inner()); - let start = Instant::now_coarse(); + let slow_timer = SlowTimer::default(); defer!({ - let elapsed = start.saturating_elapsed(); slow_log!( - elapsed, + T + slow_timer, "check leader rpc costs too long, to_store: {}", to_store ); + let elapsed = slow_timer.saturating_elapsed(); RTS_CHECK_LEADER_DURATION_HISTOGRAM_VEC .with_label_values(&["rpc"]) .observe(elapsed.as_secs_f64()); }); - let rpc = client - .check_leader_async(&req) - .map_err(|e| (to_store, true, format!("[rpc create failed]{}", e)))?; + let rpc = match client.check_leader_async(req) { + Ok(rpc) => rpc, + Err(GrpcError::RpcFailure(status)) + if status.code() == RpcStatusCode::UNIMPLEMENTED => + { + // Some stores like TiFlash don't implement it. + return Ok((to_store, CheckLeaderResponse::default())); + } + Err(e) => return Err((to_store, true, format!("[rpc create failed]{}", e))), + }; + PENDING_CHECK_LEADER_REQ_SENT_COUNT.inc(); defer!(PENDING_CHECK_LEADER_REQ_SENT_COUNT.dec()); - let timeout = Duration::from_millis(DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS); + let timeout = DEFAULT_CHECK_LEADER_TIMEOUT_DURATION; let resp = tokio::time::timeout(timeout, rpc) .map_err(|e| (to_store, true, format!("[timeout] {}", e))) .await? .map_err(|e| (to_store, true, format!("[rpc failed] {}", e)))?; Ok((to_store, resp)) } - .boxed() - }) - .collect(); - let start = Instant::now_coarse(); + .boxed(); + check_leader_rpcs.push(rpc); + } + let start = Instant::now_coarse(); - defer!({ - RTS_CHECK_LEADER_DURATION_HISTOGRAM_VEC - .with_label_values(&["all"]) - .observe(start.saturating_elapsed_secs()); - }); - for _ in 0..store_count { - // Use `select_all` to avoid the process getting blocked when some TiKVs were down. 
- let (res, _, remains) = select_all(stores).await; - stores = remains; - match res { - Ok((to_store, resp)) => { - for region_id in resp.regions { - if let Some(r) = region_map.get(®ion_id) { - let resps = resp_map.entry(region_id).or_default(); - resps.push(to_store); - if region_has_quorum(r, resps) { - valid_regions.insert(region_id); - } + defer!({ + RTS_CHECK_LEADER_DURATION_HISTOGRAM_VEC + .with_label_values(&["all"]) + .observe(start.saturating_elapsed_secs()); + }); + let rpc_count = check_leader_rpcs.len(); + for _ in 0..rpc_count { + // Use `select_all` to avoid the process getting blocked when some + // TiKVs were down. + let (res, _, remains) = select_all(check_leader_rpcs).await; + check_leader_rpcs = remains; + match res { + Ok((to_store, resp)) => { + for region_id in resp.regions { + resp_map + .entry(region_id) + .or_insert_with(|| Vec::with_capacity(store_count)) + .push(to_store); } } - } - Err((to_store, reconnect, err)) => { - info!("check leader failed"; "error" => ?err, "to_store" => to_store); - if reconnect { - tikv_clients.lock().await.remove(&to_store); + Err((to_store, reconnect, err)) => { + info!("check leader failed"; "error" => ?err, "to_store" => to_store); + if reconnect { + self.tikv_clients.lock().await.remove(&to_store); + } } } } - // Return early if all regions had already got quorum. - if valid_regions.len() == regions.len() { - // break here because all regions have quorum, - // so there is no need waiting for other stores to respond. - break; + for (region_id, prs) in region_map { + if prs.is_empty() { + // The peer had the leadership before, but now it's no longer + // the case. Skip checking the region. + continue; + } + if let Some(resp) = resp_map.get(region_id) { + if resp.is_empty() { + // No response, maybe the peer lost leadership. 
+ continue; + } + if region_has_quorum(prs, resp) { + valid_regions.insert(*region_id); + } + } } + self.valid_regions.drain().collect() } - valid_regions.into_iter().collect() +} + +pub async fn resolve_by_raft(regions: Vec, min_ts: TimeStamp, cdc_handle: T) -> Vec +where + T: 'static + CdcHandle, + E: KvEngine, +{ + let mut reqs = Vec::with_capacity(regions.len()); + for region_id in regions { + let cdc_handle_clone = cdc_handle.clone(); + let req = async move { + let (tx, rx) = tokio::sync::oneshot::channel(); + let callback = Callback::read(Box::new(move |resp| { + let resp = if resp.response.get_header().has_error() { + None + } else { + Some(region_id) + }; + if tx.send(resp).is_err() { + error!("cdc send tso response failed"; "region_id" => region_id); + } + })); + if let Err(e) = cdc_handle_clone.check_leadership(region_id, callback) { + warn!("cdc send LeaderCallback failed"; "err" => ?e, "min_ts" => min_ts); + return None; + } + rx.await.unwrap_or(None) + }; + reqs.push(req); + } + + let resps = futures::future::join_all(reqs).await; + resps.into_iter().flatten().collect::>() } fn region_has_quorum(peers: &[Peer], stores: &[u64]) -> bool { @@ -361,10 +501,10 @@ static CONN_ID: AtomicI32 = AtomicI32::new(0); async fn get_tikv_client( store_id: u64, - pd_client: Arc, - security_mgr: Arc, + pd_client: &Arc, + security_mgr: &SecurityManager, env: Arc, - tikv_clients: Arc>>, + tikv_clients: &Mutex>, ) -> pd_client::Result { { let clients = tikv_clients.lock().await; @@ -372,7 +512,7 @@ async fn get_tikv_client( return Ok(client); } } - let timeout = Duration::from_millis(DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS); + let timeout = DEFAULT_CHECK_LEADER_TIMEOUT_DURATION; let store = tokio::time::timeout(timeout, pd_client.get_store_async(store_id)) .await .map_err(|e| pd_client::Error::Other(Box::new(e))) @@ -384,9 +524,118 @@ async fn get_tikv_client( CString::new("random id").unwrap(), CONN_ID.fetch_add(1, Ordering::SeqCst), ); - let channel = 
security_mgr.connect(cb, &store.address); + let channel = security_mgr.connect(cb, &store.peer_address); let cli = TikvClient::new(channel); clients.insert(store_id, cli.clone()); RTS_TIKV_CLIENT_INIT_DURATION_HISTOGRAM.observe(start.saturating_elapsed_secs()); Ok(cli) } + +#[cfg(test)] +mod tests { + use std::{ + sync::{ + mpsc::{channel, Receiver, Sender}, + Arc, + }, + time::Duration, + }; + + use grpcio::{self, ChannelBuilder, EnvBuilder, Server, ServerBuilder}; + use kvproto::{metapb::Region, tikvpb::Tikv, tikvpb_grpc::create_tikv}; + use pd_client::PdClient; + use raftstore::store::util::RegionReadProgress; + use tikv_util::store::new_peer; + + use super::*; + + #[derive(Clone)] + struct MockTikv { + req_tx: Sender, + } + + impl Tikv for MockTikv { + fn check_leader( + &mut self, + ctx: grpcio::RpcContext<'_>, + req: CheckLeaderRequest, + sink: ::grpcio::UnarySink, + ) { + self.req_tx.send(req).unwrap(); + ctx.spawn(async { + sink.success(CheckLeaderResponse::default()).await.unwrap(); + }) + } + } + + struct MockPdClient {} + impl PdClient for MockPdClient {} + + fn new_rpc_suite(env: Arc) -> (Server, TikvClient, Receiver) { + let (tx, rx) = channel(); + let tikv_service = MockTikv { req_tx: tx }; + let builder = ServerBuilder::new(env.clone()).register_service(create_tikv(tikv_service)); + let mut server = builder.bind("127.0.0.1", 0).build().unwrap(); + server.start(); + let (_, port) = server.bind_addrs().next().unwrap(); + let addr = format!("127.0.0.1:{}", port); + let channel = ChannelBuilder::new(env).connect(&addr); + let client = TikvClient::new(channel); + (server, client, rx) + } + + #[tokio::test] + async fn test_resolve_leader_request_size() { + let env = Arc::new(EnvBuilder::new().build()); + let (mut server, tikv_client, rx) = new_rpc_suite(env.clone()); + + let mut region1 = Region::default(); + region1.id = 1; + region1.peers.push(new_peer(1, 1)); + region1.peers.push(new_peer(2, 11)); + let progress1 = RegionReadProgress::new(®ion1, 1, 1, 
1); + progress1.update_leader_info(1, 1, ®ion1); + + let mut region2 = Region::default(); + region2.id = 2; + region2.peers.push(new_peer(1, 2)); + region2.peers.push(new_peer(2, 22)); + let progress2 = RegionReadProgress::new(®ion2, 1, 1, 2); + progress2.update_leader_info(2, 2, ®ion2); + + let mut leader_resolver = LeadershipResolver::new( + 1, // store id + Arc::new(MockPdClient {}), + env.clone(), + Arc::new(SecurityManager::default()), + RegionReadProgressRegistry::new(), + Duration::from_secs(1), + ); + leader_resolver + .tikv_clients + .lock() + .await + .insert(2 /* store id */, tikv_client); + leader_resolver + .region_read_progress + .insert(1, Arc::new(progress1)); + leader_resolver + .region_read_progress + .insert(2, Arc::new(progress2)); + + leader_resolver.resolve(vec![1, 2], TimeStamp::new(1)).await; + let req = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + assert_eq!(req.regions.len(), 2); + + // Checking one region only send 1 region in request. + leader_resolver.resolve(vec![1], TimeStamp::new(1)).await; + let req = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + assert_eq!(req.regions.len(), 1); + + // Checking zero region does not send request. 
+ leader_resolver.resolve(vec![], TimeStamp::new(1)).await; + rx.recv_timeout(Duration::from_secs(1)).unwrap_err(); + + let _ = server.shutdown().await; + } +} diff --git a/components/resolved_ts/src/cmd.rs b/components/resolved_ts/src/cmd.rs index 8d1cd6e2a90..47d14304112 100644 --- a/components/resolved_ts/src/cmd.rs +++ b/components/resolved_ts/src/cmd.rs @@ -33,6 +33,7 @@ pub enum ChangeRow { commit_ts: TimeStamp, value: Option, }, + IngestSsT, } #[allow(clippy::large_enum_variant)] @@ -49,6 +50,7 @@ impl ChangeLog { .map(|cmd| { let Cmd { index, + term: _, mut request, mut response, } = cmd; @@ -57,8 +59,11 @@ impl ChangeLog { let flags = WriteBatchFlags::from_bits_truncate(request.get_header().get_flags()); let is_one_pc = flags.contains(WriteBatchFlags::ONE_PC); - let changes = group_row_changes(request.requests.into()); - let rows = Self::encode_rows(changes, is_one_pc); + let (changes, has_ingest_sst) = group_row_changes(request.requests.into()); + let mut rows = Self::encode_rows(changes, is_one_pc); + if has_ingest_sst { + rows.push(ChangeRow::IngestSsT); + } ChangeLog::Rows { index, rows } } else { ChangeLog::Admin(request.take_admin_request().get_cmd_type()) @@ -134,7 +139,8 @@ impl ChangeLog { pub(crate) fn decode_write(key: &[u8], value: &[u8], is_apply: bool) -> Option { let write = WriteRef::parse(value).ok()?.to_owned(); - // Drop the record it self but keep only the overlapped rollback information if gc_fence exists. + // Drop the record it self but keep only the overlapped rollback information if + // gc_fence exists. if is_apply && write.gc_fence.is_some() { // `gc_fence` is set means the write record has been rewritten. // Currently the only case is writing overlapped_rollback. 
And in this case @@ -188,12 +194,17 @@ struct RowChange { default: Option, } -fn group_row_changes(requests: Vec) -> HashMap { +fn group_row_changes(requests: Vec) -> (HashMap, bool) { let mut changes: HashMap = HashMap::default(); - // The changes about default cf was recorded here and need to be matched with a `write` or a `lock`. + // The changes about default cf was recorded here and need to be matched with a + // `write` or a `lock`. let mut unmatched_default = HashMap::default(); + let mut has_ingest_sst = false; for mut req in requests { match req.get_cmd_type() { + CmdType::IngestSst => { + has_ingest_sst = true; + } CmdType::Put => { let mut put = req.take_put(); let key = Key::from_encoded(put.take_key()); @@ -250,11 +261,11 @@ fn group_row_changes(requests: Vec) -> HashMap { row.default = Some(default); } } - changes + (changes, has_ingest_sst) } -/// Filter non-lock related data (i.e `default_cf` data), the implement is subject to -/// how `group_row_changes` and `encode_rows` encode `ChangeRow` +/// Filter non-lock related data (i.e `default_cf` data), the implement is +/// subject to how `group_row_changes` and `encode_rows` encode `ChangeRow` pub fn lock_only_filter(mut cmd_batch: CmdBatch) -> Option { if cmd_batch.is_empty() { return None; @@ -271,7 +282,7 @@ pub fn lock_only_filter(mut cmd_batch: CmdBatch) -> Option { CmdType::Delete => req.get_delete().cf.as_str(), _ => "", }; - cf == CF_LOCK || cf == CF_WRITE + cf == CF_LOCK || cf == CF_WRITE || req.get_cmd_type() == CmdType::IngestSst }); cmd.request.set_requests(requests.into()); } @@ -283,13 +294,15 @@ pub fn lock_only_filter(mut cmd_batch: CmdBatch) -> Option { #[cfg(test)] mod tests { use concurrency_manager::ConcurrencyManager; - use kvproto::kvrpcpb::AssertionLevel; + use kvproto::{ + kvrpcpb::{AssertionLevel, PrewriteRequestPessimisticAction::*}, + raft_cmdpb::{CmdType, Request}, + }; use tikv::storage::{ kv::{MockEngineBuilder, TestEngineBuilder}, - lock_manager::DummyLockManager, 
mvcc::{tests::write, Mutation, MvccTxn, SnapshotReader}, txn::{ - commands::one_pc_commit_ts, prewrite, tests::*, CommitKind, TransactionKind, + commands::one_pc_commit, prewrite, tests::*, CommitKind, TransactionKind, TransactionProperties, }, Engine, @@ -302,30 +315,37 @@ mod tests { #[test] fn test_cmd_encode() { let rocks_engine = TestEngineBuilder::new().build().unwrap(); - let engine = MockEngineBuilder::from_rocks_engine(rocks_engine).build(); + let mut engine = MockEngineBuilder::from_rocks_engine(rocks_engine).build(); - let reqs = vec![Modify::Put("default", Key::from_raw(b"k1"), b"v1".to_vec()).into()]; - assert!(ChangeLog::encode_rows(group_row_changes(reqs), false).is_empty()); + let mut reqs = vec![Modify::Put("default", Key::from_raw(b"k1"), b"v1".to_vec()).into()]; + let mut req = Request::default(); + req.set_cmd_type(CmdType::IngestSst); + reqs.push(req); + let (changes, has_ingest_sst) = group_row_changes(reqs); + assert_eq!(has_ingest_sst, true); + assert!(ChangeLog::encode_rows(changes, false).is_empty()); - must_prewrite_put(&engine, b"k1", b"v1", b"k1", 1); - must_commit(&engine, b"k1", 1, 2); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 1); + must_commit(&mut engine, b"k1", 1, 2); - must_prewrite_put(&engine, b"k1", b"v2", b"k1", 3); - must_rollback(&engine, b"k1", 3, false); + must_prewrite_put(&mut engine, b"k1", b"v2", b"k1", 3); + must_rollback(&mut engine, b"k1", 3, false); - must_prewrite_put(&engine, b"k1", &[b'v'; 512], b"k1", 4); - must_commit(&engine, b"k1", 4, 5); + must_prewrite_put(&mut engine, b"k1", &[b'v'; 512], b"k1", 4); + must_commit(&mut engine, b"k1", 4, 5); - must_prewrite_put(&engine, b"k1", b"v3", b"k1", 5); - must_rollback(&engine, b"k1", 5, false); + must_prewrite_put(&mut engine, b"k1", b"v3", b"k1", 5); + must_rollback(&mut engine, b"k1", 5, false); let k1 = Key::from_raw(b"k1"); let rows: Vec<_> = engine .take_last_modifies() .into_iter() .flat_map(|m| { - let reqs = 
m.into_iter().map(Into::into).collect(); - ChangeLog::encode_rows(group_row_changes(reqs), false) + let reqs: Vec = m.into_iter().map(Into::into).collect(); + let (changes, has_ingest_sst) = group_row_changes(reqs); + assert_eq!(has_ingest_sst, false); + ChangeLog::encode_rows(changes, false) }) .collect(); @@ -399,20 +419,24 @@ mod tests { need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }, Mutation::make_put(k1.clone(), b"v4".to_vec()), &None, - false, + SkipPessimisticCheck, + None, ) .unwrap(); - one_pc_commit_ts(true, &mut txn, 10.into(), &DummyLockManager); + one_pc_commit(true, &mut txn, 10.into()); write(&engine, &Default::default(), txn.into_modifies()); let one_pc_row = engine .take_last_modifies() .into_iter() .flat_map(|m| { let reqs = m.into_iter().map(Into::into).collect(); - ChangeLog::encode_rows(group_row_changes(reqs), true) + let (changes, has_ingest_sst) = group_row_changes(reqs); + assert_eq!(has_ingest_sst, false); + ChangeLog::encode_rows(changes, true) }) .last() .unwrap(); diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 06fcb8c6860..23be4a62fc5 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -12,32 +12,34 @@ use std::{ }; use concurrency_manager::ConcurrencyManager; -use engine_traits::{KvEngine, Snapshot}; +use engine_traits::KvEngine; use grpcio::Environment; use kvproto::{metapb::Region, raft_cmdpb::AdminCmdType}; use online_config::{self, ConfigChange, ConfigManager, OnlineConfig}; use pd_client::PdClient; use raftstore::{ - coprocessor::{CmdBatch, ObserveHandle, ObserveID}, - router::RaftStoreRouter, + coprocessor::{CmdBatch, ObserveHandle, ObserveId}, + router::CdcHandle, store::{ - fsm::StoreMeta, + fsm::store::StoreRegionMeta, util::{self, RegionReadProgress, RegionReadProgressRegistry}, - RegionSnapshot, }, }; use security::SecurityManager; use tikv::config::ResolvedTsConfig; 
-use tikv_util::worker::{Runnable, RunnableWithTimer, Scheduler}; +use tikv_util::{ + warn, + worker::{Runnable, RunnableWithTimer, Scheduler}, +}; +use tokio::sync::Notify; use txn_types::{Key, TimeStamp}; use crate::{ - advance::AdvanceTsWorker, + advance::{AdvanceTsWorker, LeadershipResolver}, cmd::{ChangeLog, ChangeRow}, metrics::*, resolver::Resolver, scanner::{ScanEntry, ScanMode, ScanTask, ScannerPool}, - sinker::{CmdSinker, SinkCmd}, }; enum ResolverStatus { @@ -63,8 +65,8 @@ enum PendingLock { } // Records information related to observed region. -// observe_id is used for avoiding ABA problems in incremental scan task, advance resolved ts task, -// and command observing. +// observe_id is used for avoiding ABA problems in incremental scan task, +// advance resolved ts task, and command observing. struct ObserveRegion { meta: Region, handle: ObserveHandle, @@ -88,6 +90,10 @@ impl ObserveRegion { } } + fn read_progress(&self) -> &RegionReadProgress { + self.resolver.read_progress.as_ref().unwrap() + } + fn track_change_log(&mut self, change_logs: &[ChangeLog]) -> std::result::Result<(), String> { match &mut self.resolver_status { ResolverStatus::Pending { @@ -106,8 +112,9 @@ impl ObserveRegion { continue; } ChangeLog::Admin(req_type) => { - // TODO: for admin cmd that won't change the region meta like peer list and key range - // (i.e. `CompactLog`, `ComputeHash`) we may not need to return error + // TODO: for admin cmd that won't change the region meta like peer list + // and key range (i.e. `CompactLog`, `ComputeHash`) we may not need to + // return error return Err(format!( "region met admin command {:?} while initializing resolver", req_type @@ -133,6 +140,7 @@ impl ObserveRegion { }), // One pc command do not contains any lock, so just skip it ChangeRow::OnePc { .. 
} => {} + ChangeRow::IngestSsT => {} }); assert!( *tracked_index < *index, @@ -167,8 +175,9 @@ impl ObserveRegion { "region met split/merge command, stop tracking since key range changed, wait for re-register"; "req_type" => ?req_type, ); - // Stop tracking so that `tracked_index` larger than the split/merge command index won't be published - // untill `RegionUpdate` event trigger the region re-register and re-scan the new key range + // Stop tracking so that `tracked_index` larger than the split/merge + // command index won't be published until `RegionUpdate` event + // trigger the region re-register and re-scan the new key range self.resolver.stop_tracking(); } _ => { @@ -188,7 +197,12 @@ impl ObserveRegion { .resolver .untrack_lock(&key.to_raw().unwrap(), Some(*index)), // One pc command do not contains any lock, so just skip it - ChangeRow::OnePc { .. } => {} + ChangeRow::OnePc { .. } => { + self.resolver.update_tracked_index(*index); + } + ChangeRow::IngestSsT => { + self.resolver.update_tracked_index(*index); + } }); } } @@ -252,65 +266,68 @@ impl ObserveRegion { } } -pub struct Endpoint { +pub struct Endpoint { store_id: Option, cfg: ResolvedTsConfig, - cfg_version: usize, - store_meta: Arc>, + advance_notify: Arc, + store_meta: Arc>, region_read_progress: RegionReadProgressRegistry, regions: HashMap, scanner_pool: ScannerPool, - scheduler: Scheduler>, - sinker: C, - advance_worker: AdvanceTsWorker, + scheduler: Scheduler, + advance_worker: AdvanceTsWorker, _phantom: PhantomData<(T, E)>, } -impl Endpoint +impl Endpoint where - T: 'static + RaftStoreRouter, + T: 'static + CdcHandle, E: KvEngine, - C: CmdSinker, + S: StoreRegionMeta, { pub fn new( cfg: &ResolvedTsConfig, - scheduler: Scheduler>, - raft_router: T, - store_meta: Arc>, + scheduler: Scheduler, + cdc_handle: T, + store_meta: Arc>, pd_client: Arc, concurrency_manager: ConcurrencyManager, env: Arc, security_mgr: Arc, - sinker: C, ) -> Self { let (region_read_progress, store_id) = { let meta = 
store_meta.lock().unwrap(); - (meta.region_read_progress.clone(), meta.store_id) + (meta.region_read_progress().clone(), meta.store_id()) }; let advance_worker = AdvanceTsWorker::new( - pd_client, + cfg.advance_ts_interval.0, + pd_client.clone(), scheduler.clone(), - store_meta.clone(), - region_read_progress.clone(), concurrency_manager, + ); + let scanner_pool = ScannerPool::new(cfg.scan_lock_pool_size, cdc_handle); + let store_resolver_gc_interval = Duration::from_secs(60); + let leader_resolver = LeadershipResolver::new( + store_id, + pd_client.clone(), env, security_mgr, + region_read_progress.clone(), + store_resolver_gc_interval, ); - let scanner_pool = ScannerPool::new(cfg.scan_lock_pool_size, raft_router); let ep = Self { - store_id, + store_id: Some(store_id), cfg: cfg.clone(), - cfg_version: 0, + advance_notify: Arc::new(Notify::new()), scheduler, store_meta, region_read_progress, advance_worker, scanner_pool, - sinker, regions: HashMap::default(), _phantom: PhantomData::default(), }; - ep.register_advance_event(ep.cfg_version); + ep.handle_advance_resolved_ts(leader_resolver); ep } @@ -337,6 +354,9 @@ where ResolverStatus::Pending { ref cancelled, .. 
} => cancelled.clone(), ResolverStatus::Ready => panic!("resolved ts illeagal created observe region"), }; + observe_region + .read_progress() + .update_advance_resolved_ts_notify(self.advance_notify.clone()); self.regions.insert(region_id, observe_region); let scan_task = self.build_scan_task(region, observe_handle, cancelled); @@ -421,15 +441,17 @@ where return; } // TODO: may not need to re-register region for some cases: - // - `Split/BatchSplit`, which can be handled by remove out-of-range locks from the `Resolver`'s lock heap + // - `Split/BatchSplit`, which can be handled by remove out-of-range locks from + // the `Resolver`'s lock heap // - `PrepareMerge` and `RollbackMerge`, the key range is unchanged self.deregister_region(region_id); self.register_region(incoming_region); } } - // This function is corresponding to RegionDestroyed event that can be only scheduled by observer. - // To prevent destroying region for wrong peer, it should check the region epoch at first. + // This function is corresponding to RegionDestroyed event that can be only + // scheduled by observer. To prevent destroying region for wrong peer, it + // should check the region epoch at first. fn region_destroyed(&mut self, region: Region) { if let Some(observe_region) = self.regions.get(®ion.id) { if util::compare_region_epoch( @@ -454,7 +476,7 @@ where } // Deregister current observed region and try to register it again. 
- fn re_register_region(&mut self, region_id: u64, observe_id: ObserveID, cause: String) { + fn re_register_region(&mut self, region_id: u64, observe_id: ObserveId, cause: String) { if let Some(observe_region) = self.regions.get(®ion_id) { if observe_region.handle.id != observe_id { warn!("resolved ts deregister region failed due to observe_id not match"); @@ -471,8 +493,8 @@ where let region; { let meta = self.store_meta.lock().unwrap(); - match meta.regions.get(®ion_id) { - Some(r) => region = r.clone(), + match meta.reader(region_id) { + Some(r) => region = r.region.as_ref().clone(), None => return, } } @@ -480,76 +502,55 @@ where } } - // Try to advance resolved ts. + // Update advanced resolved ts. // Must ensure all regions are leaders at the point of ts. - fn advance_resolved_ts(&mut self, regions: Vec, ts: TimeStamp) { + fn handle_resolved_ts_advanced(&mut self, regions: Vec, ts: TimeStamp) { if regions.is_empty() { return; } - - let mut min_ts = TimeStamp::max(); for region_id in regions.iter() { if let Some(observe_region) = self.regions.get_mut(region_id) { if let ResolverStatus::Ready = observe_region.resolver_status { - let resolved_ts = observe_region.resolver.resolve(ts); - if resolved_ts < min_ts { - min_ts = resolved_ts; - } + let _ = observe_region.resolver.resolve(ts); } } } - self.sinker.sink_resolved_ts(regions, ts); } - // Tracking or untracking locks with incoming commands that corresponding observe id is valid. + // Tracking or untracking locks with incoming commands that corresponding + // observe id is valid. 
#[allow(clippy::drop_ref)] - fn handle_change_log( - &mut self, - cmd_batch: Vec, - snapshot: Option>, - ) { + fn handle_change_log(&mut self, cmd_batch: Vec) { let size = cmd_batch.iter().map(|b| b.size()).sum::(); RTS_CHANNEL_PENDING_CMD_BYTES.sub(size as i64); - let logs = cmd_batch - .into_iter() - .filter_map(|batch| { - if !batch.is_empty() { - if let Some(observe_region) = self.regions.get_mut(&batch.region_id) { - let observe_id = batch.rts_id; - let region_id = observe_region.meta.id; - if observe_region.handle.id == observe_id { - let logs = ChangeLog::encode_change_log(region_id, batch); - if let Err(e) = observe_region.track_change_log(&logs) { - drop(observe_region); - self.re_register_region(region_id, observe_id, e) - } - return Some(SinkCmd { - region_id, - observe_id, - logs, - }); - } else { - debug!("resolved ts CmdBatch discarded"; - "region_id" => batch.region_id, - "observe_id" => ?batch.rts_id, - "current" => ?observe_region.handle.id, - ); - } + for batch in cmd_batch { + if batch.is_empty() { + continue; + } + if let Some(observe_region) = self.regions.get_mut(&batch.region_id) { + let observe_id = batch.rts_id; + let region_id = observe_region.meta.id; + if observe_region.handle.id == observe_id { + let logs = ChangeLog::encode_change_log(region_id, batch); + if let Err(e) = observe_region.track_change_log(&logs) { + drop(observe_region); + self.re_register_region(region_id, observe_id, e); } + } else { + debug!("resolved ts CmdBatch discarded"; + "region_id" => batch.region_id, + "observe_id" => ?batch.rts_id, + "current" => ?observe_region.handle.id, + ); } - None - }) - .collect(); - match snapshot { - Some(snap) => self.sinker.sink_cmd_with_old_value(logs, snap), - None => self.sinker.sink_cmd(logs), + } } } fn handle_scan_locks( &mut self, region_id: u64, - observe_id: ObserveID, + observe_id: ObserveId, entries: Vec, apply_index: u64, ) { @@ -565,44 +566,40 @@ where } } - fn register_advance_event(&self, cfg_version: usize) { - // 
Ignore advance event that registered with previous `advance_ts_interval` config - if self.cfg_version != cfg_version { - return; - } + fn handle_advance_resolved_ts(&self, leader_resolver: LeadershipResolver) { let regions = self.regions.keys().into_iter().copied().collect(); - self.advance_worker.advance_ts_for_regions(regions); - self.advance_worker - .register_next_event(self.cfg.advance_ts_interval.0, self.cfg_version); + self.advance_worker.advance_ts_for_regions( + regions, + leader_resolver, + self.cfg.advance_ts_interval.0, + self.advance_notify.clone(), + ); } fn handle_change_config(&mut self, change: ConfigChange) { let prev = format!("{:?}", self.cfg); - let prev_advance_ts_interval = self.cfg.advance_ts_interval; - self.cfg.update(change); - if self.cfg.advance_ts_interval != prev_advance_ts_interval { - // Increase the `cfg_version` to reject advance event that registered before - self.cfg_version += 1; - // Advance `resolved-ts` immediately after `advance_ts_interval` changed - self.register_advance_event(self.cfg_version); + if let Err(e) = self.cfg.update(change) { + warn!("resolved-ts config fails"; "error" => ?e); + } else { + self.advance_notify.notify_waiters(); + info!( + "resolved-ts config changed"; + "prev" => prev, + "current" => ?self.cfg, + ); } - info!( - "resolved-ts config changed"; - "prev" => prev, - "current" => ?self.cfg, - ); } fn get_or_init_store_id(&mut self) -> Option { self.store_id.or_else(|| { let meta = self.store_meta.lock().unwrap(); - self.store_id = meta.store_id; - meta.store_id + self.store_id = Some(meta.store_id()); + self.store_id }) } } -pub enum Task { +pub enum Task { RegionUpdated(Region), RegionDestroyed(Region), RegisterRegion { @@ -613,23 +610,22 @@ pub enum Task { }, ReRegisterRegion { region_id: u64, - observe_id: ObserveID, + observe_id: ObserveId, cause: String, }, - RegisterAdvanceEvent { - cfg_version: usize, - }, AdvanceResolvedTs { + leader_resolver: LeadershipResolver, + }, + ResolvedTsAdvanced { 
regions: Vec, ts: TimeStamp, }, ChangeLog { cmd_batch: Vec, - snapshot: Option>, }, ScanLocks { region_id: u64, - observe_id: ObserveID, + observe_id: ObserveId, entries: Vec, apply_index: u64, }, @@ -638,7 +634,7 @@ pub enum Task { }, } -impl fmt::Debug for Task { +impl fmt::Debug for Task { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut de = f.debug_struct("ResolvedTsTask"); match self { @@ -668,7 +664,7 @@ impl fmt::Debug for Task { .field("observe_id", &observe_id) .field("cause", &cause) .finish(), - Task::AdvanceResolvedTs { + Task::ResolvedTsAdvanced { ref regions, ref ts, } => de @@ -688,9 +684,7 @@ impl fmt::Debug for Task { .field("observe_id", &observe_id) .field("apply_index", &apply_index) .finish(), - Task::RegisterAdvanceEvent { .. } => { - de.field("name", &"register_advance_event").finish() - } + Task::AdvanceResolvedTs { .. } => de.field("name", &"advance_resolved_ts").finish(), Task::ChangeConfig { ref change } => de .field("name", &"change_config") .field("change", &change) @@ -699,21 +693,21 @@ impl fmt::Debug for Task { } } -impl fmt::Display for Task { +impl fmt::Display for Task { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{:?}", self) } } -impl Runnable for Endpoint +impl Runnable for Endpoint where - T: 'static + RaftStoreRouter, + T: 'static + CdcHandle, E: KvEngine, - C: CmdSinker, + S: StoreRegionMeta, { - type Task = Task; + type Task = Task; - fn run(&mut self, task: Task) { + fn run(&mut self, task: Task) { debug!("run resolved-ts task"; "task" => ?task); match task { Task::RegionDestroyed(region) => self.region_destroyed(region), @@ -725,32 +719,33 @@ where observe_id, cause, } => self.re_register_region(region_id, observe_id, cause), - Task::AdvanceResolvedTs { regions, ts } => self.advance_resolved_ts(regions, ts), - Task::ChangeLog { - cmd_batch, - snapshot, - } => self.handle_change_log(cmd_batch, snapshot), + Task::AdvanceResolvedTs { leader_resolver } => { + 
self.handle_advance_resolved_ts(leader_resolver) + } + Task::ResolvedTsAdvanced { regions, ts } => { + self.handle_resolved_ts_advanced(regions, ts) + } + Task::ChangeLog { cmd_batch } => self.handle_change_log(cmd_batch), Task::ScanLocks { region_id, observe_id, entries, apply_index, } => self.handle_scan_locks(region_id, observe_id, entries, apply_index), - Task::RegisterAdvanceEvent { cfg_version } => self.register_advance_event(cfg_version), Task::ChangeConfig { change } => self.handle_change_config(change), } } } -pub struct ResolvedTsConfigManager(Scheduler>); +pub struct ResolvedTsConfigManager(Scheduler); -impl ResolvedTsConfigManager { - pub fn new(scheduler: Scheduler>) -> ResolvedTsConfigManager { +impl ResolvedTsConfigManager { + pub fn new(scheduler: Scheduler) -> ResolvedTsConfigManager { ResolvedTsConfigManager(scheduler) } } -impl ConfigManager for ResolvedTsConfigManager { +impl ConfigManager for ResolvedTsConfigManager { fn dispatch(&mut self, change: ConfigChange) -> online_config::Result<()> { if let Err(e) = self.0.schedule(Task::ChangeConfig { change }) { error!("failed to schedule ChangeConfig task"; "err" => ?e); @@ -761,11 +756,11 @@ impl ConfigManager for ResolvedTsConfigManager { const METRICS_FLUSH_INTERVAL: u64 = 10_000; // 10s -impl RunnableWithTimer for Endpoint +impl RunnableWithTimer for Endpoint where - T: 'static + RaftStoreRouter, + T: 'static + CdcHandle, E: KvEngine, - C: CmdSinker, + S: StoreRegionMeta, { fn on_timeout(&mut self) { let store_id = self.get_or_init_store_id(); @@ -773,8 +768,7 @@ where let (mut oldest_leader_ts, mut oldest_leader_region) = (u64::MAX, 0); self.region_read_progress.with(|registry| { for (region_id, read_progress) in registry { - let (peers, leader_info) = read_progress.dump_leader_info(); - let leader_store_id = crate::util::find_store_id(&peers, leader_info.peer_id); + let (leader_info, leader_store_id) = read_progress.dump_leader_info(); let ts = leader_info.get_read_state().get_safe_ts(); if ts 
== 0 { zero_ts_count += 1; diff --git a/components/resolved_ts/src/lib.rs b/components/resolved_ts/src/lib.rs index 172efbb9c18..eef1211a580 100644 --- a/components/resolved_ts/src/lib.rs +++ b/components/resolved_ts/src/lib.rs @@ -1,13 +1,16 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -//! Resolved TS is a timestamp that represents the lower bonud of incoming Commit TS +//! Resolved TS is a timestamp that represents the lower bound of incoming +//! Commit TS // and the upper bound of outgoing Commit TS. -//! Through this timestamp we can get a consistent view in the transaction level. +//! Through this timestamp we can get a consistent view in the transaction +//! level. //! //! To maintain a correct Resolved TS, these premises must be satisfied: -//! 1. Tracing all locks in the region, use the minimal Start TS as Resolved TS. -//! 2. If there is not any lock, use the latest timestamp as Resolved TS. -//! 3. Resolved TS must be advanced by the region leader after it has applied on its term. +//! - Tracing all locks in the region, use the minimal Start TS as Resolved TS. +//! - If there is not any lock, use the latest timestamp as Resolved TS. +//! - Resolved TS must be advanced by the region leader after it has applied on +//! its term. 
#![feature(box_patterns)] #![feature(result_flattening)] @@ -24,8 +27,6 @@ mod observer; pub use observer::*; mod advance; pub use advance::*; -mod sinker; -pub use sinker::*; mod endpoint; pub use endpoint::*; mod errors; @@ -34,4 +35,3 @@ mod scanner; pub use scanner::*; mod metrics; pub use metrics::*; -mod util; diff --git a/components/resolved_ts/src/observer.rs b/components/resolved_ts/src/observer.rs index 483649c36e7..7421beaad85 100644 --- a/components/resolved_ts/src/observer.rs +++ b/components/resolved_ts/src/observer.rs @@ -8,18 +8,19 @@ use tikv_util::worker::Scheduler; use crate::{cmd::lock_only_filter, endpoint::Task, metrics::RTS_CHANNEL_PENDING_CMD_BYTES}; -pub struct Observer { - scheduler: Scheduler>, +pub struct Observer { + scheduler: Scheduler, } -impl Observer { - pub fn new(scheduler: Scheduler>) -> Self { +impl Observer { + pub fn new(scheduler: Scheduler) -> Self { Observer { scheduler } } - pub fn register_to(&self, coprocessor_host: &mut CoprocessorHost) { - // The `resolved-ts` cmd observer will `mem::take` the `Vec`, use a low priority - // to let it be the last observer and avoid affecting other observers + pub fn register_to(&self, coprocessor_host: &mut CoprocessorHost) { + // The `resolved-ts` cmd observer will `mem::take` the `Vec`, use a + // low priority to let it be the last observer and avoid affecting other + // observers coprocessor_host .registry .register_cmd_observer(1000, BoxCmdObserver::new(self.clone())); @@ -32,7 +33,7 @@ impl Observer { } } -impl Clone for Observer { +impl Clone for Observer { fn clone(&self) -> Self { Self { scheduler: self.scheduler.clone(), @@ -40,9 +41,9 @@ impl Clone for Observer { } } -impl Coprocessor for Observer {} +impl Coprocessor for Observer {} -impl CmdObserver for Observer { +impl CmdObserver for Observer { fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, @@ -63,7 +64,6 @@ impl CmdObserver for Observer { RTS_CHANNEL_PENDING_CMD_BYTES.add(size as i64); if let Err(e) = 
self.scheduler.schedule(Task::ChangeLog { cmd_batch: cmd_batches, - snapshot: None, }) { info!("failed to schedule change log event"; "err" => ?e); } @@ -81,10 +81,11 @@ impl CmdObserver for Observer { } } -impl RoleObserver for Observer { +impl RoleObserver for Observer { fn on_role_change(&self, ctx: &mut ObserverContext<'_>, role_change: &RoleChange) { // Stop to advance resolved ts after peer steps down to follower or candidate. - // Do not need to check observe id because we expect all role change events are scheduled in order. + // Do not need to check observe id because we expect all role change events are + // scheduled in order. if role_change.state != StateRole::Leader { if let Err(e) = self.scheduler.schedule(Task::DeRegisterRegion { region_id: ctx.region().id, @@ -95,16 +96,16 @@ impl RoleObserver for Observer { } } -impl RegionChangeObserver for Observer { +impl RegionChangeObserver for Observer { fn on_region_changed( &self, ctx: &mut ObserverContext<'_>, event: RegionChangeEvent, role: StateRole, ) { - // If the peer is not leader, it must has not registered the observe region or it is deregistering - // the observe region, so don't need to send `RegionUpdated`/`RegionDestroyed` to update the observe - // region + // If the peer is not leader, it must has not registered the observe region or + // it is deregistering the observe region, so don't need to send + // `RegionUpdated`/`RegionDestroyed` to update the observe region if role != StateRole::Leader { return; } @@ -137,7 +138,6 @@ impl RegionChangeObserver for Observer { mod test { use std::time::Duration; - use engine_rocks::RocksSnapshot; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; use kvproto::raft_cmdpb::*; use tikv::storage::kv::TestEngineBuilder; @@ -154,7 +154,7 @@ mod test { cmd } - fn expect_recv(rx: &mut ReceiverWrapper>, data: Vec) { + fn expect_recv(rx: &mut ReceiverWrapper, data: Vec) { if data.is_empty() { match rx.recv_timeout(Duration::from_millis(10)) { 
Err(std::sync::mpsc::RecvTimeoutError::Timeout) => return, @@ -185,7 +185,7 @@ mod test { put_cf(CF_WRITE, b"k7", b"v"), put_cf(CF_WRITE, b"k8", b"v"), ]; - let mut cmd = Cmd::new(0, RaftCmdRequest::default(), RaftCmdResponse::default()); + let mut cmd = Cmd::new(0, 0, RaftCmdRequest::default(), RaftCmdResponse::default()); cmd.request.mut_requests().clear(); for put in &data { cmd.request.mut_requests().push(put.clone()); diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs index 1669a0e8b65..b341c546940 100644 --- a/components/resolved_ts/src/resolver.rs +++ b/components/resolved_ts/src/resolver.rs @@ -21,7 +21,7 @@ pub struct Resolver { // The highest index `Resolver` had been tracked tracked_index: u64, // The region read progress used to utilize `resolved_ts` to serve stale read request - read_progress: Option>, + pub(crate) read_progress: Option>, // The timestamps that advance the resolved_ts when there is no more write. min_ts: TimeStamp, // Whether the `Resolver` is stopped @@ -150,7 +150,8 @@ impl Resolver { /// `min_ts` advances the resolver even if there is no write. /// Return None means the resolver is not initialized. pub fn resolve(&mut self, min_ts: TimeStamp) -> TimeStamp { - // The `Resolver` is stopped, not need to advance, just return the current `resolved_ts` + // The `Resolver` is stopped, not need to advance, just return the current + // `resolved_ts` if self.stopped { return self.resolved_ts; } @@ -161,7 +162,6 @@ impl Resolver { // No more commit happens before the ts. 
let new_resolved_ts = cmp::min(min_start_ts, min_ts); - if self.resolved_ts >= new_resolved_ts { let label = if has_lock { "has_lock" } else { "stale_ts" }; RTS_RESOLVED_FAIL_ADVANCE_VEC diff --git a/components/resolved_ts/src/scanner.rs b/components/resolved_ts/src/scanner.rs index c52bf3bf166..a8c4e5bb44f 100644 --- a/components/resolved_ts/src/scanner.rs +++ b/components/resolved_ts/src/scanner.rs @@ -6,20 +6,16 @@ use engine_traits::KvEngine; use futures::compat::Future01CompatExt; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb::Region}; use raftstore::{ - coprocessor::{ObserveHandle, ObserveID}, - router::RaftStoreRouter, - store::{ - fsm::ChangeObserver, - msg::{Callback, SignificantMsg}, - RegionSnapshot, - }, + coprocessor::{ObserveHandle, ObserveId}, + router::CdcHandle, + store::{fsm::ChangeObserver, msg::Callback, RegionSnapshot}, }; use tikv::storage::{ kv::{ScanMode as MvccScanMode, Snapshot}, mvcc::{DeltaScanner, MvccReader, ScannerBuilder}, txn::{TxnEntry, TxnEntryScanner}, }; -use tikv_util::{time::Instant, timer::GLOBAL_TIMER_HANDLE}; +use tikv_util::{sys::thread::ThreadBuildWrapper, time::Instant, timer::GLOBAL_TIMER_HANDLE}; use tokio::runtime::{Builder, Runtime}; use txn_types::{Key, Lock, LockType, TimeStamp}; @@ -33,7 +29,7 @@ const GET_SNAPSHOT_RETRY_TIME: u32 = 3; const GET_SNAPSHOT_RETRY_BACKOFF_STEP: Duration = Duration::from_millis(25); pub type BeforeStartCallback = Box; -pub type OnErrorCallback = Box; +pub type OnErrorCallback = Box; pub type OnEntriesCallback = Box, u64) + Send>; pub type IsCancelledCallback = Box bool + Send>; @@ -64,30 +60,32 @@ pub enum ScanEntry { #[derive(Clone)] pub struct ScannerPool { workers: Arc, - raft_router: T, + cdc_handle: T, _phantom: PhantomData, } -impl, E: KvEngine> ScannerPool { - pub fn new(count: usize, raft_router: T) -> Self { +impl, E: KvEngine> ScannerPool { + pub fn new(count: usize, cdc_handle: T) -> Self { let workers = Arc::new( Builder::new_multi_thread() .thread_name("inc-scan") 
.worker_threads(count) + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) .build() .unwrap(), ); Self { workers, - raft_router, + cdc_handle, _phantom: PhantomData::default(), } } pub fn spawn_task(&self, mut task: ScanTask) { - let raft_router = self.raft_router.clone(); + let cdc_handle = self.cdc_handle.clone(); let fut = async move { - let snap = match Self::get_snapshot(&mut task, raft_router).await { + let snap = match Self::get_snapshot(&mut task, cdc_handle).await { Ok(snap) => snap, Err(e) => { warn!("resolved_ts scan get snapshot failed"; "err" => ?e); @@ -179,7 +177,7 @@ impl, E: KvEngine> ScannerPool { async fn get_snapshot( task: &mut ScanTask, - raft_router: T, + cdc_handle: T, ) -> Result> { let mut last_err = None; for retry_times in 0..=GET_SNAPSHOT_RETRY_TIME { @@ -199,18 +197,17 @@ impl, E: KvEngine> ScannerPool { } let (cb, fut) = tikv_util::future::paired_future_callback(); let change_cmd = ChangeObserver::from_rts(task.region.id, task.handle.clone()); - raft_router.significant_send( + cdc_handle.capture_change( task.region.id, - SignificantMsg::CaptureChange { - cmd: change_cmd, - region_epoch: task.region.get_region_epoch().clone(), - callback: Callback::Read(Box::new(cb)), - }, + task.region.get_region_epoch().clone(), + change_cmd, + Callback::read(Box::new(cb)), )?; let mut resp = box_try!(fut.await); if resp.response.get_header().has_error() { let err = resp.response.take_header().take_error(); - // These two errors can't handled by retrying since the epoch and observe id is unchanged + // These two errors can't handled by retrying since the epoch and observe id is + // unchanged if err.has_epoch_not_match() || err.get_message().contains("stale observe id") { return Err(Error::request(err)); } diff --git a/components/resolved_ts/src/sinker.rs b/components/resolved_ts/src/sinker.rs deleted file mode 100644 index 29eebce02ed..00000000000 --- a/components/resolved_ts/src/sinker.rs +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2021 
TiKV Project Authors. Licensed under Apache-2.0. - -use std::marker::PhantomData; - -use engine_traits::Snapshot; -use raftstore::{coprocessor::ObserveID, store::RegionSnapshot}; -use txn_types::TimeStamp; - -use crate::cmd::ChangeLog; - -pub struct SinkCmd { - pub region_id: u64, - pub observe_id: ObserveID, - pub logs: Vec, -} - -pub trait CmdSinker: Send { - fn sink_cmd(&mut self, sink_cmd: Vec); - - fn sink_cmd_with_old_value(&mut self, sink_cmd: Vec, snapshot: RegionSnapshot); - - fn sink_resolved_ts(&mut self, regions: Vec, ts: TimeStamp); -} - -pub struct DummySinker(PhantomData); - -impl DummySinker { - pub fn new() -> Self { - Self(PhantomData::default()) - } -} - -impl Default for DummySinker { - fn default() -> Self { - Self::new() - } -} - -impl CmdSinker for DummySinker { - fn sink_cmd(&mut self, _sink_cmd: Vec) {} - - fn sink_cmd_with_old_value(&mut self, _sink_cmd: Vec, _snapshot: RegionSnapshot) {} - - fn sink_resolved_ts(&mut self, _regions: Vec, _ts: TimeStamp) {} -} diff --git a/components/resolved_ts/src/util.rs b/components/resolved_ts/src/util.rs deleted file mode 100644 index 11bc1c547a0..00000000000 --- a/components/resolved_ts/src/util.rs +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
- -use kvproto::metapb::Peer; - -pub fn find_store_id(peer_list: &[Peer], peer_id: u64) -> Option { - for peer in peer_list { - if peer.id == peer_id { - return Some(peer.store_id); - } - } - None -} diff --git a/components/resolved_ts/tests/failpoints/mod.rs b/components/resolved_ts/tests/failpoints/mod.rs index e734864471a..808f5ed62ff 100644 --- a/components/resolved_ts/tests/failpoints/mod.rs +++ b/components/resolved_ts/tests/failpoints/mod.rs @@ -7,6 +7,7 @@ use kvproto::kvrpcpb::*; use pd_client::PdClient; use test_raftstore::{new_peer, sleep_ms}; pub use testsuite::*; +use tikv_util::config::ReadableDuration; use txn_types::TimeStamp; #[test] @@ -21,7 +22,7 @@ fn test_check_leader_timeout() { mutation.set_op(Op::Put); mutation.key = k.to_vec(); mutation.value = v.to_vec(); - suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); suite .cluster .must_transfer_leader(region.id, new_peer(1, 1)); @@ -57,6 +58,16 @@ fn test_report_min_resolved_ts() { fail::cfg("mock_collect_tick_interval", "return(0)").unwrap(); fail::cfg("mock_min_resolved_ts_interval", "return(0)").unwrap(); let mut suite = TestSuite::new(1); + // default config is 1s + assert_eq!( + suite + .cluster + .cfg + .tikv + .raft_store + .report_min_resolved_ts_interval, + ReadableDuration::secs(1) + ); let region = suite.cluster.get_region(&[]); let ts1 = suite.cluster.pd_client.get_min_resolved_ts(); @@ -67,7 +78,7 @@ fn test_report_min_resolved_ts() { mutation.set_op(Op::Put); mutation.key = k.to_vec(); mutation.value = v.to_vec(); - suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); // Commit let commit_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); @@ -89,6 +100,7 @@ fn test_report_min_resolved_ts() { fn test_report_min_resolved_ts_disable() { fail::cfg("mock_tick_interval", 
"return(0)").unwrap(); fail::cfg("mock_collect_tick_interval", "return(0)").unwrap(); + fail::cfg("mock_min_resolved_ts_interval_disable", "return(0)").unwrap(); let mut suite = TestSuite::new(1); let region = suite.cluster.get_region(&[]); let ts1 = suite.cluster.pd_client.get_min_resolved_ts(); @@ -100,7 +112,7 @@ fn test_report_min_resolved_ts_disable() { mutation.set_op(Op::Put); mutation.key = k.to_vec(); mutation.value = v.to_vec(); - suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); // Commit let commit_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); @@ -113,5 +125,6 @@ fn test_report_min_resolved_ts_disable() { assert!(ts3 == ts1); fail::remove("mock_tick_interval"); fail::remove("mock_collect_tick_interval"); + fail::remove("mock_min_resolved_ts_interval_disable"); suite.stop(); } diff --git a/components/resolved_ts/tests/integrations/mod.rs b/components/resolved_ts/tests/integrations/mod.rs index 7916d03d8d2..7802108b92b 100644 --- a/components/resolved_ts/tests/integrations/mod.rs +++ b/components/resolved_ts/tests/integrations/mod.rs @@ -5,9 +5,11 @@ mod testsuite; use std::time::Duration; use futures::executor::block_on; -use kvproto::kvrpcpb::*; +use kvproto::{kvrpcpb::*, metapb::RegionEpoch}; use pd_client::PdClient; +use tempfile::Builder; use test_raftstore::sleep_ms; +use test_sst_importer::*; pub use testsuite::*; #[test] @@ -17,12 +19,12 @@ fn test_resolved_ts_basic() { // Prewrite let (k, v) = (b"k1", b"v"); - let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); let mut mutation = Mutation::default(); mutation.set_op(Op::Put); mutation.key = k.to_vec(); mutation.value = v.to_vec(); - suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); // 
The `resolved-ts` won't be updated due to there is lock on the region, // the `resolved-ts` may not be the `start_ts` of the lock if the `resolved-ts` @@ -52,6 +54,54 @@ fn test_resolved_ts_basic() { let current_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); suite.must_get_rts_ge(r1.id, current_ts); + // ingest sst + let temp_dir = Builder::new().prefix("test_resolved_ts").tempdir().unwrap(); + let sst_path = temp_dir.path().join("test.sst"); + let sst_range = (0, 100); + + let mut sst_epoch = RegionEpoch::default(); + sst_epoch.set_conf_ver(1); + sst_epoch.set_version(4); + + let (mut meta, data) = gen_sst_file(sst_path, sst_range); + meta.set_region_id(r1.id); + meta.set_region_epoch(sst_epoch); + + suite.upload_sst(r1.id, &meta, &data).unwrap(); + + let tracked_index_before = suite.region_tracked_index(r1.id); + suite.must_ingest_sst(r1.id, meta); + let mut tracked_index_after = suite.region_tracked_index(r1.id); + for _ in 0..10 { + if tracked_index_after > tracked_index_before { + break; + } + tracked_index_after = suite.region_tracked_index(r1.id); + sleep_ms(200) + } + assert!(tracked_index_after > tracked_index_before); + + // 1PC + let tracked_index_before = suite.region_tracked_index(r1.id); + + start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let (k, v) = (b"k2", b"v"); + let mut mutation_1pc = Mutation::default(); + mutation_1pc.set_op(Op::Put); + mutation_1pc.key = k.to_vec(); + mutation_1pc.value = v.to_vec(); + suite.must_kv_prewrite(r1.id, vec![mutation_1pc], k.to_vec(), start_ts, true); + + tracked_index_after = suite.region_tracked_index(r1.id); + for _ in 0..10 { + if tracked_index_after > tracked_index_before { + break; + } + tracked_index_after = suite.region_tracked_index(r1.id); + sleep_ms(200) + } + assert!(tracked_index_after > tracked_index_before); + suite.stop(); } diff --git a/components/resolved_ts/tests/mod.rs b/components/resolved_ts/tests/mod.rs index 3d7fdb87569..36705f9c015 100644 --- 
a/components/resolved_ts/tests/mod.rs +++ b/components/resolved_ts/tests/mod.rs @@ -4,11 +4,16 @@ use std::{sync::*, time::Duration}; use collections::HashMap; use concurrency_manager::ConcurrencyManager; -use engine_rocks::{RocksEngine, RocksSnapshot}; -use grpcio::{ChannelBuilder, ClientUnaryReceiver, Environment}; -use kvproto::{kvrpcpb::*, tikvpb::TikvClient}; +use futures::{executor::block_on, stream, SinkExt}; +use grpcio::{ChannelBuilder, ClientUnaryReceiver, Environment, Result, WriteFlags}; +use kvproto::{ + import_sstpb::{IngestRequest, SstMeta, UploadRequest, UploadResponse}, + import_sstpb_grpc::ImportSstClient, + kvrpcpb::{PrewriteRequestPessimisticAction::*, *}, + tikvpb::TikvClient, +}; use online_config::ConfigValue; -use raftstore::coprocessor::CoprocessorHost; +use raftstore::{coprocessor::CoprocessorHost, router::CdcRaftRouter}; use resolved_ts::{Observer, Task}; use test_raftstore::*; use tikv::config::ResolvedTsConfig; @@ -22,9 +27,10 @@ pub fn init() { pub struct TestSuite { pub cluster: Cluster, - pub endpoints: HashMap>>, - pub obs: HashMap>, + pub endpoints: HashMap>, + pub obs: HashMap, tikv_cli: HashMap, + import_cli: HashMap, concurrency_managers: HashMap, env: Arc, @@ -34,7 +40,7 @@ impl TestSuite { pub fn new(count: usize) -> Self { let mut cluster = new_server_cluster(1, count); // Increase the Raft tick interval to make this test case running reliably. 
- configure_for_lease_read(&mut cluster, Some(100), None); + configure_for_lease_read(&mut cluster.cfg, Some(100), None); Self::with_cluster(count, cluster) } @@ -75,13 +81,12 @@ impl TestSuite { let rts_endpoint = resolved_ts::Endpoint::new( &cfg, worker.scheduler(), - raft_router, + CdcRaftRouter(raft_router), cluster.store_metas[id].clone(), pd_cli.clone(), cm.clone(), env, sim.security_mgr.clone(), - resolved_ts::DummySinker::new(), ); concurrency_managers.insert(*id, cm); worker.start(rts_endpoint); @@ -94,6 +99,7 @@ impl TestSuite { concurrency_managers, env: Arc::new(Environment::new(1)), tikv_cli: HashMap::default(), + import_cli: HashMap::default(), } } @@ -123,6 +129,7 @@ impl TestSuite { muts: Vec, pk: Vec, ts: TimeStamp, + try_one_pc: bool, ) { let mut prewrite_req = PrewriteRequest::default(); prewrite_req.set_context(self.get_context(region_id)); @@ -130,6 +137,7 @@ impl TestSuite { prewrite_req.primary_lock = pk; prewrite_req.start_version = ts.into_inner(); prewrite_req.lock_ttl = prewrite_req.start_version + 1; + prewrite_req.try_one_pc = try_one_pc; let prewrite_resp = self .get_tikv_client(region_id) .kv_prewrite(&prewrite_req) @@ -144,6 +152,9 @@ impl TestSuite { "{:?}", prewrite_resp.get_errors() ); + if try_one_pc { + assert_ne!(prewrite_resp.get_one_pc_commit_ts(), 0); + } } pub fn must_kv_commit( @@ -261,7 +272,9 @@ impl TestSuite { prewrite_req.start_version = ts.into_inner(); prewrite_req.lock_ttl = prewrite_req.start_version + 1; prewrite_req.for_update_ts = for_update_ts.into_inner(); - prewrite_req.mut_is_pessimistic_lock().push(true); + prewrite_req + .mut_pessimistic_actions() + .push(DoPessimisticCheck); let prewrite_resp = self .get_tikv_client(region_id) .kv_prewrite(&prewrite_req) @@ -318,6 +331,19 @@ impl TestSuite { }) } + pub fn get_import_client(&mut self, region_id: u64) -> &ImportSstClient { + let leader = self.cluster.leader_of_region(region_id).unwrap(); + let store_id = leader.get_store_id(); + let addr = 
self.cluster.sim.rl().get_addr(store_id); + let env = self.env.clone(); + self.import_cli + .entry(leader.get_store_id()) + .or_insert_with(|| { + let channel = ChannelBuilder::new(env).connect(&addr); + ImportSstClient::new(channel) + }) + } + pub fn get_txn_concurrency_manager(&self, store_id: u64) -> Option { self.concurrency_managers.get(&store_id).cloned() } @@ -331,12 +357,26 @@ impl TestSuite { let meta = self.cluster.store_metas[&leader.store_id].lock().unwrap(); Some( meta.region_read_progress - .get_safe_ts(®ion_id) + .get_resolved_ts(®ion_id) .unwrap() .into(), ) } + pub fn region_tracked_index(&mut self, region_id: u64) -> u64 { + for _ in 0..50 { + if let Some(leader) = self.cluster.leader_of_region(region_id) { + let meta = self.cluster.store_metas[&leader.store_id].lock().unwrap(); + if let Some(tracked_index) = meta.region_read_progress.get_tracked_index(®ion_id) + { + return tracked_index; + } + } + sleep_ms(100) + } + panic!("fail to get region tracked index after 50 trys"); + } + pub fn must_get_rts(&mut self, region_id: u64, rts: TimeStamp) { for _ in 0..50 { if let Some(ts) = self.region_resolved_ts(region_id) { @@ -360,4 +400,45 @@ impl TestSuite { } panic!("fail to get greater ts after 50 trys"); } + + pub fn upload_sst( + &mut self, + region_id: u64, + meta: &SstMeta, + data: &[u8], + ) -> Result { + let import = self.get_import_client(region_id); + let mut r1 = UploadRequest::default(); + r1.set_meta(meta.clone()); + let mut r2 = UploadRequest::default(); + r2.set_data(data.to_vec()); + let reqs: Vec<_> = vec![r1, r2] + .into_iter() + .map(|r| Result::Ok((r, WriteFlags::default()))) + .collect(); + let (mut tx, rx) = import.upload().unwrap(); + let mut stream = stream::iter(reqs); + block_on(async move { + tx.send_all(&mut stream).await?; + tx.close().await?; + rx.await + }) + } + + pub fn must_ingest_sst(&mut self, region_id: u64, meta: SstMeta) { + let mut ingest_request = IngestRequest::default(); + 
ingest_request.set_context(self.get_context(region_id)); + ingest_request.set_sst(meta); + + let ingest_sst_resp = self + .get_import_client(region_id) + .ingest(&ingest_request) + .unwrap(); + + assert!( + !ingest_sst_resp.has_error(), + "{:?}", + ingest_sst_resp.get_error() + ); + } } diff --git a/components/resource_control/Cargo.toml b/components/resource_control/Cargo.toml new file mode 100644 index 00000000000..ec13d9cdbdb --- /dev/null +++ b/components/resource_control/Cargo.toml @@ -0,0 +1,35 @@ +[package] +name = "resource_control" +version = "0.0.1" +edition = "2021" +publish = false + +[features] +failpoints = ["fail/failpoints"] + +[dependencies] +byteorder = "1.2" +collections = { workspace = true } +crossbeam = "0.8" +crossbeam-skiplist = "0.1" +dashmap = "5.1" +fail = "0.5" +futures = { version = "0.3" } +kvproto = { workspace = true } +lazy_static = "1.0" +online_config = { workspace = true } +parking_lot = "0.12" +pd_client = { workspace = true } +pin-project = "1.0" +prometheus = { version = "0.13", features = ["nightly"] } +protobuf = { version = "2.8", features = ["bytes"] } +serde = { version = "1.0", features = ["derive"] } +slog = { workspace = true } +slog-global = { workspace = true } +test_pd = { workspace = true } +test_pd_client = { workspace = true } +tikv_util = { workspace = true } +yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } + +[dev-dependencies] +rand = "0.8" diff --git a/components/resource_control/src/channel.rs b/components/resource_control/src/channel.rs new file mode 100644 index 00000000000..ccad4aba4bb --- /dev/null +++ b/components/resource_control/src/channel.rs @@ -0,0 +1,224 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+use std::{cell::RefCell, sync::Arc}; + +use crossbeam::channel::{self, RecvError, SendError, TryRecvError, TrySendError}; +use kvproto::kvrpcpb::CommandPri; +use tikv_util::mpsc::priority_queue; + +use crate::ResourceController; + +pub trait ResourceMetered { + // returns the msg consumption of each hash map + fn consume_resource(&self, _: &Arc) -> Option { + None + } +} + +pub fn bounded( + resource_ctl: Option>, + cap: usize, +) -> (Sender, Receiver) { + if let Some(ctl) = resource_ctl { + // TODO: make it bounded + let (tx, rx) = priority_queue::unbounded(); + ( + Sender::Priority { + resource_ctl: ctl, + sender: tx, + last_msg_group: RefCell::new(String::new()), + }, + Receiver::Priority(rx), + ) + } else { + let (tx, rx) = channel::bounded(cap); + (Sender::Vanilla(tx), Receiver::Vanilla(rx)) + } +} + +pub fn unbounded( + resource_ctl: Option>, +) -> (Sender, Receiver) { + if let Some(ctl) = resource_ctl { + let (tx, rx) = priority_queue::unbounded(); + ( + Sender::Priority { + resource_ctl: ctl, + sender: tx, + last_msg_group: RefCell::new(String::new()), + }, + Receiver::Priority(rx), + ) + } else { + let (tx, rx) = channel::unbounded(); + (Sender::Vanilla(tx), Receiver::Vanilla(rx)) + } +} + +pub enum Sender { + Vanilla(channel::Sender), + Priority { + resource_ctl: Arc, + sender: priority_queue::Sender, + last_msg_group: RefCell, + }, +} + +impl Clone for Sender { + fn clone(&self) -> Self { + match self { + Sender::Vanilla(sender) => Sender::Vanilla(sender.clone()), + Sender::Priority { + resource_ctl, + sender, + .. + } => Sender::Priority { + resource_ctl: resource_ctl.clone(), + sender: sender.clone(), + last_msg_group: RefCell::new(String::new()), + }, + } + } +} + +impl Sender { + // `low_bound` represents the lowest priority that the message can be sent with. + // It's used to make sure messages from one peer are sent in order. + // The returned value is the priority that the message sent with. 
It is + // calculated by resource controller and compared with `low_bound`. + pub fn send(&self, m: T, low_bound: Option) -> Result, SendError> { + match self { + Sender::Vanilla(sender) => sender.send(m).map(|_| None), + Sender::Priority { + resource_ctl, + sender, + last_msg_group, + } => { + let p = resource_ctl + .get_priority(last_msg_group.borrow().as_bytes(), CommandPri::Normal); + let priority = if let Some(low_bound) = low_bound { + std::cmp::max(p, low_bound) + } else { + p + }; + sender.send(m, priority).map(|_| Some(priority)) + } + } + } + + pub fn try_send(&self, m: T, low_bound: Option) -> Result, TrySendError> { + match self { + Sender::Vanilla(sender) => sender.try_send(m).map(|_| None), + Sender::Priority { + resource_ctl, + sender, + last_msg_group, + } => { + let p = resource_ctl + .get_priority(last_msg_group.borrow().as_bytes(), CommandPri::Normal); + let priority = std::cmp::max(p, low_bound.unwrap_or(0)); + sender.try_send(m, priority).map(|_| Some(priority)) + } + } + } + + pub fn consume_msg_resource(&self, msg: &impl ResourceMetered) { + match self { + Sender::Vanilla(_) => {} + Sender::Priority { + resource_ctl, + last_msg_group, + .. 
+ } => { + if let Some(dominant_group) = msg.consume_resource(resource_ctl) { + *last_msg_group.borrow_mut() = dominant_group; + } + } + } + } +} + +pub enum Receiver { + Vanilla(channel::Receiver), + Priority(priority_queue::Receiver), +} + +impl Clone for Receiver { + fn clone(&self) -> Self { + match self { + Receiver::Vanilla(receiver) => Receiver::Vanilla(receiver.clone()), + Receiver::Priority(receiver) => Receiver::Priority(receiver.clone()), + } + } +} + +impl Receiver { + pub fn recv(&self) -> Result { + match self { + Receiver::Vanilla(receiver) => receiver.recv(), + Receiver::Priority(receiver) => receiver.recv(), + } + } + + pub fn try_recv(&self) -> Result { + match self { + Receiver::Vanilla(receiver) => receiver.try_recv(), + Receiver::Priority(receiver) => receiver.try_recv(), + } + } +} + +#[cfg(test)] +mod tests { + use std::{thread, usize}; + + use test::Bencher; + + use super::*; + use crate::ResourceConsumeType; + + struct Msg(usize); + + impl ResourceMetered for Msg { + fn consume_resource(&self, resource_ctl: &Arc) -> Option { + // None + let write_bytes = self.0 as u64; + let group_name = "test".to_owned(); + resource_ctl.consume( + group_name.as_bytes(), + ResourceConsumeType::IoBytes(write_bytes), + ); + Some(group_name) + } + } + + #[bench] + fn bench_channel(b: &mut Bencher) { + let (tx, rx) = unbounded(Some(Arc::new(ResourceController::new( + "test".to_owned(), + false, + )))); + + let t = thread::spawn(move || { + let mut n2: usize = 0; + loop { + if let Ok(Msg(n)) = rx.recv() { + n2 += n; + } else { + return n2; + } + } + }); + + let mut n1 = 0; + b.iter(|| { + n1 += 1; + let msg = Msg(1); + tx.consume_msg_resource(&msg); + tx.send(msg, None).unwrap(); + }); + + drop(tx); + let n2 = t.join().unwrap(); + assert_eq!(n1, n2); + } +} diff --git a/components/resource_control/src/future.rs b/components/resource_control/src/future.rs new file mode 100644 index 00000000000..8027a27b394 --- /dev/null +++ 
b/components/resource_control/src/future.rs @@ -0,0 +1,46 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + future::Future, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use pin_project::pin_project; +use tikv_util::time::Instant; + +use crate::resource_group::{ResourceConsumeType, ResourceController}; + +#[pin_project] +pub struct ControlledFuture { + #[pin] + future: F, + controller: Arc, + group_name: Vec, +} + +impl ControlledFuture { + pub fn new(future: F, controller: Arc, group_name: Vec) -> Self { + Self { + future, + controller, + group_name, + } + } +} + +impl Future for ControlledFuture { + type Output = F::Output; + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + let now = Instant::now(); + let res = this.future.poll(cx); + this.controller.consume( + this.group_name, + ResourceConsumeType::CpuTime(now.saturating_elapsed()), + ); + res + } +} diff --git a/components/resource_control/src/lib.rs b/components/resource_control/src/lib.rs new file mode 100644 index 00000000000..b186cb8a0c7 --- /dev/null +++ b/components/resource_control/src/lib.rs @@ -0,0 +1,36 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+#![feature(test)] + +use online_config::OnlineConfig; +use serde::{Deserialize, Serialize}; + +mod resource_group; +pub use resource_group::{ + ResourceConsumeType, ResourceController, ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL, +}; + +mod future; +pub use future::ControlledFuture; + +#[cfg(test)] +extern crate test; + +mod service; +pub use service::ResourceManagerService; + +pub mod channel; +pub use channel::ResourceMetered; + +#[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)] +#[serde(default)] +#[serde(rename_all = "kebab-case")] +pub struct Config { + #[online_config(skip)] + pub enabled: bool, +} + +impl Default for Config { + fn default() -> Self { + Self { enabled: true } + } +} diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs new file mode 100644 index 00000000000..0a808811217 --- /dev/null +++ b/components/resource_control/src/resource_group.rs @@ -0,0 +1,840 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + cell::Cell, + cmp::{max, min}, + sync::{ + atomic::{AtomicBool, AtomicU64, Ordering}, + Arc, Mutex, + }, + time::Duration, +}; + +use collections::HashMap; +use dashmap::{mapref::one::Ref, DashMap}; +use fail::fail_point; +use kvproto::{ + kvrpcpb::{CommandPri, ResourceControlContext}, + resource_manager::{GroupMode, ResourceGroup}, +}; +use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard}; +use tikv_util::{info, time::Instant}; +use yatp::queue::priority::TaskPriorityProvider; + +// a read task cost at least 50us. +const DEFAULT_PRIORITY_PER_READ_TASK: u64 = 50; +// extra task schedule factor +const TASK_EXTRA_FACTOR_BY_LEVEL: [u64; 3] = [0, 20, 100]; +/// duration to update the minimal priority value of each resource group. 
+pub const MIN_PRIORITY_UPDATE_INTERVAL: Duration = Duration::from_secs(1); +/// default resource group name +const DEFAULT_RESOURCE_GROUP_NAME: &str = "default"; +/// default value of max RU quota. +const DEFAULT_MAX_RU_QUOTA: u64 = 10_000; +/// The maximum RU quota that can be configured. +const MAX_RU_QUOTA: u64 = i32::MAX as u64; + +#[cfg(test)] +const LOW_PRIORITY: u32 = 1; +const MEDIUM_PRIORITY: u32 = 8; +#[cfg(test)] +const HIGH_PRIORITY: u32 = 16; + +// the global maxinum of virtual time is u64::MAX / 16, so when the virtual +// time of all groups are bigger than half of this value, we rest them to avoid +// virtual time overflow. +const RESET_VT_THRESHOLD: u64 = (u64::MAX >> 4) / 2; + +pub enum ResourceConsumeType { + CpuTime(Duration), + IoBytes(u64), +} + +/// ResourceGroupManager manages the metadata of each resource group. +#[derive(Default)] +pub struct ResourceGroupManager { + resource_groups: DashMap, + registry: RwLock>>, +} + +impl ResourceGroupManager { + fn get_ru_setting(rg: &ResourceGroup, is_read: bool) -> u64 { + match (rg.get_mode(), is_read) { + // RU mode, read and write use the same setting. + (GroupMode::RuMode, _) => rg + .get_r_u_settings() + .get_r_u() + .get_settings() + .get_fill_rate(), + // TODO: currently we only consider the cpu usage in the read path, we may also take + // io read bytes into account later. + (GroupMode::RawMode, true) => rg + .get_raw_resource_settings() + .get_cpu() + .get_settings() + .get_fill_rate(), + (GroupMode::RawMode, false) => rg + .get_raw_resource_settings() + .get_io_write() + .get_settings() + .get_fill_rate(), + // return a default value for unsupported config. 
+ (GroupMode::Unknown, _) => 1, + } + } + + pub fn add_resource_group(&self, rg: ResourceGroup) { + let group_name = rg.get_name().to_ascii_lowercase(); + self.registry.read().iter().for_each(|controller| { + let ru_quota = Self::get_ru_setting(&rg, controller.is_read); + controller.add_resource_group(group_name.clone().into_bytes(), ru_quota, rg.priority); + }); + info!("add resource group"; "name"=> &rg.name, "ru" => rg.get_r_u_settings().get_r_u().get_settings().get_fill_rate()); + self.resource_groups.insert(group_name, rg); + } + + pub fn remove_resource_group(&self, name: &str) { + let group_name = name.to_ascii_lowercase(); + self.registry.read().iter().for_each(|controller| { + controller.remove_resource_group(group_name.as_bytes()); + }); + info!("remove resource group"; "name"=> name); + self.resource_groups.remove(&group_name); + } + + pub fn retain(&self, mut f: impl FnMut(&String, &ResourceGroup) -> bool) { + let mut removed_names = vec![]; + self.resource_groups.retain(|k, v| { + let ret = f(k, v); + if !ret { + removed_names.push(k.clone()); + } + ret + }); + if !removed_names.is_empty() { + self.registry.read().iter().for_each(|controller| { + for name in &removed_names { + controller.remove_resource_group(name.as_bytes()); + } + }); + } + } + + pub fn get_resource_group(&self, name: &str) -> Option> { + self.resource_groups.get(&name.to_ascii_lowercase()) + } + + pub fn get_all_resource_groups(&self) -> Vec { + self.resource_groups.iter().map(|g| g.clone()).collect() + } + + pub fn derive_controller(&self, name: String, is_read: bool) -> Arc { + let controller = Arc::new(ResourceController::new(name, is_read)); + self.registry.write().push(controller.clone()); + for g in &self.resource_groups { + let ru_quota = Self::get_ru_setting(g.value(), controller.is_read); + controller.add_resource_group(g.key().clone().into_bytes(), ru_quota, g.priority); + } + controller + } + + pub fn advance_min_virtual_time(&self) { + for controller in 
self.registry.read().iter() { + controller.update_min_virtual_time(); + } + } + + pub fn consume_penalty(&self, ctx: &ResourceControlContext) { + for controller in self.registry.read().iter() { + // FIXME: Should consume CPU time for read controller and write bytes for write + // controller, once CPU process time of scheduler worker is tracked. Currently, + // we consume write bytes for read controller as the + // order of magnitude of CPU time and write bytes is similar. + controller.consume( + ctx.resource_group_name.as_bytes(), + ResourceConsumeType::CpuTime(Duration::from_nanos( + (ctx.get_penalty().total_cpu_time_ms * 1_000_000.0) as u64, + )), + ); + controller.consume( + ctx.resource_group_name.as_bytes(), + ResourceConsumeType::IoBytes(ctx.get_penalty().write_bytes as u64), + ); + } + } +} + +pub struct ResourceController { + // resource controller name is not used currently. + #[allow(dead_code)] + name: String, + // We handle the priority differently between read and write request: + // 1. the priority factor is calculate based on read/write RU settings. + // 2. for read request, we increase a constant virtual time delta at each `get_priority` call + // because the cost can't be calculated at start, so we only increase a constant delta and + // increase the real cost after task is executed; but don't increase it at write because + // the cost is known so we just pre-consume it. + is_read: bool, + // Track the maximum ru quota used to calculate the factor of each resource group. + // factor = max_ru_quota / group_ru_quota * 10.0 + // We use mutex here to ensure when we need to change this value and do adjust all resource + // groups' factors, it can't be changed concurrently. + // NOTE: becuase the ru config for "default" group is very large and it can cause very big + // group weight, we will not count this value by default. 
+ max_ru_quota: Mutex, + // record consumption of each resource group, name --> resource_group + resource_consumptions: RwLock, GroupPriorityTracker>>, + // the latest min vt, this value is used to init new added group vt + last_min_vt: AtomicU64, + // the last time min vt is overflow + last_rest_vt_time: Cell, + // whether the settings is customized by user + customized: AtomicBool, +} + +// we are ensure to visit the `last_rest_vt_time` by only 1 thread so it's +// thread safe. +unsafe impl Send for ResourceController {} +unsafe impl Sync for ResourceController {} + +impl ResourceController { + pub fn new(name: String, is_read: bool) -> Self { + let controller = Self { + name, + is_read, + resource_consumptions: RwLock::new(HashMap::default()), + last_min_vt: AtomicU64::new(0), + max_ru_quota: Mutex::new(DEFAULT_MAX_RU_QUOTA), + last_rest_vt_time: Cell::new(Instant::now_coarse()), + customized: AtomicBool::new(false), + }; + // add the "default" resource group + controller.add_resource_group( + DEFAULT_RESOURCE_GROUP_NAME.as_bytes().to_owned(), + 0, + MEDIUM_PRIORITY, + ); + controller + } + + fn calculate_factor(max_quota: u64, quota: u64) -> u64 { + // we don't adjust the max_quota if it's the "default" group's default + // value(u32::MAX), so here it is possible that the quota is bigger than + // the max quota + if quota == 0 || quota > max_quota { + 1 + } else { + // we use max_quota / quota as the resource group factor, but because we need to + // cast the value to integer, so we times it by 10 to ensure the accuracy is + // enough. 
+ let max_quota = min(max_quota * 10, MAX_RU_QUOTA); + (max_quota as f64 / quota as f64).round() as u64 + } + } + + fn add_resource_group(&self, name: Vec, mut ru_quota: u64, mut group_priority: u32) { + if group_priority == 0 { + // map 0 to medium priority(default priority) + group_priority = MEDIUM_PRIORITY; + } + if ru_quota > MAX_RU_QUOTA { + ru_quota = MAX_RU_QUOTA; + } + + let mut max_ru_quota = self.max_ru_quota.lock().unwrap(); + // skip to adjust max ru if it is the "default" group and the ru config eq + // MAX_RU_QUOTA + if ru_quota > *max_ru_quota && (name != "default".as_bytes() || ru_quota < MAX_RU_QUOTA) { + *max_ru_quota = ru_quota; + // adjust all group weight because the current value is too small. + self.adjust_all_resource_group_factors(ru_quota); + } + let weight = Self::calculate_factor(*max_ru_quota, ru_quota); + + let vt_delta_for_get = if self.is_read { + DEFAULT_PRIORITY_PER_READ_TASK * weight + } else { + 0 + }; + let group = GroupPriorityTracker { + ru_quota, + group_priority, + weight, + virtual_time: AtomicU64::new(self.last_min_vt.load(Ordering::Acquire)), + vt_delta_for_get, + }; + + // maybe update existed group + self.resource_consumptions.write().insert(name, group); + self.check_customized(); + } + + fn check_customized(&self) { + let groups = self.resource_consumptions.read(); + if groups.len() == 1 && groups.get(DEFAULT_RESOURCE_GROUP_NAME.as_bytes()).is_some() { + self.customized.store(false, Ordering::Release); + return; + } + self.customized.store(true, Ordering::Release); + } + + // we calculate the weight of each resource group based on the currently maximum + // ru quota, if a incoming resource group has a bigger quota, we need to + // adjust all the existing groups. As we expect this won't happen very + // often, and iterate 10k entry cost less than 5ms, so the performance is + // acceptable. 
+ fn adjust_all_resource_group_factors(&self, max_ru_quota: u64) { + self.resource_consumptions + .write() + .iter_mut() + .for_each(|(_, tracker)| { + tracker.weight = Self::calculate_factor(max_ru_quota, tracker.ru_quota); + }); + } + + fn remove_resource_group(&self, name: &[u8]) { + // do not remove the default resource group, reset to default setting instead. + if DEFAULT_RESOURCE_GROUP_NAME.as_bytes() == name { + self.add_resource_group( + DEFAULT_RESOURCE_GROUP_NAME.as_bytes().to_owned(), + 0, + MEDIUM_PRIORITY, + ); + self.check_customized(); + return; + } + self.resource_consumptions.write().remove(name); + self.check_customized(); + } + + pub fn is_customized(&self) -> bool { + self.customized.load(Ordering::Acquire) + } + + #[inline] + fn resource_group(&self, name: &[u8]) -> MappedRwLockReadGuard<'_, GroupPriorityTracker> { + let guard = self.resource_consumptions.read(); + RwLockReadGuard::map(guard, |m| { + if let Some(g) = m.get(name) { + g + } else { + m.get(DEFAULT_RESOURCE_GROUP_NAME.as_bytes()).unwrap() + } + }) + } + + pub fn consume(&self, name: &[u8], resource: ResourceConsumeType) { + self.resource_group(name).consume(resource) + } + + pub fn update_min_virtual_time(&self) { + let start = Instant::now_coarse(); + let mut min_vt = u64::MAX; + let mut max_vt = 0; + self.resource_consumptions + .read() + .iter() + .for_each(|(_, tracker)| { + let vt = tracker.current_vt(); + min_vt = min(min_vt, vt); + max_vt = max(max_vt, vt); + }); + + // TODO: use different threshold for different resource type + // needn't do update if the virtual different is less than 100ms/100KB. 
+ if min_vt + 100_000 >= max_vt && max_vt < RESET_VT_THRESHOLD { + return; + } + + fail_point!("increase_vt_duration_update_min_vt"); + + let near_overflow = min_vt > RESET_VT_THRESHOLD; + self.resource_consumptions + .read() + .iter() + .for_each(|(_, tracker)| { + let vt = tracker.current_vt(); + // NOTE: this decrease vt is not atomic across all resource groups, + // but it should be ok as this operation should be extremely rare + // and the impact is not big. + if near_overflow { + tracker.decrease_vt(RESET_VT_THRESHOLD); + } else if vt < max_vt { + // TODO: is increase by half is a good choice. + tracker.increase_vt((max_vt - vt) / 2); + } + }); + if near_overflow { + let end = Instant::now_coarse(); + info!("all resource groups' virtual time are near overflow, do reset"; + "min" => min_vt, "max" => max_vt, "dur" => ?end.duration_since(start), + "reset_dur" => ?end.duration_since(self.last_rest_vt_time.get())); + max_vt -= RESET_VT_THRESHOLD; + self.last_rest_vt_time.set(end); + } + // max_vt is actually a little bigger than the current min vt, but we don't + // need totally accurate here. + self.last_min_vt.store(max_vt, Ordering::Relaxed); + } + + pub fn get_priority(&self, name: &[u8], pri: CommandPri) -> u64 { + let level = match pri { + CommandPri::Low => 2, + CommandPri::Normal => 1, + CommandPri::High => 0, + }; + self.resource_group(name).get_priority(level) + } +} + +impl TaskPriorityProvider for ResourceController { + fn priority_of(&self, extras: &yatp::queue::Extras) -> u64 { + self.resource_group(extras.metadata()) + .get_priority(extras.current_level() as usize) + } +} + +fn concat_priority_vt(group_priority: u32, vt: u64) -> u64 { + assert!((1..=16).contains(&group_priority)); + + // map group_priority from [1, 16] to [0, 15] to limit it 4 bits and get bitwise + // negation to replace leading 4 bits of vt. So that the priority is ordered in + // the descending order by group_priority first, then by vt in ascending order. 
+ vt | (!((group_priority - 1) as u64) << 60) +} + +struct GroupPriorityTracker { + // the ru setting of this group. + ru_quota: u64, + group_priority: u32, + weight: u64, + virtual_time: AtomicU64, + // the constant delta value for each `get_priority` call, + vt_delta_for_get: u64, +} + +impl GroupPriorityTracker { + fn get_priority(&self, level: usize) -> u64 { + let task_extra_priority = TASK_EXTRA_FACTOR_BY_LEVEL[level] * 1000 * self.weight; + let vt = (if self.vt_delta_for_get > 0 { + self.virtual_time + .fetch_add(self.vt_delta_for_get, Ordering::Relaxed) + + self.vt_delta_for_get + } else { + self.virtual_time.load(Ordering::Relaxed) + }) + task_extra_priority; + concat_priority_vt(self.group_priority, vt) + } + + #[inline] + fn current_vt(&self) -> u64 { + self.virtual_time.load(Ordering::Relaxed) + } + + #[inline] + fn increase_vt(&self, vt_delta: u64) { + self.virtual_time.fetch_add(vt_delta, Ordering::Relaxed); + } + + #[inline] + fn decrease_vt(&self, vt_delta: u64) { + self.virtual_time.fetch_sub(vt_delta, Ordering::Relaxed); + } + + // TODO: make it delta type as generic to avoid mixed consume different types. 
+ #[inline] + fn consume(&self, resource: ResourceConsumeType) { + let vt_delta = match resource { + ResourceConsumeType::CpuTime(dur) => dur.as_micros() as u64, + ResourceConsumeType::IoBytes(bytes) => bytes, + } * self.weight; + self.increase_vt(vt_delta); + } +} + +#[cfg(test)] +pub(crate) mod tests { + use rand::{thread_rng, RngCore}; + use yatp::queue::Extras; + + use super::*; + + pub fn new_resource_group_ru(name: String, ru: u64, group_priority: u32) -> ResourceGroup { + new_resource_group(name, true, ru, ru, group_priority) + } + + pub fn new_resource_group( + name: String, + is_ru_mode: bool, + read_tokens: u64, + write_tokens: u64, + group_priority: u32, + ) -> ResourceGroup { + use kvproto::resource_manager::{GroupRawResourceSettings, GroupRequestUnitSettings}; + + let mut group = ResourceGroup::new(); + group.set_name(name); + let mode = if is_ru_mode { + GroupMode::RuMode + } else { + GroupMode::RawMode + }; + group.set_mode(mode); + group.set_priority(group_priority); + if is_ru_mode { + assert!(read_tokens == write_tokens); + let mut ru_setting = GroupRequestUnitSettings::new(); + ru_setting + .mut_r_u() + .mut_settings() + .set_fill_rate(read_tokens); + group.set_r_u_settings(ru_setting); + } else { + let mut resource_setting = GroupRawResourceSettings::new(); + resource_setting + .mut_cpu() + .mut_settings() + .set_fill_rate(read_tokens); + resource_setting + .mut_io_write() + .mut_settings() + .set_fill_rate(write_tokens); + group.set_raw_resource_settings(resource_setting); + } + group + } + + #[test] + fn test_resource_group() { + let resource_manager = ResourceGroupManager::default(); + + let group1 = new_resource_group_ru("TEST".into(), 100, 0); + resource_manager.add_resource_group(group1); + + assert!(resource_manager.get_resource_group("test1").is_none()); + let group = resource_manager.get_resource_group("test").unwrap(); + assert_eq!( + group + .value() + .get_r_u_settings() + .get_r_u() + .get_settings() + .get_fill_rate(), + 100 + ); + 
drop(group); + assert_eq!(resource_manager.resource_groups.len(), 1); + + let group1 = new_resource_group_ru("Test".into(), 200, LOW_PRIORITY); + resource_manager.add_resource_group(group1); + let group = resource_manager.get_resource_group("test").unwrap(); + assert_eq!( + group + .value() + .get_r_u_settings() + .get_r_u() + .get_settings() + .get_fill_rate(), + 200 + ); + assert_eq!(group.value().get_priority(), 1); + drop(group); + assert_eq!(resource_manager.resource_groups.len(), 1); + + let group2 = new_resource_group_ru("test2".into(), 400, 0); + resource_manager.add_resource_group(group2); + assert_eq!(resource_manager.resource_groups.len(), 2); + + let resource_ctl = resource_manager.derive_controller("test_read".into(), true); + assert_eq!(resource_ctl.resource_consumptions.read().len(), 3); + + let group1 = resource_ctl.resource_group("test".as_bytes()); + let group2 = resource_ctl.resource_group("test2".as_bytes()); + assert_eq!(group1.weight, group2.weight * 2); + assert_eq!(group1.current_vt(), 0); + + let mut extras1 = Extras::single_level(); + extras1.set_metadata("test".as_bytes().to_owned()); + assert_eq!( + resource_ctl.priority_of(&extras1), + concat_priority_vt(LOW_PRIORITY, group1.weight * 50) + ); + assert_eq!(group1.current_vt(), group1.weight * 50); + + let mut extras2 = Extras::single_level(); + extras2.set_metadata("test2".as_bytes().to_owned()); + assert_eq!( + resource_ctl.priority_of(&extras2), + concat_priority_vt(MEDIUM_PRIORITY, group2.weight * 50) + ); + assert_eq!(group2.current_vt(), group2.weight * 50); + + let mut extras3 = Extras::single_level(); + extras3.set_metadata("unknown_group".as_bytes().to_owned()); + assert_eq!( + resource_ctl.priority_of(&extras3), + concat_priority_vt(MEDIUM_PRIORITY, 50) + ); + assert_eq!( + resource_ctl + .resource_group("default".as_bytes()) + .current_vt(), + 50 + ); + + resource_ctl.consume( + "test".as_bytes(), + ResourceConsumeType::CpuTime(Duration::from_micros(10000)), + ); + 
resource_ctl.consume( + "test2".as_bytes(), + ResourceConsumeType::CpuTime(Duration::from_micros(10000)), + ); + + assert_eq!(group1.current_vt(), group1.weight * 10050); + assert_eq!(group1.current_vt(), group2.current_vt() * 2); + + // test update all group vts + resource_manager.advance_min_virtual_time(); + let group1_vt = group1.current_vt(); + let group1_weight = group1.weight; + assert_eq!(group1_vt, group1.weight * 10050); + assert!(group2.current_vt() >= group1.current_vt() * 3 / 4); + assert!( + resource_ctl + .resource_group("default".as_bytes()) + .current_vt() + >= group1.current_vt() / 2 + ); + + drop(group1); + drop(group2); + + // test add 1 new resource group + let new_group = new_resource_group_ru("new_group".into(), 600, HIGH_PRIORITY); + resource_manager.add_resource_group(new_group); + + assert_eq!(resource_ctl.resource_consumptions.read().len(), 4); + let group3 = resource_ctl.resource_group("new_group".as_bytes()); + assert!(group1_weight - 10 <= group3.weight * 3 && group3.weight * 3 <= group1_weight + 10); + assert!(group3.current_vt() >= group1_vt / 2); + } + + #[test] + fn test_reset_resource_group_vt() { + let resource_manager = ResourceGroupManager::default(); + let resource_ctl = resource_manager.derive_controller("test_write".into(), false); + + let group1 = new_resource_group_ru("g1".into(), i32::MAX as u64, 1); + resource_manager.add_resource_group(group1); + let group2 = new_resource_group_ru("g2".into(), 1, 16); + resource_manager.add_resource_group(group2); + + let g1 = resource_ctl.resource_group("g1".as_bytes()); + let g2 = resource_ctl.resource_group("g2".as_bytes()); + let threshold = 1 << 59; + let mut last_g2_vt = 0; + for i in 0..8 { + resource_ctl.consume("g2".as_bytes(), ResourceConsumeType::IoBytes(1 << 25)); + resource_manager.advance_min_virtual_time(); + if i < 7 { + assert!(g2.current_vt() < threshold); + } + // after 8 round, g1's vt still under the threshold and is still increasing. 
+ assert!(g1.current_vt() < threshold && g1.current_vt() > last_g2_vt); + last_g2_vt = g2.current_vt(); + } + + resource_ctl.consume("g2".as_bytes(), ResourceConsumeType::IoBytes(1 << 25)); + resource_manager.advance_min_virtual_time(); + assert!(g1.current_vt() > threshold); + + // adjust again, the virtual time of each group should decrease + resource_manager.advance_min_virtual_time(); + let g1_vt = g1.current_vt(); + let g2_vt = g2.current_vt(); + assert!(g2_vt < threshold / 2); + assert!(g1_vt < threshold / 2 && g1_vt < g2_vt); + assert_eq!(resource_ctl.last_min_vt.load(Ordering::Relaxed), g2_vt); + } + + #[test] + fn test_adjust_resource_group_weight() { + let resource_manager = ResourceGroupManager::default(); + let resource_ctl = resource_manager.derive_controller("test_read".into(), true); + let resource_ctl_write = resource_manager.derive_controller("test_write".into(), false); + assert_eq!(resource_ctl.is_customized(), false); + assert_eq!(resource_ctl_write.is_customized(), false); + let group1 = new_resource_group_ru("test1".into(), 5000, 0); + resource_manager.add_resource_group(group1); + assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 20); + assert_eq!( + resource_ctl_write.resource_group("test1".as_bytes()).weight, + 20 + ); + assert_eq!(resource_ctl.is_customized(), true); + assert_eq!(resource_ctl_write.is_customized(), true); + + // add a resource group with big ru + let group1 = new_resource_group_ru("test2".into(), 50000, 0); + resource_manager.add_resource_group(group1); + assert_eq!(*resource_ctl.max_ru_quota.lock().unwrap(), 50000); + assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 100); + assert_eq!(resource_ctl.resource_group("test2".as_bytes()).weight, 10); + // resource_ctl_write should be unchanged. 
+ assert_eq!(*resource_ctl_write.max_ru_quota.lock().unwrap(), 50000); + assert_eq!( + resource_ctl_write.resource_group("test1".as_bytes()).weight, + 100 + ); + assert_eq!( + resource_ctl_write.resource_group("test2".as_bytes()).weight, + 10 + ); + + // add the default "default" group, the ru weight should not change. + // add a resource group with big ru + let group = new_resource_group_ru("default".into(), u32::MAX as u64, 0); + resource_manager.add_resource_group(group); + assert_eq!( + resource_ctl_write.resource_group("test1".as_bytes()).weight, + 100 + ); + assert_eq!( + resource_ctl_write + .resource_group("default".as_bytes()) + .weight, + 1 + ); + + // change the default group to another value, it can impact the ru then. + let group = new_resource_group_ru("default".into(), 100000, 0); + resource_manager.add_resource_group(group); + assert_eq!( + resource_ctl_write.resource_group("test1".as_bytes()).weight, + 200 + ); + assert_eq!( + resource_ctl_write + .resource_group("default".as_bytes()) + .weight, + 10 + ); + } + + #[test] + fn test_reset_resource_group_vt_overflow() { + let resource_manager = ResourceGroupManager::default(); + let resource_ctl = resource_manager.derive_controller("test_write".into(), false); + let mut rng = thread_rng(); + + let mut min_delta = u64::MAX; + let mut max_delta = 0; + for i in 0..10 { + let name = format!("g{}", i); + let g = new_resource_group_ru(name.clone(), 100, 1); + resource_manager.add_resource_group(g); + let delta = rng.next_u64() % 10000 + 1; + min_delta = delta.min(min_delta); + max_delta = delta.max(max_delta); + resource_ctl + .resource_group(name.as_bytes()) + .increase_vt(RESET_VT_THRESHOLD + delta); + } + resource_ctl + .resource_group("default".as_bytes()) + .increase_vt(RESET_VT_THRESHOLD + 1); + + let old_max_vt = resource_ctl + .resource_consumptions + .read() + .iter() + .fold(0, |v, (_, g)| v.max(g.current_vt())); + let resource_ctl_cloned = resource_ctl.clone(); + 
fail::cfg_callback("increase_vt_duration_update_min_vt", move || { + resource_ctl_cloned + .resource_consumptions + .read() + .iter() + .enumerate() + .for_each(|(i, (_, tracker))| { + if i % 2 == 0 { + tracker.increase_vt(max_delta - min_delta); + } + }); + }) + .unwrap(); + resource_ctl.update_min_virtual_time(); + fail::remove("increase_vt_duration_update_min_vt"); + + let new_max_vt = resource_ctl + .resource_consumptions + .read() + .iter() + .fold(0, |v, (_, g)| v.max(g.current_vt())); + // check all vt has decreased by RESET_VT_THRESHOLD. + assert!(new_max_vt < max_delta * 2); + // check fail-point takes effect, the `new_max_vt` has increased. + assert!(old_max_vt - RESET_VT_THRESHOLD < new_max_vt); + } + + #[test] + fn test_retain_resource_groups() { + let resource_manager = ResourceGroupManager::default(); + let resource_ctl = resource_manager.derive_controller("test_read".into(), true); + let resource_ctl_write = resource_manager.derive_controller("test_write".into(), false); + + for i in 0..5 { + let group1 = new_resource_group_ru(format!("test{}", i), 100, 0); + resource_manager.add_resource_group(group1); + // add a resource group with big ru + let group1 = new_resource_group_ru(format!("group{}", i), 100, 0); + resource_manager.add_resource_group(group1); + } + // consume for default group + resource_ctl.consume( + b"default", + ResourceConsumeType::CpuTime(Duration::from_micros(10000)), + ); + resource_ctl_write.consume(b"default", ResourceConsumeType::IoBytes(10000)); + + assert_eq!(resource_manager.get_all_resource_groups().len(), 10); + assert_eq!(resource_ctl.resource_consumptions.read().len(), 11); // 10 + 1(default) + assert_eq!(resource_ctl_write.resource_consumptions.read().len(), 11); + + resource_manager.retain(|k, _v| k.starts_with("test")); + assert_eq!(resource_manager.get_all_resource_groups().len(), 5); + assert_eq!(resource_ctl.resource_consumptions.read().len(), 6); + assert_eq!(resource_ctl_write.resource_consumptions.read().len(), 
6); + assert!(resource_manager.get_resource_group("group1").is_none()); + // should use the virtual time of default group for non-exist group + assert_ne!( + resource_ctl + .resource_group("group2".as_bytes()) + .current_vt(), + 0 + ); + assert_ne!( + resource_ctl_write + .resource_group("group2".as_bytes()) + .current_vt(), + 0 + ); + } + + #[test] + fn test_concat_priority_vt() { + let v1 = concat_priority_vt(MEDIUM_PRIORITY, 1000); + let v2 = concat_priority_vt(MEDIUM_PRIORITY, 1111); + assert!(v1 < v2); + + let v3 = concat_priority_vt(LOW_PRIORITY, 1000); + assert!(v1 < v3); + + let v4 = concat_priority_vt(MEDIUM_PRIORITY, 1111); + assert_eq!(v2, v4); + + let v5 = concat_priority_vt(HIGH_PRIORITY, 10); + assert!(v5 < v1); + } +} diff --git a/components/resource_control/src/service.rs b/components/resource_control/src/service.rs new file mode 100644 index 00000000000..82c01eae398 --- /dev/null +++ b/components/resource_control/src/service.rs @@ -0,0 +1,306 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{collections::HashSet, sync::Arc, time::Duration}; + +use futures::{compat::Future01CompatExt, StreamExt}; +use kvproto::{pdpb::EventType, resource_manager::ResourceGroup}; +use pd_client::{Error as PdError, PdClient, RpcClient, RESOURCE_CONTROL_CONFIG_PATH}; +use tikv_util::{error, timer::GLOBAL_TIMER_HANDLE}; + +use crate::ResourceGroupManager; + +#[derive(Clone)] +pub struct ResourceManagerService { + manager: Arc, + pd_client: Arc, + // record watch revision + revision: i64, +} + +impl ResourceManagerService { + /// Constructs a new `Service` with `ResourceGroupManager` and a `RpcClient` + pub fn new( + manager: Arc, + pd_client: Arc, + ) -> ResourceManagerService { + ResourceManagerService { + pd_client, + manager, + revision: 0, + } + } +} + +const RETRY_INTERVAL: Duration = Duration::from_secs(1); // to consistent with pd_client + +impl ResourceManagerService { + pub async fn watch_resource_groups(&mut self) { + 'outer: loop { + // Firstly, load all resource groups as of now. + self.reload_all_resource_groups().await; + // Secondly, start watcher at loading revision. 
+ loop { + match self + .pd_client + .watch_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string(), self.revision) + { + Ok(mut stream) => { + while let Some(grpc_response) = stream.next().await { + match grpc_response { + Ok(r) => { + self.revision = r.get_revision(); + r.get_changes() + .iter() + .for_each(|item| match item.get_kind() { + EventType::Put => { + match protobuf::parse_from_bytes::( + item.get_payload(), + ) { + Ok(group) => { + self.manager.add_resource_group(group); + } + Err(e) => { + error!("parse put resource group event failed"; "name" => item.get_name(), "err" => ?e); + } + } + } + EventType::Delete => { + match protobuf::parse_from_bytes::( + item.get_payload(), + ) { + Ok(group) => { + self.manager.remove_resource_group(group.get_name()); + } + Err(e) => { + error!("parse delete resource group event failed"; "name" => item.get_name(), "err" => ?e); + } + } + } + }); + } + Err(err) => { + error!("failed to get stream"; "err" => ?err); + let _ = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + RETRY_INTERVAL) + .compat() + .await; + } + } + } + } + Err(PdError::DataCompacted(msg)) => { + error!("required revision has been compacted"; "err" => ?msg); + continue 'outer; + } + Err(err) => { + error!("failed to watch resource groups"; "err" => ?err); + let _ = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + RETRY_INTERVAL) + .compat() + .await; + } + } + } + } + } + + async fn reload_all_resource_groups(&mut self) { + loop { + match self + .pd_client + .load_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string()) + .await + { + Ok((items, revision)) => { + let mut vaild_groups = HashSet::with_capacity(items.len()); + items.iter().for_each(|g| { + match protobuf::parse_from_bytes::(g.get_payload()) { + Ok(rg) => { + vaild_groups.insert(rg.get_name().to_ascii_lowercase()); + self.manager.add_resource_group(rg); + } + Err(e) => { + error!("parse resource group failed"; "name" => g.get_name(), "err" => ?e); + } + } + }); + + 
self.manager.retain(|name, _g| vaild_groups.contains(name)); + self.revision = revision; + return; + } + Err(err) => { + error!("failed to load global config"; "err" => ?err); + let _ = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + RETRY_INTERVAL) + .compat() + .await; + } + } + } + } +} + +#[cfg(test)] +pub mod tests { + use std::time::Duration; + + use futures::executor::block_on; + use kvproto::pdpb::GlobalConfigItem; + use pd_client::RpcClient; + use protobuf::Message; + use test_pd::{mocker::Service, util::*, Server as MockServer}; + use tikv_util::{config::ReadableDuration, worker::Builder}; + + use crate::resource_group::tests::{new_resource_group, new_resource_group_ru}; + + fn new_test_server_and_client( + update_interval: ReadableDuration, + ) -> (MockServer, RpcClient) { + let server = MockServer::new(1); + let eps = server.bind_addrs(); + let client = new_client_with_update_interval(eps, None, update_interval); + (server, client) + } + + fn add_resource_group(pd_client: Arc, group: ResourceGroup) { + let mut item = GlobalConfigItem::default(); + item.set_kind(EventType::Put); + item.set_name(group.get_name().to_string()); + let mut buf = Vec::new(); + group.write_to_vec(&mut buf).unwrap(); + item.set_payload(buf); + + futures::executor::block_on(async move { + pd_client + .store_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string(), vec![item]) + .await + }) + .unwrap(); + } + + fn delete_resource_group(pd_client: Arc, name: &str) { + let mut item = GlobalConfigItem::default(); + item.set_kind(EventType::Delete); + item.set_name(name.to_string()); + + futures::executor::block_on(async move { + pd_client + .store_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string(), vec![item]) + .await + }) + .unwrap(); + } + + use super::*; + #[test] + fn crud_config_test() { + let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let resource_manager = ResourceGroupManager::default(); + + let mut s = 
ResourceManagerService::new(Arc::new(resource_manager), Arc::new(client)); + let group = new_resource_group("TEST".into(), true, 100, 100, 0); + add_resource_group(s.pd_client.clone(), group); + block_on(s.reload_all_resource_groups()); + assert_eq!(s.manager.get_all_resource_groups().len(), 1); + assert_eq!(s.revision, 1); + + delete_resource_group(s.pd_client.clone(), "TEST"); + block_on(s.reload_all_resource_groups()); + assert_eq!(s.manager.get_all_resource_groups().len(), 0); + assert_eq!(s.revision, 2); + + server.stop(); + } + + #[test] + fn watch_config_test() { + let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let resource_manager = ResourceGroupManager::default(); + + let mut s = ResourceManagerService::new(Arc::new(resource_manager), Arc::new(client)); + block_on(s.reload_all_resource_groups()); + assert_eq!(s.manager.get_all_resource_groups().len(), 0); + assert_eq!(s.revision, 0); + + // TODO: find a better way to observe the watch is ready. 
+ let wait_watch_ready = |s: &ResourceManagerService, count: usize| { + for _i in 0..100 { + if s.manager.get_all_resource_groups().len() == count { + return; + } + std::thread::sleep(Duration::from_millis(1)); + } + panic!( + "wait time out, expectd: {}, got: {}", + count, + s.manager.get_all_resource_groups().len() + ); + }; + + let background_worker = Builder::new("background").thread_count(1).create(); + let mut s_clone = s.clone(); + background_worker.spawn_async_task(async move { + s_clone.watch_resource_groups().await; + }); + // Mock add + let group1 = new_resource_group_ru("TEST1".into(), 100, 0); + add_resource_group(s.pd_client.clone(), group1); + let group2 = new_resource_group_ru("TEST2".into(), 100, 0); + add_resource_group(s.pd_client.clone(), group2); + // Mock modify + let group2 = new_resource_group_ru("TEST2".into(), 50, 0); + add_resource_group(s.pd_client.clone(), group2); + wait_watch_ready(&s, 2); + + // Mock delete + delete_resource_group(s.pd_client.clone(), "TEST1"); + + // Wait for watcher + wait_watch_ready(&s, 1); + let groups = s.manager.get_all_resource_groups(); + assert_eq!(groups.len(), 1); + assert!(s.manager.get_resource_group("TEST1").is_none()); + let group = s.manager.get_resource_group("TEST2").unwrap(); + assert_eq!( + group + .value() + .get_r_u_settings() + .get_r_u() + .get_settings() + .get_fill_rate(), + 50 + ); + server.stop(); + } + + #[test] + fn reboot_watch_server_test() { + let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let resource_manager = ResourceGroupManager::default(); + + let s = ResourceManagerService::new(Arc::new(resource_manager), Arc::new(client)); + let background_worker = Builder::new("background").thread_count(1).create(); + let mut s_clone = s.clone(); + background_worker.spawn_async_task(async move { + s_clone.watch_resource_groups().await; + }); + // Mock add + let group1 = new_resource_group_ru("TEST1".into(), 100, 0); + 
add_resource_group(s.pd_client.clone(), group1); + // Mock reboot watch server + let watch_global_config_fp = "watch_global_config_return"; + fail::cfg(watch_global_config_fp, "return").unwrap(); + std::thread::sleep(Duration::from_millis(100)); + fail::remove(watch_global_config_fp); + // Mock add after rebooting will success + let group1 = new_resource_group_ru("TEST2".into(), 100, 0); + add_resource_group(s.pd_client.clone(), group1); + // Wait watcher update + std::thread::sleep(Duration::from_secs(1)); + let groups = s.manager.get_all_resource_groups(); + assert_eq!(groups.len(), 2); + + server.stop(); + } +} diff --git a/components/resource_metering/Cargo.toml b/components/resource_metering/Cargo.toml index cecaa3c911b..f8e26e01c50 100644 --- a/components/resource_metering/Cargo.toml +++ b/components/resource_metering/Cargo.toml @@ -4,30 +4,27 @@ version = "0.0.1" edition = "2018" [dependencies] -collections = { path = "../collections" } +collections = { workspace = true } crossbeam = "0.8" futures = "0.3" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +grpcio = { workspace = true } +kvproto = { workspace = true } lazy_static = "1.3" libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } -online_config = { path = "../online_config" } +online_config = { workspace = true } pdqselect = "0.1" pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } serde = "1.0" serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../tikv_util" } +slog = { workspace = true } +slog-global = { workspace = true } +tikv_util = { workspace = true } [target.'cfg(target_os = 
"linux")'.dependencies] procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229b2c1fcc44118bef7eff127128" } -[target.'cfg(not(target_os = "linux"))'.dependencies] -thread-id = "4" - [dev-dependencies] rand = "0.8" diff --git a/components/resource_metering/src/collector.rs b/components/resource_metering/src/collector.rs index 9e1830b8acb..bdadd638f2e 100644 --- a/components/resource_metering/src/collector.rs +++ b/components/resource_metering/src/collector.rs @@ -15,7 +15,8 @@ use crate::RawRecords; /// to the `Scheduler` for processing. /// /// `Reporter` implements [Runnable] and [RunnableWithTimer], aggregates the -/// data sent by the `Collector` internally, and reports it regularly through RPC. +/// data sent by the `Collector` internally, and reports it regularly through +/// RPC. /// /// [Recorder]: crate::recorder::Recorder /// [Reporter]: crate::reporter::Reporter diff --git a/components/resource_metering/src/config.rs b/components/resource_metering/src/config.rs index ae28536f10e..090768a9493 100644 --- a/components/resource_metering/src/config.rs +++ b/components/resource_metering/src/config.rs @@ -110,7 +110,7 @@ impl ConfigManager { impl online_config::ConfigManager for ConfigManager { fn dispatch(&mut self, change: ConfigChange) -> Result<(), Box> { let mut new_config = self.current_config.clone(); - new_config.update(change); + new_config.update(change)?; new_config.validate()?; if self.current_config.receiver_address != new_config.receiver_address { self.address_notifier @@ -133,34 +133,34 @@ mod tests { #[test] fn test_config_validate() { let cfg = Config::default(); - assert!(cfg.validate().is_ok()); // Empty address is allowed. + cfg.validate().unwrap(); // Empty address is allowed. 
let cfg = Config { receiver_address: "127.0.0.1:6666".to_string(), report_receiver_interval: ReadableDuration::minutes(1), max_resource_groups: 2000, precision: ReadableDuration::secs(1), }; - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); let cfg = Config { receiver_address: "127.0.0.1:6666".to_string(), report_receiver_interval: ReadableDuration::days(999), // invalid max_resource_groups: 2000, precision: ReadableDuration::secs(1), }; - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); let cfg = Config { receiver_address: "127.0.0.1:6666".to_string(), report_receiver_interval: ReadableDuration::minutes(1), max_resource_groups: usize::MAX, // invalid precision: ReadableDuration::secs(1), }; - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); let cfg = Config { receiver_address: "127.0.0.1:6666".to_string(), report_receiver_interval: ReadableDuration::minutes(1), max_resource_groups: 2000, precision: ReadableDuration::days(999), // invalid }; - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); } } diff --git a/components/resource_metering/src/lib.rs b/components/resource_metering/src/lib.rs index 9c1f25e4b0c..ba8e2174e19 100644 --- a/components/resource_metering/src/lib.rs +++ b/components/resource_metering/src/lib.rs @@ -51,9 +51,9 @@ pub const MAX_THREAD_REGISTER_RETRY: u32 = 10; /// This structure is used as a label to distinguish different request contexts. /// -/// In order to associate `ResourceMeteringTag` with a certain piece of code logic, -/// we added a function to [Future] to bind `ResourceMeteringTag` to the specified -/// future context. It is used in the main business logic of TiKV. +/// In order to associate `ResourceMeteringTag` with a certain piece of code +/// logic, we added a function to [Future] to bind `ResourceMeteringTag` to the +/// specified future context. It is used in the main business logic of TiKV. 
/// /// [Future]: futures::Future pub struct ResourceMeteringTag { @@ -143,15 +143,12 @@ impl Drop for Guard { return; } let mut records = ls.summary_records.lock().unwrap(); - match records.get(&tag) { - Some(record) => { - record.merge(&cur_record); - } - None => { - // See MAX_SUMMARY_RECORDS_LEN. - if records.len() < MAX_SUMMARY_RECORDS_LEN { - records.insert(tag, cur_record); - } + if let Some(record) = records.get(&tag) { + record.merge(&cur_record); + } else { + // See MAX_SUMMARY_RECORDS_LEN. + if records.len() < MAX_SUMMARY_RECORDS_LEN { + records.insert(tag, cur_record); } } }) @@ -214,14 +211,15 @@ impl ResourceTagFactory { /// This trait extends the standard [Future]. /// -/// When the user imports [FutureExt], all futures in its module (such as async block) -/// will additionally support the [FutureExt::in_resource_metering_tag] method. This method -/// can bind a [ResourceMeteringTag] to the scope of this future (actually, it is stored in -/// the local storage of the thread where `Future` is located). During the polling period of -/// the future, we can continue to observe the system resources used by the thread in which -/// it is located, which is associated with `ResourceMeteringTag` and is also stored in thread -/// local storage. There is a background thread that continuously summarizes the storage of -/// each thread and reports it regularly. +/// When the user imports [FutureExt], all futures in its module (such as async +/// block) will additionally support the [FutureExt::in_resource_metering_tag] +/// method. This method can bind a [ResourceMeteringTag] to the scope of this +/// future (actually, it is stored in the local storage of the thread where +/// `Future` is located). During the polling period of the future, we can +/// continue to observe the system resources used by the thread in which it is +/// located, which is associated with `ResourceMeteringTag` and is also stored +/// in thread local storage. 
There is a background thread that continuously +/// summarizes the storage of each thread and reports it regularly. /// /// [Future]: futures::Future pub trait FutureExt: Sized { @@ -245,8 +243,9 @@ pub trait StreamExt: Sized { impl StreamExt for T {} -/// This structure is the return value of the [FutureExt::in_resource_metering_tag] method, -/// which wraps the original [Future] with a [ResourceMeteringTag]. +/// This structure is the return value of the +/// [FutureExt::in_resource_metering_tag] method, which wraps the original +/// [Future] with a [ResourceMeteringTag]. /// /// see [FutureExt] for more information. /// diff --git a/components/resource_metering/src/model.rs b/components/resource_metering/src/model.rs index 0cacc6930d4..6f7118ef9e1 100644 --- a/components/resource_metering/src/model.rs +++ b/components/resource_metering/src/model.rs @@ -20,7 +20,7 @@ thread_local! { } /// Raw resource statistics record. -#[derive(Debug, Default, Copy, Clone, Eq, PartialEq)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct RawRecord { pub cpu_time: u32, // ms pub read_keys: u32, @@ -48,7 +48,7 @@ impl RawRecord { /// [Recorder]: crate::recorder::Recorder /// [Reporter]: crate::reporter::Reporter /// [Collector]: crate::collector::Collector -#[derive(Debug, Eq, PartialEq, Clone)] +#[derive(Debug, PartialEq, Clone)] pub struct RawRecords { pub begin_unix_time_secs: u64, pub duration: Duration, @@ -71,7 +71,8 @@ impl Default for RawRecords { } impl RawRecords { - /// Keep a maximum of `k` self.records and aggregate the others into returned [RawRecord]. + /// Keep a maximum of `k` self.records and aggregate the others into + /// returned [RawRecord]. 
pub fn keep_top_k(&mut self, k: usize) -> RawRecord { let mut others = RawRecord::default(); if self.records.len() <= k { diff --git a/components/resource_metering/src/recorder/collector_reg.rs b/components/resource_metering/src/recorder/collector_reg.rs index 8205a2290cb..f166101dfe5 100644 --- a/components/resource_metering/src/recorder/collector_reg.rs +++ b/components/resource_metering/src/recorder/collector_reg.rs @@ -30,16 +30,16 @@ impl CollectorRegHandle { } } - /// Register a collector to the recorder. Dropping the returned [CollectorGuard] will - /// preform deregistering. + /// Register a collector to the recorder. Dropping the returned + /// [CollectorGuard] will preform deregistering. /// - /// The second argument `as_observer` indicates that whether the given `collector` will - /// control the enabled state of the recorder: - /// - When `as_observer` is false, the recorder will respect it and begin to profile if it's - /// off before. In other words, if there is at least one non-observed collector, the recorder - /// will keep running. - /// - When `as_observer` is true, whether the recorder to be on or off won't depend on if - /// the collector exists. + /// The second argument `as_observer` indicates that whether the given + /// `collector` will control the enabled state of the recorder: + /// - When `as_observer` is false, the recorder will respect it and begin to + /// profile if it's off before. In other words, if there is at least one + /// non-observed collector, the recorder will keep running. + /// - When `as_observer` is true, whether the recorder to be on or off won't + /// depend on if the collector exists. 
pub fn register(&self, collector: Box, as_observer: bool) -> CollectorGuard { static NEXT_COLLECTOR_ID: AtomicU64 = AtomicU64::new(1); let id = CollectorId(NEXT_COLLECTOR_ID.fetch_add(1, Ordering::SeqCst)); diff --git a/components/resource_metering/src/recorder/localstorage.rs b/components/resource_metering/src/recorder/localstorage.rs index afc9554a212..c9f0b25b478 100644 --- a/components/resource_metering/src/recorder/localstorage.rs +++ b/components/resource_metering/src/recorder/localstorage.rs @@ -16,10 +16,11 @@ thread_local! { pub static STORAGE: RefCell = RefCell::new(LocalStorage::default()); } -/// `LocalStorage` is a thread-local structure that contains all necessary data of submodules. +/// `LocalStorage` is a thread-local structure that contains all necessary data +/// of submodules. /// -/// In order to facilitate mutual reference, the thread-local data of all sub-modules -/// need to be stored centrally in `LocalStorage`. +/// In order to facilitate mutual reference, the thread-local data of all +/// sub-modules need to be stored centrally in `LocalStorage`. #[derive(Clone, Default)] pub struct LocalStorage { pub registered: bool, diff --git a/components/resource_metering/src/recorder/mod.rs b/components/resource_metering/src/recorder/mod.rs index 92e6d094274..f0b2e88ee4e 100644 --- a/components/resource_metering/src/recorder/mod.rs +++ b/components/resource_metering/src/recorder/mod.rs @@ -288,8 +288,9 @@ impl ConfigChangeNotifier { } } -/// Constructs a default [Recorder], spawn it and return the corresponding [ConfigChangeNotifier], -/// [CollectorRegHandle], [ResourceTagFactory] and [LazyWorker]. +/// Constructs a default [Recorder], spawn it and return the corresponding +/// [ConfigChangeNotifier], [CollectorRegHandle], [ResourceTagFactory] and +/// [LazyWorker]. /// /// This function is intended to simplify external use. 
pub fn init_recorder( @@ -302,8 +303,8 @@ pub fn init_recorder( ) { let recorder = RecorderBuilder::default() .precision_ms(precision_ms) - .add_sub_recorder(Box::new(CpuRecorder::default())) - .add_sub_recorder(Box::new(SummaryRecorder::default())) + .add_sub_recorder(Box::::default()) + .add_sub_recorder(Box::::default()) .build(); let mut recorder_worker = WorkerBuilder::new("resource-metering-recorder") .pending_capacity(256) diff --git a/components/resource_metering/src/recorder/sub_recorder/mod.rs b/components/resource_metering/src/recorder/sub_recorder/mod.rs index e36acb26ddb..42647f3486d 100644 --- a/components/resource_metering/src/recorder/sub_recorder/mod.rs +++ b/components/resource_metering/src/recorder/sub_recorder/mod.rs @@ -8,19 +8,22 @@ use crate::{recorder::localstorage::LocalStorage, RawRecords}; pub mod cpu; pub mod summary; -/// This trait defines a general framework that works at a certain frequency. Typically, -/// it describes the recorder(sampler) framework for a specific resource. +/// This trait defines a general framework that works at a certain frequency. +/// Typically, it describes the recorder(sampler) framework for a specific +/// resource. /// -/// [Recorder] will maintain a list of sub-recorders, driving all sub-recorders to work -/// according to the behavior described in this trait. +/// [Recorder] will maintain a list of sub-recorders, driving all sub-recorders +/// to work according to the behavior described in this trait. pub trait SubRecorder: Send { - /// This function is called at a fixed frequency. (A typical frequency is 99hz.) + /// This function is called at a fixed frequency. (A typical frequency is + /// 99hz.) /// - /// The [RawRecords] and [LocalStorage] map of all threads will be passed in through - /// parameters. We need to collect resources (may be from each `LocalStorage`) and - /// write them into `RawRecords`. 
+ /// The [RawRecords] and [LocalStorage] map of all threads will be passed in + /// through parameters. We need to collect resources (may be from each + /// `LocalStorage`) and write them into `RawRecords`. /// - /// The implementation needs to sample the resource in this function (in general). + /// The implementation needs to sample the resource in this function (in + /// general). /// /// [RawRecords]: crate::model::RawRecords /// [LocalStorage]: crate::localstorage::LocalStorage @@ -30,8 +33,8 @@ pub trait SubRecorder: Send { /// This function is called every time before reporting to Collector. /// The default period is 1 second. /// - /// The [RawRecords] and [LocalStorage] map of all threads will be passed in through parameters. - /// `usize` is thread_id without platform dependency. + /// The [RawRecords] and [LocalStorage] map of all threads will be passed in + /// through parameters. `usize` is thread_id without platform dependency. /// /// [RawRecords]: crate::model::RawRecords /// [LocalStorage]: crate::localstorage::LocalStorage diff --git a/components/resource_metering/src/recorder/sub_recorder/summary.rs b/components/resource_metering/src/recorder/sub_recorder/summary.rs index 34cf07f9caf..93ba95080e3 100644 --- a/components/resource_metering/src/recorder/sub_recorder/summary.rs +++ b/components/resource_metering/src/recorder/sub_recorder/summary.rs @@ -35,8 +35,9 @@ pub fn record_write_keys(count: u32) { /// An implementation of [SubRecorder] for collecting summary data. /// -/// `SummaryRecorder` uses some special methods ([record_read_keys]/[record_write_keys]) -/// to collect external statistical information. +/// `SummaryRecorder` uses some special methods +/// ([record_read_keys]/[record_write_keys]) to collect external statistical +/// information. /// /// See [SubRecorder] for more relevant designs. 
/// @@ -59,7 +60,8 @@ impl SubRecorder for SummaryRecorder { } // The request currently being polled has not yet been merged into the hashmap, // so it needs to be processed separately. (For example, a slow request that is - // blocking needs to reflect in real time how many keys have been read currently) + // blocking needs to reflect in real time how many keys have been read + // currently) if let Some(t) = ls.attached_tag.load_full() { if t.extra_attachment.is_empty() { return; diff --git a/components/resource_metering/src/reporter/data_sink.rs b/components/resource_metering/src/reporter/data_sink.rs index 1dadc2723bc..e453bdd3371 100644 --- a/components/resource_metering/src/reporter/data_sink.rs +++ b/components/resource_metering/src/reporter/data_sink.rs @@ -9,7 +9,8 @@ use crate::error::Result; /// This trait abstracts the interface to communicate with the remote. /// We can simply mock this interface to test without RPC. pub trait DataSink: Send { - // `try_send` pushes a report data into the sink, which will later be sent to a target - // by the sink. If the sink is kept full, or the sink is closed, an error will be returned. + // `try_send` pushes a report data into the sink, which will later be sent to a + // target by the sink. If the sink is kept full, or the sink is closed, an error + // will be returned. fn try_send(&mut self, records: Arc>) -> Result<()>; } diff --git a/components/resource_metering/src/reporter/mod.rs b/components/resource_metering/src/reporter/mod.rs index 024a79bde53..721fb570b22 100644 --- a/components/resource_metering/src/reporter/mod.rs +++ b/components/resource_metering/src/reporter/mod.rs @@ -30,9 +30,9 @@ use crate::{ /// A structure for reporting statistics through [Client]. /// -/// `Reporter` implements [Runnable] and [RunnableWithTimer] to handle [Task]s from -/// the [Scheduler]. It internally aggregates the reported [RawRecords] into [Records] -/// and upload them to the remote server through the `Client`. 
+/// `Reporter` implements [Runnable] and [RunnableWithTimer] to handle [Task]s +/// from the [Scheduler]. It internally aggregates the reported [RawRecords] +/// into [Records] and upload them to the remote server through the `Client`. /// /// [Runnable]: tikv_util::worker::Runnable /// [RunnableWithTimer]: tikv_util::worker::RunnableWithTimer @@ -205,7 +205,8 @@ impl ConfigChangeNotifier { } } -/// Constructs a default [Recorder], start it and return the corresponding [ConfigChangeNotifier], [DataSinkRegHandle] and [LazyWorker]. +/// Constructs a default [Recorder], start it and return the corresponding +/// [ConfigChangeNotifier], [DataSinkRegHandle] and [LazyWorker]. /// /// This function is intended to simplify external use. pub fn init_reporter( diff --git a/components/resource_metering/src/reporter/pubsub.rs b/components/resource_metering/src/reporter/pubsub.rs index 0112a8b17db..62144ec920c 100644 --- a/components/resource_metering/src/reporter/pubsub.rs +++ b/components/resource_metering/src/reporter/pubsub.rs @@ -22,8 +22,9 @@ use crate::{ /// `PubSubService` implements [ResourceMeteringPubSub]. /// -/// If a client subscribes to resource metering records, the `PubSubService` is responsible for -/// registering them to the reporter. Then the reporter sends data to the client periodically. +/// If a client subscribes to resource metering records, the `PubSubService` is +/// responsible for registering them to the reporter. Then the reporter sends +/// data to the client periodically. 
/// /// [ResourceMeteringPubSub]: kvproto::resource_usage_agent_grpc::ResourceMeteringPubSub #[derive(Clone)] diff --git a/components/resource_metering/src/reporter/single_target.rs b/components/resource_metering/src/reporter/single_target.rs index 69817bc847b..09609b84462 100644 --- a/components/resource_metering/src/reporter/single_target.rs +++ b/components/resource_metering/src/reporter/single_target.rs @@ -41,8 +41,8 @@ impl Runnable for SingleTargetDataSink { } } -/// `SingleTargetDataSink` is the default implementation of [DataSink], which uses gRPC -/// to report data to the remote end. +/// `SingleTargetDataSink` is the default implementation of [DataSink], which +/// uses gRPC to report data to the remote end. pub struct SingleTargetDataSink { scheduler: Scheduler, data_sink_reg: DataSinkRegHandle, @@ -246,8 +246,8 @@ impl Drop for Guard { } } -/// Constructs a default [SingleTargetDataSink], start it and return the corresponding [AddressChangeNotifier] -/// and [LazyWorker]. +/// Constructs a default [SingleTargetDataSink], start it and return the +/// corresponding [AddressChangeNotifier] and [LazyWorker]. /// /// This function is intended to simplify external use. pub fn init_single_target( diff --git a/components/resource_metering/tests/summary_test.rs b/components/resource_metering/tests/summary_test.rs index c5a9ae61ac3..ae647055206 100644 --- a/components/resource_metering/tests/summary_test.rs +++ b/components/resource_metering/tests/summary_test.rs @@ -53,7 +53,7 @@ fn test_summary() { let data_sink = MockDataSink::default(); - /* At this point we are ready for everything except turning on the switch. */ + // At this point we are ready for everything except turning on the switch. 
// expect no data { diff --git a/components/security/Cargo.toml b/components/security/Cargo.toml index 2b498bc0965..fdf7ab8e29e 100644 --- a/components/security/Cargo.toml +++ b/components/security/Cargo.toml @@ -5,14 +5,14 @@ edition = "2018" publish = false [dependencies] -collections = { path = "../collections" } -encryption = { path = "../encryption", default-features = false } -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +collections = { workspace = true } +encryption = { workspace = true } +grpcio = { workspace = true } +kvproto = { workspace = true } serde = "1.0" serde_derive = "1.0" serde_json = "1.0" -tikv_util = { path = "../tikv_util", default-features = false } -tonic = "0.5" +tikv_util = { workspace = true } [dev-dependencies] tempfile = "3.0" diff --git a/components/security/src/lib.rs b/components/security/src/lib.rs index ec6cf0e6df2..bbd296ae1f7 100644 --- a/components/security/src/lib.rs +++ b/components/security/src/lib.rs @@ -18,7 +18,6 @@ use grpcio::{ RpcContext, RpcStatus, RpcStatusCode, ServerBuilder, ServerChecker, ServerCredentialsBuilder, ServerCredentialsFetcher, }; -use tonic::transport::{channel::ClientTlsConfig, Certificate, Identity}; #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Default)] #[serde(default)] @@ -40,7 +39,8 @@ pub struct SecurityConfig { /// /// # Arguments /// -/// - `tag`: only used in the error message, like "ca key", "cert key", "private key", etc. +/// - `tag`: only used in the error message, like "ca key", "cert key", +/// "private key", etc. 
fn check_key_file(tag: &str, path: &str) -> Result, Box> { if path.is_empty() { return Ok(None); @@ -68,9 +68,26 @@ fn load_key(tag: &str, path: &str) -> Result, Box> { type CertResult = Result<(Vec, Vec, Vec), Box>; +type Pem = Box<[u8]>; + +pub struct Secret(pub Pem); + +impl std::fmt::Debug for Secret { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("Secret").finish() + } +} + +#[derive(Debug)] +pub struct ClientSuite { + pub ca: Pem, + pub client_cert: Pem, + pub client_key: Secret, +} + impl SecurityConfig { /// Validates ca, cert and private key. - pub fn validate(&self) -> Result<(), Box> { + pub fn validate(&self, raftstore_v2: bool) -> Result<(), Box> { check_key_file("ca key", &self.ca_path)?; check_key_file("cert key", &self.cert_path)?; check_key_file("private key", &self.key_path)?; @@ -80,6 +97,12 @@ impl SecurityConfig { { return Err("ca, cert and private key should be all configured.".into()); } + if raftstore_v2 + && self.encryption.data_encryption_method + != kvproto::encryptionpb::EncryptionMethod::Plaintext + { + return Err("encryption is not supported for partitioned-raft-kv".into()); + } Ok(()) } @@ -122,20 +145,13 @@ impl SecurityManager { }) } - /// Make a tonic tls config via the config. 
- pub fn tonic_tls_config(&self) -> Option { - let (ca, cert, key) = self.cfg.load_certs().unwrap_or_default(); - if ca.is_empty() && cert.is_empty() && key.is_empty() { - return None; - } - let mut cfg = ClientTlsConfig::new(); - if !ca.is_empty() { - cfg = cfg.ca_certificate(Certificate::from_pem(ca)); - } - if !cert.is_empty() && !key.is_empty() { - cfg = cfg.identity(Identity::from_pem(cert, key)); - } - Some(cfg) + pub fn client_suite(&self) -> Result> { + let (ca, cert, key) = self.cfg.load_certs()?; + Ok(ClientSuite { + ca: ca.into_boxed_slice(), + client_cert: cert.into_boxed_slice(), + client_key: Secret(key.into_boxed_slice()), + }) } pub fn connect(&self, mut cb: ChannelBuilder, addr: &str) -> Channel { @@ -163,7 +179,7 @@ impl SecurityManager { sb.bind(addr, port) } else { if !self.cfg.cert_allowed_cn.is_empty() { - let cn_checker = CNChecker { + let cn_checker = CnChecker { allowed_cn: Arc::new(self.cfg.cert_allowed_cn.clone()), }; sb = sb.add_checker(cn_checker); @@ -180,14 +196,18 @@ impl SecurityManager { ) } } + + pub fn get_config(&self) -> &SecurityConfig { + &self.cfg + } } #[derive(Clone)] -struct CNChecker { +struct CnChecker { allowed_cn: Arc>, } -impl ServerChecker for CNChecker { +impl ServerChecker for CnChecker { fn check(&mut self, ctx: &RpcContext<'_>) -> CheckResult { match check_common_name(&self.allowed_cn, ctx) { Ok(()) => CheckResult::Continue, @@ -284,7 +304,7 @@ mod tests { fn test_security() { let cfg = SecurityConfig::default(); // default is disable secure connection. 
- cfg.validate().unwrap(); + cfg.validate(false).unwrap(); let mgr = SecurityManager::new(&cfg).unwrap(); assert!(mgr.cfg.ca_path.is_empty()); assert!(mgr.cfg.cert_path.is_empty()); @@ -293,7 +313,7 @@ mod tests { let assert_cfg = |c: fn(&mut SecurityConfig), valid: bool| { let mut invalid_cfg = cfg.clone(); c(&mut invalid_cfg); - assert_eq!(invalid_cfg.validate().is_ok(), valid); + assert_eq!(invalid_cfg.validate(false).is_ok(), valid); }; // invalid path should be rejected. @@ -314,18 +334,18 @@ mod tests { .iter() .enumerate() { - fs::write(f, &[id as u8]).unwrap(); + fs::write(f, [id as u8]).unwrap(); } let mut c = cfg.clone(); c.cert_path = format!("{}", example_cert.display()); c.key_path = format!("{}", example_key.display()); // incomplete configuration. - c.validate().unwrap_err(); + c.validate(false).unwrap_err(); // data should be loaded from file after validating. c.ca_path = format!("{}", example_ca.display()); - c.validate().unwrap(); + c.validate(false).unwrap(); let (ca, cert, key) = c.load_certs().unwrap_or_default(); assert_eq!(ca, vec![0]); diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index f5a35c9bb2c..554dbaa63f9 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -33,55 +33,57 @@ nortcheck = ["engine_rocks/nortcheck"] backup-stream-debug = ["backup-stream/backup-stream-debug"] [dependencies] -api_version = { path = "../api_version" } -backup = { path = "../backup", default-features = false } -backup-stream = { path = "../backup-stream", default-features = false } -causal_ts = { path = "../causal_ts" } -cdc = { path = "../cdc", default-features = false } +api_version = { workspace = true } +backup = { workspace = true } +backup-stream = { workspace = true } +causal_ts = { workspace = true } +cdc = { workspace = true } chrono = "0.4" clap = "2.32" -collections = { path = "../collections" } -concurrency_manager = { path = "../concurrency_manager", default-features = false } +collections = { 
workspace = true } +concurrency_manager = { workspace = true } crossbeam = "0.8" -encryption = { path = "../encryption", default-features = false } -encryption_export = { path = "../encryption/export", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_rocks_helper = { path = "../engine_rocks_helper" } -engine_traits = { path = "../engine_traits", default-features = false } -error_code = { path = "../error_code", default-features = false } -file_system = { path = "../file_system", default-features = false } +encryption = { workspace = true } +encryption_export = { workspace = true } +engine_rocks = { workspace = true } +engine_rocks_helper = { workspace = true } +engine_traits = { workspace = true } +error_code = { workspace = true } +file_system = { workspace = true } fs2 = "0.4" futures = "0.3" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored"] } +grpcio = { workspace = true } grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } hex = "0.4" -keys = { path = "../keys", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +keys = { workspace = true } +kvproto = { workspace = true } libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } -log_wrappers = { path = "../log_wrappers" } -nix = "0.23" -pd_client = { path = "../pd_client", default-features = false } +log_wrappers = { workspace = true } +pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raft_log_engine = { path = "../raft_log_engine", default-features = false } -raftstore = { path = "../raftstore", default-features = false } +raft_log_engine = { workspace = true } +raftstore = { workspace = true, features = 
["engine_rocks"] } +raftstore-v2 = { workspace = true } rand = "0.8" -resolved_ts = { path = "../../components/resolved_ts", default-features = false } -resource_metering = { path = "../resource_metering" } -security = { path = "../security", default-features = false } +resolved_ts = { workspace = true } +resource_control = { workspace = true } +resource_metering = { workspace = true } +security = { workspace = true } serde_json = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } +snap_recovery = { workspace = true } tempfile = "3.0" -tikv = { path = "../..", default-features = false } -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv = { workspace = true } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } toml = "0.5" -txn_types = { path = "../txn_types", default-features = false } -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +txn_types = { workspace = true } +yatp = { workspace = true } [target.'cfg(unix)'.dependencies] -signal = "0.6" +signal-hook = "0.3" diff --git a/components/server/src/common.rs b/components/server/src/common.rs new file mode 100644 index 00000000000..10da6ec9c74 --- /dev/null +++ b/components/server/src/common.rs @@ -0,0 +1,791 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. +//! This mod is exported to make convenience for creating TiKV-like servers. 
+ +use std::{ + cmp, + collections::HashMap, + env, fmt, + net::SocketAddr, + path::{Path, PathBuf}, + sync::{ + atomic::{AtomicU32, Ordering}, + mpsc, Arc, + }, + time::Duration, + u64, +}; + +use encryption_export::{data_key_manager_from_config, DataKeyManager}; +use engine_rocks::{ + flush_engine_statistics, + raw::{Cache, Env}, + FlowInfo, RocksEngine, RocksStatistics, +}; +use engine_traits::{ + data_cf_offset, CachedTablet, CfOptionsExt, FlowControlFactorsExt, KvEngine, RaftEngine, + StatisticsReporter, TabletRegistry, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS, +}; +use error_code::ErrorCodeExt; +use file_system::{get_io_rate_limiter, set_io_rate_limiter, BytesFetcher, File, IoBudgetAdjustor}; +use grpcio::Environment; +use pd_client::{PdClient, RpcClient}; +use raft_log_engine::RaftLogEngine; +use security::SecurityManager; +use tikv::{ + config::{ConfigController, DbConfigManger, DbType, TikvConfig}, + server::{status_server::StatusServer, DEFAULT_CLUSTER_ID}, +}; +use tikv_util::{ + config::{ensure_dir_exist, RaftDataStateMachine}, + math::MovingAvgU32, + metrics::INSTANCE_BACKEND_CPU_QUOTA, + quota_limiter::QuotaLimiter, + sys::{cpu_time::ProcessStat, disk, path_in_diff_mount_point, SysQuota}, + time::Instant, + worker::{LazyWorker, Worker}, +}; + +use crate::{raft_engine_switch::*, setup::validate_and_persist_config}; + +// minimum number of core kept for background requests +const BACKGROUND_REQUEST_CORE_LOWER_BOUND: f64 = 1.0; +// max ratio of core quota for background requests +const BACKGROUND_REQUEST_CORE_MAX_RATIO: f64 = 0.95; +// default ratio of core quota for background requests = core_number * 0.5 +const BACKGROUND_REQUEST_CORE_DEFAULT_RATIO: f64 = 0.5; +// indication of TiKV instance is short of cpu +const SYSTEM_BUSY_THRESHOLD: f64 = 0.80; +// indication of TiKV instance in healthy state when cpu usage is in [0.5, 0.80) +const SYSTEM_HEALTHY_THRESHOLD: f64 = 0.50; +// pace of cpu quota adjustment +const CPU_QUOTA_ADJUSTMENT_PACE: f64 = 200.0; 
// 0.2 vcpu +const DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL: Duration = Duration::from_secs(5); + +/// This is the common part of TiKV-like servers. It is a collection of all +/// capabilities a TikvServer should have or may take advantage of. By holding +/// it in its own TikvServer implementation, one can easily access the common +/// ability of a TiKV server. +// Fields in this struct are all public since they are open for other TikvServer +// to use, e.g. a custom TikvServer may alter some fields in `config` or push +// some services into `to_stop`. +pub struct TikvServerCore { + pub config: TikvConfig, + pub store_path: PathBuf, + pub lock_files: Vec, + pub encryption_key_manager: Option>, + pub flow_info_sender: Option>, + pub flow_info_receiver: Option>, + pub to_stop: Vec>, + pub background_worker: Worker, +} + +impl TikvServerCore { + /// Initialize and check the config + /// + /// Warnings are logged and fatal errors exit. + /// + /// # Fatal errors + /// + /// - If `dynamic config` feature is enabled and failed to register config + /// to PD + /// - If some critical configs (like data dir) are different from last run + /// - If the config can't pass `validate()` + /// - If the max open file descriptor limit is not high enough to support + /// the main database and the raft database. 
+ pub fn init_config(mut config: TikvConfig) -> ConfigController { + validate_and_persist_config(&mut config, true); + + ensure_dir_exist(&config.storage.data_dir).unwrap(); + if !config.rocksdb.wal_dir.is_empty() { + ensure_dir_exist(&config.rocksdb.wal_dir).unwrap(); + } + if config.raft_engine.enable { + ensure_dir_exist(&config.raft_engine.config().dir).unwrap(); + } else { + ensure_dir_exist(&config.raft_store.raftdb_path).unwrap(); + if !config.raftdb.wal_dir.is_empty() { + ensure_dir_exist(&config.raftdb.wal_dir).unwrap(); + } + } + + check_system_config(&config); + + tikv_util::set_panic_hook(config.abort_on_panic, &config.storage.data_dir); + + info!( + "using config"; + "config" => serde_json::to_string(&config).unwrap(), + ); + if config.panic_when_unexpected_key_or_data { + info!("panic-when-unexpected-key-or-data is on"); + tikv_util::set_panic_when_unexpected_key_or_data(true); + } + + config.write_into_metrics(); + + ConfigController::new(config) + } + + pub fn check_conflict_addr(&mut self) { + let cur_addr: SocketAddr = self + .config + .server + .addr + .parse() + .expect("failed to parse into a socket address"); + let cur_ip = cur_addr.ip(); + let cur_port = cur_addr.port(); + let lock_dir = get_lock_dir(); + + let search_base = env::temp_dir().join(lock_dir); + file_system::create_dir_all(&search_base) + .unwrap_or_else(|_| panic!("create {} failed", search_base.display())); + + for entry in file_system::read_dir(&search_base).unwrap().flatten() { + if !entry.file_type().unwrap().is_file() { + continue; + } + let file_path = entry.path(); + let file_name = file_path.file_name().unwrap().to_str().unwrap(); + if let Ok(addr) = file_name.replace('_', ":").parse::() { + let ip = addr.ip(); + let port = addr.port(); + if cur_port == port + && (cur_ip == ip || cur_ip.is_unspecified() || ip.is_unspecified()) + { + let _ = try_lock_conflict_addr(file_path); + } + } + } + + let cur_path = search_base.join(cur_addr.to_string().replace(':', "_")); + let 
cur_file = try_lock_conflict_addr(cur_path); + self.lock_files.push(cur_file); + } + + pub fn init_fs(&mut self) { + let lock_path = self.store_path.join(Path::new("LOCK")); + + let f = File::create(lock_path.as_path()) + .unwrap_or_else(|e| fatal!("failed to create lock at {}: {}", lock_path.display(), e)); + if f.try_lock_exclusive().is_err() { + fatal!( + "lock {} failed, maybe another instance is using this directory.", + self.store_path.display() + ); + } + self.lock_files.push(f); + + if tikv_util::panic_mark_file_exists(&self.config.storage.data_dir) { + fatal!( + "panic_mark_file {} exists, there must be something wrong with the db. \ + Do not remove the panic_mark_file and force the TiKV node to restart. \ + Please contact TiKV maintainers to investigate the issue. \ + If needed, use scale in and scale out to replace the TiKV node. \ + https://docs.pingcap.com/tidb/stable/scale-tidb-using-tiup", + tikv_util::panic_mark_file_path(&self.config.storage.data_dir).display() + ); + } + + // Allocate a big file to make sure that TiKV have enough space to + // recover from disk full errors. This file is created in data_dir rather than + // db_path, because we must not increase store size of db_path. + fn calculate_reserved_space(capacity: u64, reserved_size_from_config: u64) -> u64 { + let mut reserved_size = reserved_size_from_config; + if reserved_size_from_config != 0 { + reserved_size = + cmp::max((capacity as f64 * 0.05) as u64, reserved_size_from_config); + } + reserved_size + } + fn reserve_physical_space(data_dir: &String, available: u64, reserved_size: u64) { + let path = Path::new(data_dir).join(file_system::SPACE_PLACEHOLDER_FILE); + if let Err(e) = file_system::remove_file(path) { + warn!("failed to remove space holder on starting: {}", e); + } + + // place holder file size is 20% of total reserved space. 
+ if available > reserved_size { + file_system::reserve_space_for_recover(data_dir, reserved_size / 5) + .map_err(|e| panic!("Failed to reserve space for recovery: {}.", e)) + .unwrap(); + } else { + warn!("no enough disk space left to create the place holder file"); + } + } + + let disk_stats = fs2::statvfs(&self.config.storage.data_dir).unwrap(); + let mut capacity = disk_stats.total_space(); + if self.config.raft_store.capacity.0 > 0 { + capacity = cmp::min(capacity, self.config.raft_store.capacity.0); + } + // reserve space for kv engine + let kv_reserved_size = + calculate_reserved_space(capacity, self.config.storage.reserve_space.0); + disk::set_disk_reserved_space(kv_reserved_size); + reserve_physical_space( + &self.config.storage.data_dir, + disk_stats.available_space(), + kv_reserved_size, + ); + + let raft_data_dir = if self.config.raft_engine.enable { + self.config.raft_engine.config().dir + } else { + self.config.raft_store.raftdb_path.clone() + }; + + let separated_raft_mount_path = + path_in_diff_mount_point(&self.config.storage.data_dir, &raft_data_dir); + if separated_raft_mount_path { + let raft_disk_stats = fs2::statvfs(&raft_data_dir).unwrap(); + // reserve space for raft engine if raft engine is deployed separately + let raft_reserved_size = calculate_reserved_space( + raft_disk_stats.total_space(), + self.config.storage.reserve_raft_space.0, + ); + disk::set_raft_disk_reserved_space(raft_reserved_size); + reserve_physical_space( + &raft_data_dir, + raft_disk_stats.available_space(), + raft_reserved_size, + ); + } + } + + pub fn init_yatp(&self) { + yatp::metrics::set_namespace(Some("tikv")); + prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL0_CHANCE.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL_ELAPSED.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::TASK_EXEC_DURATION.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::TASK_POLL_DURATION.clone())).unwrap(); + 
prometheus::register(Box::new(yatp::metrics::TASK_EXEC_TIMES.clone())).unwrap(); + } + + pub fn init_encryption(&mut self) { + self.encryption_key_manager = data_key_manager_from_config( + &self.config.security.encryption, + &self.config.storage.data_dir, + ) + .map_err(|e| { + panic!( + "Encryption failed to initialize: {}. code: {}", + e, + e.error_code() + ) + }) + .unwrap() + .map(Arc::new); + } + + pub fn init_io_utility(&mut self) -> BytesFetcher { + let stats_collector_enabled = file_system::init_io_stats_collector() + .map_err(|e| warn!("failed to init I/O stats collector: {}", e)) + .is_ok(); + + let limiter = Arc::new( + self.config + .storage + .io_rate_limit + .build(!stats_collector_enabled /* enable_statistics */), + ); + let fetcher = if stats_collector_enabled { + BytesFetcher::FromIoStatsCollector() + } else { + BytesFetcher::FromRateLimiter(limiter.statistics().unwrap()) + }; + // Set up IO limiter even when rate limit is disabled, so that rate limits can + // be dynamically applied later on. + set_io_rate_limiter(Some(limiter)); + fetcher + } + + pub fn init_flow_receiver(&mut self) -> engine_rocks::FlowListener { + let (tx, rx) = mpsc::channel(); + self.flow_info_sender = Some(tx.clone()); + self.flow_info_receiver = Some(rx); + engine_rocks::FlowListener::new(tx) + } + + pub fn connect_to_pd_cluster( + config: &mut TikvConfig, + env: Arc, + security_mgr: Arc, + ) -> Arc { + let pd_client = Arc::new( + RpcClient::new(&config.pd, Some(env), security_mgr) + .unwrap_or_else(|e| fatal!("failed to create rpc client: {}", e)), + ); + + let cluster_id = pd_client + .get_cluster_id() + .unwrap_or_else(|e| fatal!("failed to get cluster id: {}", e)); + if cluster_id == DEFAULT_CLUSTER_ID { + fatal!("cluster id can't be {}", DEFAULT_CLUSTER_ID); + } + config.server.cluster_id = cluster_id; + info!( + "connect to PD cluster"; + "cluster_id" => cluster_id + ); + + pd_client + } + + // Only background cpu quota tuning is implemented at present. 
iops and frontend + // quota tuning is on the way + pub fn init_quota_tuning_task(&self, quota_limiter: Arc) { + // No need to do auto tune when capacity is really low + if SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_MAX_RATIO + < BACKGROUND_REQUEST_CORE_LOWER_BOUND + { + return; + }; + + // Determine the base cpu quota + let base_cpu_quota = + // if cpu quota is not specified, start from optimistic case + if quota_limiter.cputime_limiter(false).is_infinite() { + 1000_f64 + * f64::max( + BACKGROUND_REQUEST_CORE_LOWER_BOUND, + SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_DEFAULT_RATIO, + ) + } else { + quota_limiter.cputime_limiter(false) / 1000_f64 + }; + + // Calculate the celling and floor quota + let celling_quota = f64::min( + base_cpu_quota * 2.0, + 1_000_f64 * SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_MAX_RATIO, + ); + let floor_quota = f64::max( + base_cpu_quota * 0.5, + 1_000_f64 * BACKGROUND_REQUEST_CORE_LOWER_BOUND, + ); + + let mut proc_stats: ProcessStat = ProcessStat::cur_proc_stat().unwrap(); + self.background_worker.spawn_interval_task( + DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL, + move || { + if quota_limiter.auto_tune_enabled() { + let cputime_limit = quota_limiter.cputime_limiter(false); + let old_quota = if cputime_limit.is_infinite() { + base_cpu_quota + } else { + cputime_limit / 1000_f64 + }; + let cpu_usage = match proc_stats.cpu_usage() { + Ok(r) => r, + Err(_e) => 0.0, + }; + // Try tuning quota when cpu_usage is correctly collected. + // rule based tuning: + // - if instance is busy, shrink cpu quota for analyze by one quota pace until + // lower bound is hit; + // - if instance cpu usage is healthy, no op; + // - if instance is idle, increase cpu quota by one quota pace until upper + // bound is hit. 
+ if cpu_usage > 0.0f64 { + let mut target_quota = old_quota; + + let cpu_util = cpu_usage / SysQuota::cpu_cores_quota(); + if cpu_util >= SYSTEM_BUSY_THRESHOLD { + target_quota = + f64::max(target_quota - CPU_QUOTA_ADJUSTMENT_PACE, floor_quota); + } else if cpu_util < SYSTEM_HEALTHY_THRESHOLD { + target_quota = + f64::min(target_quota + CPU_QUOTA_ADJUSTMENT_PACE, celling_quota); + } + + if old_quota != target_quota { + quota_limiter.set_cpu_time_limit(target_quota as usize, false); + debug!( + "cpu_time_limiter tuned for backend request"; + "cpu_util" => ?cpu_util, + "new quota" => ?target_quota); + INSTANCE_BACKEND_CPU_QUOTA.set(target_quota as i64); + } + } + } + }, + ); + } +} + +#[cfg(unix)] +fn get_lock_dir() -> String { + format!("{}_TIKV_LOCK_FILES", unsafe { libc::getuid() }) +} + +#[cfg(not(unix))] +fn get_lock_dir() -> String { + "TIKV_LOCK_FILES".to_owned() +} + +fn try_lock_conflict_addr>(path: P) -> File { + let f = File::create(path.as_ref()).unwrap_or_else(|e| { + fatal!( + "failed to create lock at {}: {}", + path.as_ref().display(), + e + ) + }); + + if f.try_lock_exclusive().is_err() { + fatal!( + "{} already in use, maybe another instance is binding with this address.", + path.as_ref().file_name().unwrap().to_str().unwrap() + ); + } + f +} + +const RESERVED_OPEN_FDS: u64 = 1000; +pub fn check_system_config(config: &TikvConfig) { + info!("beginning system configuration check"); + let mut rocksdb_max_open_files = config.rocksdb.max_open_files; + if config.rocksdb.titan.enabled { + // Titan engine maintains yet another pool of blob files and uses the same max + // number of open files setup as rocksdb does. 
So we double the max required + // open files here + rocksdb_max_open_files *= 2; + } + if let Err(e) = tikv_util::config::check_max_open_fds( + RESERVED_OPEN_FDS + (rocksdb_max_open_files + config.raftdb.max_open_files) as u64, + ) { + fatal!("{}", e); + } + + // Check RocksDB data dir + if let Err(e) = tikv_util::config::check_data_dir(&config.storage.data_dir) { + warn!( + "check: rocksdb-data-dir"; + "path" => &config.storage.data_dir, + "err" => %e + ); + } + // Check raft data dir + if let Err(e) = tikv_util::config::check_data_dir(&config.raft_store.raftdb_path) { + warn!( + "check: raftdb-path"; + "path" => &config.raft_store.raftdb_path, + "err" => %e + ); + } +} + +pub struct EnginesResourceInfo { + tablet_registry: TabletRegistry, + raft_engine: Option, + latest_normalized_pending_bytes: AtomicU32, + normalized_pending_bytes_collector: MovingAvgU32, +} + +impl EnginesResourceInfo { + const SCALE_FACTOR: u64 = 100; + + pub fn new( + tablet_registry: TabletRegistry, + raft_engine: Option, + max_samples_to_preserve: usize, + ) -> Self { + EnginesResourceInfo { + tablet_registry, + raft_engine, + latest_normalized_pending_bytes: AtomicU32::new(0), + normalized_pending_bytes_collector: MovingAvgU32::new(max_samples_to_preserve), + } + } + + pub fn update( + &self, + _now: Instant, + cached_latest_tablets: &mut HashMap>, + ) { + let mut compaction_pending_bytes = [0; DATA_CFS.len()]; + let mut soft_pending_compaction_bytes_limit = [0; DATA_CFS.len()]; + + let mut fetch_engine_cf = |engine: &RocksEngine, cf: &str| { + if let Ok(cf_opts) = engine.get_options_cf(cf) { + if let Ok(Some(b)) = engine.get_cf_pending_compaction_bytes(cf) { + let offset = data_cf_offset(cf); + compaction_pending_bytes[offset] += b; + soft_pending_compaction_bytes_limit[offset] = cmp::max( + cf_opts.get_soft_pending_compaction_bytes_limit(), + soft_pending_compaction_bytes_limit[offset], + ); + } + } + }; + + if let Some(raft_engine) = &self.raft_engine { + fetch_engine_cf(raft_engine, 
CF_DEFAULT); + } + + self.tablet_registry + .for_each_opened_tablet(|id, db: &mut CachedTablet| { + cached_latest_tablets.insert(id, db.clone()); + true + }); + + // todo(SpadeA): Now, there's a potential race condition problem where the + // tablet could be destroyed after the clone and before the fetching + // which could result in a program panic. It's okay now as the single global + // kv_engine will not be destroyed in normal operation and v2 is not + // ready for operation. Furthermore, this race condition is general to v2 as + // tablet clone does not happen exclusively here. We should + // propose another PR to tackle it such as destroying tablets lazily in a GC + // thread. + + for (_, cache) in cached_latest_tablets.iter_mut() { + let Some(tablet) = cache.latest() else { continue }; + for cf in &[CF_DEFAULT, CF_WRITE, CF_LOCK] { + fetch_engine_cf(tablet, cf); + } + } + + // Clear ensures that these tablets are not held forever. + cached_latest_tablets.clear(); + + let mut normalized_pending_bytes = 0; + for (pending, limit) in compaction_pending_bytes + .iter() + .zip(soft_pending_compaction_bytes_limit) + { + if limit > 0 { + normalized_pending_bytes = cmp::max( + normalized_pending_bytes, + (*pending * EnginesResourceInfo::SCALE_FACTOR / limit) as u32, + ) + } + } + + let (_, avg) = self + .normalized_pending_bytes_collector + .add(normalized_pending_bytes); + self.latest_normalized_pending_bytes.store( + std::cmp::max(normalized_pending_bytes, avg), + Ordering::Relaxed, + ); + } + + #[cfg(any(test, feature = "testexport"))] + pub fn latest_normalized_pending_bytes(&self) -> u32 { + self.latest_normalized_pending_bytes.load(Ordering::Relaxed) + } +} + +impl IoBudgetAdjustor for EnginesResourceInfo { + fn adjust(&self, total_budgets: usize) -> usize { + let score = self.latest_normalized_pending_bytes.load(Ordering::Relaxed) as f32 + / Self::SCALE_FACTOR as f32; + // Two reasons for adding `sqrt` on top: + // 1) In theory the convergence point is 
independent of the value of pending + // bytes (as long as backlog generating rate equals consuming rate, which is + // determined by compaction budgets), a convex helps reach that point while + // maintaining low level of pending bytes. + // 2) Variance of compaction pending bytes grows with its magnitude, a filter + // with decreasing derivative can help balance such trend. + let score = score.sqrt(); + // The target global write flow slides between Bandwidth / 2 and Bandwidth. + let score = 0.5 + score / 2.0; + (total_budgets as f32 * score) as usize + } +} + +/// A small trait for components which can be trivially stopped. Lets us keep +/// a list of these in `TiKV`, rather than storing each component individually. +pub trait Stop { + fn stop(self: Box); +} + +impl Stop for StatusServer +where + R: 'static + Send, +{ + fn stop(self: Box) { + (*self).stop() + } +} + +impl Stop for Worker { + fn stop(self: Box) { + Worker::stop(&self); + } +} + +impl Stop for LazyWorker { + fn stop(self: Box) { + self.stop_worker(); + } +} + +pub trait ConfiguredRaftEngine: RaftEngine { + fn build( + _: &TikvConfig, + _: &Arc, + _: &Option>, + _: &Cache, + ) -> (Self, Option>); + fn as_rocks_engine(&self) -> Option<&RocksEngine>; + fn register_config(&self, _cfg_controller: &mut ConfigController); +} + +impl ConfiguredRaftEngine for T { + default fn build( + _: &TikvConfig, + _: &Arc, + _: &Option>, + _: &Cache, + ) -> (Self, Option>) { + unimplemented!() + } + default fn as_rocks_engine(&self) -> Option<&RocksEngine> { + None + } + default fn register_config(&self, _cfg_controller: &mut ConfigController) {} +} + +impl ConfiguredRaftEngine for RocksEngine { + fn build( + config: &TikvConfig, + env: &Arc, + key_manager: &Option>, + block_cache: &Cache, + ) -> (Self, Option>) { + let mut raft_data_state_machine = RaftDataStateMachine::new( + &config.storage.data_dir, + &config.raft_engine.config().dir, + &config.raft_store.raftdb_path, + ); + let should_dump = 
raft_data_state_machine.before_open_target(); + + let raft_db_path = &config.raft_store.raftdb_path; + let config_raftdb = &config.raftdb; + let statistics = Arc::new(RocksStatistics::new_titan()); + let raft_db_opts = config_raftdb.build_opt(env.clone(), Some(&statistics)); + let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); + let raftdb = engine_rocks::util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) + .expect("failed to open raftdb"); + + if should_dump { + let raft_engine = + RaftLogEngine::new(config.raft_engine.config(), key_manager.clone(), None) + .expect("failed to open raft engine for migration"); + dump_raft_engine_to_raftdb(&raft_engine, &raftdb, 8 /* threads */); + raft_engine.stop(); + drop(raft_engine); + raft_data_state_machine.after_dump_data(); + } + (raftdb, Some(statistics)) + } + + fn as_rocks_engine(&self) -> Option<&RocksEngine> { + Some(self) + } + + fn register_config(&self, cfg_controller: &mut ConfigController) { + cfg_controller.register( + tikv::config::Module::Raftdb, + Box::new(DbConfigManger::new(self.clone(), DbType::Raft)), + ); + } +} + +impl ConfiguredRaftEngine for RaftLogEngine { + fn build( + config: &TikvConfig, + env: &Arc, + key_manager: &Option>, + block_cache: &Cache, + ) -> (Self, Option>) { + let mut raft_data_state_machine = RaftDataStateMachine::new( + &config.storage.data_dir, + &config.raft_store.raftdb_path, + &config.raft_engine.config().dir, + ); + let should_dump = raft_data_state_machine.before_open_target(); + + let raft_config = config.raft_engine.config(); + let raft_engine = + RaftLogEngine::new(raft_config, key_manager.clone(), get_io_rate_limiter()) + .expect("failed to open raft engine"); + + if should_dump { + let config_raftdb = &config.raftdb; + let raft_db_opts = config_raftdb.build_opt(env.clone(), None); + let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); + let raftdb = engine_rocks::util::new_engine_opt( + &config.raft_store.raftdb_path, + raft_db_opts, + 
raft_cf_opts, + ) + .expect("failed to open raftdb for migration"); + dump_raftdb_to_raft_engine(&raftdb, &raft_engine, 8 /* threads */); + raftdb.stop(); + drop(raftdb); + raft_data_state_machine.after_dump_data(); + } + (raft_engine, None) + } +} + +const DEFAULT_ENGINE_METRICS_RESET_INTERVAL: Duration = Duration::from_millis(60_000); +pub struct EngineMetricsManager { + tablet_registry: TabletRegistry, + kv_statistics: Option>, + kv_is_titan: bool, + raft_engine: ER, + raft_statistics: Option>, + last_reset: Instant, +} + +impl EngineMetricsManager { + pub fn new( + tablet_registry: TabletRegistry, + kv_statistics: Option>, + kv_is_titan: bool, + raft_engine: ER, + raft_statistics: Option>, + ) -> Self { + EngineMetricsManager { + tablet_registry, + kv_statistics, + kv_is_titan, + raft_engine, + raft_statistics, + last_reset: Instant::now(), + } + } + + pub fn flush(&mut self, now: Instant) { + let mut reporter = EK::StatisticsReporter::new("kv"); + self.tablet_registry + .for_each_opened_tablet(|_, db: &mut CachedTablet| { + if let Some(db) = db.latest() { + reporter.collect(db); + } + true + }); + reporter.flush(); + self.raft_engine.flush_metrics("raft"); + + if let Some(s) = self.kv_statistics.as_ref() { + flush_engine_statistics(s, "kv", self.kv_is_titan); + } + if let Some(s) = self.raft_statistics.as_ref() { + flush_engine_statistics(s, "raft", false); + } + if now.saturating_duration_since(self.last_reset) >= DEFAULT_ENGINE_METRICS_RESET_INTERVAL { + if let Some(s) = self.kv_statistics.as_ref() { + s.reset(); + } + if let Some(s) = self.raft_statistics.as_ref() { + s.reset(); + } + self.last_reset = now; + } + } +} diff --git a/components/server/src/lib.rs b/components/server/src/lib.rs index 8a46f601a75..144cc1885d5 100644 --- a/components/server/src/lib.rs +++ b/components/server/src/lib.rs @@ -1,11 +1,17 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
+#![allow(incomplete_features)] +#![feature(specialization)] +#![feature(let_chains)] + #[macro_use] extern crate tikv_util; #[macro_use] pub mod setup; +pub mod common; pub mod memory; pub mod raft_engine_switch; pub mod server; +pub mod server2; pub mod signal_handler; diff --git a/components/server/src/raft_engine_switch.rs b/components/server/src/raft_engine_switch.rs index 586a3999b82..bf46f07eabd 100644 --- a/components/server/src/raft_engine_switch.rs +++ b/components/server/src/raft_engine_switch.rs @@ -7,7 +7,7 @@ use std::sync::{ use crossbeam::channel::{unbounded, Receiver}; use engine_rocks::{self, RocksEngine}; -use engine_traits::{Iterable, Iterator, RaftEngine, RaftEngineReadOnly, RaftLogBatch, SeekKey}; +use engine_traits::{Iterable, Iterator, RaftEngine, RaftEngineReadOnly, RaftLogBatch, CF_DEFAULT}; use kvproto::raft_serverpb::RaftLocalState; use protobuf::Message; use raft::eraftpb::Entry; @@ -36,8 +36,8 @@ pub fn dump_raftdb_to_raft_engine(source: &RocksEngine, target: &RaftLogEngine, info!("Start to scan raft log from RocksEngine and dump into RaftLogEngine"); let consumed_time = tikv_util::time::Instant::now(); // Seek all region id from raftdb and send them to workers. 
- let mut it = source.iterator().unwrap(); - let mut valid = it.seek(SeekKey::Key(keys::REGION_RAFT_MIN_KEY)).unwrap(); + let mut it = source.iterator(CF_DEFAULT).unwrap(); + let mut valid = it.seek(keys::REGION_RAFT_MIN_KEY).unwrap(); while valid { match keys::decode_raft_key(it.key()) { Err(e) => { @@ -47,7 +47,7 @@ pub fn dump_raftdb_to_raft_engine(source: &RocksEngine, target: &RaftLogEngine, tx.send(id).unwrap(); count_region += 1; let next_key = keys::raft_log_prefix(id + 1); - valid = it.seek(SeekKey::Key(&next_key)).unwrap(); + valid = it.seek(&next_key).unwrap(); } } } @@ -115,7 +115,7 @@ fn check_raft_engine_is_empty(engine: &RaftLogEngine) { fn check_raft_db_is_empty(engine: &RocksEngine) { let mut count = 0; engine - .scan(b"", &[0xFF, 0xFF], false, |_, _| { + .scan(CF_DEFAULT, b"", &[0xFF, 0xFF], false, |_, _| { count += 1; Ok(false) }) @@ -138,6 +138,7 @@ fn run_dump_raftdb_worker( let mut entries = vec![]; old_engine .scan( + CF_DEFAULT, &keys::raft_log_prefix(id), &keys::raft_log_prefix(id + 1), false, @@ -157,9 +158,10 @@ fn run_dump_raftdb_worker( let mut state = RaftLocalState::default(); state.merge_from_bytes(value)?; batch.put_raft_state(region_id, &state).unwrap(); - // Assume that we always scan entry first and raft state at the end. + // Assume that we always scan entry first and raft state at the + // end. 
batch - .append(region_id, std::mem::take(&mut entries)) + .append(region_id, None, std::mem::take(&mut entries)) .unwrap(); } _ => unreachable!("There is only 2 types of keys in raft"), @@ -168,7 +170,7 @@ fn run_dump_raftdb_worker( if local_size >= BATCH_THRESHOLD { local_size = 0; batch - .append(region_id, std::mem::take(&mut entries)) + .append(region_id, None, std::mem::take(&mut entries)) .unwrap(); let size = new_engine.consume(&mut batch, false).unwrap(); @@ -191,11 +193,11 @@ fn run_dump_raft_engine_worker( new_engine: &RocksEngine, count_size: &Arc, ) { + let mut batch = new_engine.log_batch(0); while let Ok(id) = rx.recv() { let state = old_engine.get_raft_state(id).unwrap().unwrap(); - new_engine.put_raft_state(id, &state).unwrap(); + batch.put_raft_state(id, &state).unwrap(); if let Some(last_index) = old_engine.last_index(id) { - let mut batch = new_engine.log_batch(0); let mut begin = old_engine.first_index(id).unwrap(); while begin <= last_index { let end = std::cmp::min(begin + 1024, last_index + 1); @@ -203,18 +205,20 @@ fn run_dump_raft_engine_worker( begin += old_engine .fetch_entries_to(id, begin, end, Some(BATCH_THRESHOLD), &mut entries) .unwrap() as u64; - batch.append(id, entries).unwrap(); + batch.append(id, None, entries).unwrap(); let size = new_engine.consume(&mut batch, false).unwrap(); count_size.fetch_add(size, Ordering::Relaxed); } } + if !batch.is_empty() { + new_engine.consume(&mut batch, false).unwrap(); + } } } #[cfg(test)] mod tests { - use engine_rocks::raw::DBOptions; - use tikv::config::TiKvConfig; + use tikv::config::TikvConfig; use super::*; @@ -229,28 +233,31 @@ mod tests { raftdb_wal_path.push("test-wal"); } - let mut cfg = TiKvConfig::default(); + let mut cfg = TikvConfig::default(); cfg.raft_store.raftdb_path = raftdb_path.to_str().unwrap().to_owned(); cfg.raftdb.wal_dir = raftdb_wal_path.to_str().unwrap().to_owned(); cfg.raft_engine.mut_config().dir = raft_engine_path.to_str().unwrap().to_owned(); + let cache = cfg + 
.storage + .block_cache + .build_shared_cache(cfg.storage.engine); // Dump logs from RocksEngine to RaftLogEngine. let raft_engine = RaftLogEngine::new( cfg.raft_engine.config(), - None, /*key_manager*/ - None, /*io_rate_limiter*/ + None, // key_manager + None, // io_rate_limiter ) .expect("open raft engine"); { // Prepare some data for the RocksEngine. - let raftdb = engine_rocks::raw_util::new_engine_opt( + let raftdb = engine_rocks::util::new_engine_opt( &cfg.raft_store.raftdb_path, - cfg.raftdb.build_opt(), - cfg.raftdb.build_cf_opts(&None), + cfg.raftdb.build_opt(Default::default(), None), + cfg.raftdb.build_cf_opts(&cache), ) .unwrap(); - let raftdb = RocksEngine::from_db(Arc::new(raftdb)); let mut batch = raftdb.log_batch(0); set_write_batch(1, &mut batch); raftdb.consume(&mut batch, false).unwrap(); @@ -270,15 +277,8 @@ mod tests { std::fs::remove_dir_all(&cfg.raft_store.raftdb_path).unwrap(); // Dump logs from RaftLogEngine to RocksEngine. - let raftdb = { - let db = engine_rocks::raw_util::new_engine_opt( - &cfg.raft_store.raftdb_path, - DBOptions::new(), - vec![], - ) - .unwrap(); - RocksEngine::from_db(Arc::new(db)) - }; + let raftdb = + engine_rocks::util::new_engine(&cfg.raft_store.raftdb_path, &[CF_DEFAULT]).unwrap(); dump_raft_engine_to_raftdb(&raft_engine, &raftdb, 4); assert(1, &raftdb); assert(5, &raftdb); @@ -306,7 +306,7 @@ mod tests { e.set_index(i); entries.push(e); } - batch.append(num, entries).unwrap(); + batch.append(num, None, entries).unwrap(); } // Get data from raft engine and assert. diff --git a/components/server/src/server.rs b/components/server/src/server.rs index b9f3c7bd6f2..625e8d8a31b 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -2,69 +2,61 @@ //! This module startups all the components of a TiKV server. //! -//! It is responsible for reading from configs, starting up the various server components, -//! and handling errors (mostly by aborting and reporting to the user). +//! 
It is responsible for reading from configs, starting up the various server +//! components, and handling errors (mostly by aborting and reporting to the +//! user). //! //! The entry point is `run_tikv`. //! -//! Components are often used to initialize other components, and/or must be explicitly stopped. -//! We keep these components in the `TiKvServer` struct. +//! Components are often used to initialize other components, and/or must be +//! explicitly stopped. We keep these components in the `TikvServer` struct. use std::{ cmp, + collections::HashMap, convert::TryFrom, - env, fmt, - net::SocketAddr, path::{Path, PathBuf}, str::FromStr, - sync::{ - atomic::{AtomicU32, AtomicU64, Ordering}, - mpsc, Arc, Mutex, - }, + sync::{atomic::AtomicU64, mpsc, Arc, Mutex}, time::Duration, u64, }; use api_version::{dispatch_api_version, KvFormat}; use backup_stream::{ - config::BackupStreamConfigManager, - metadata::{ConnectionConfig, LazyEtcdClient}, - observer::BackupStreamObserver, + config::BackupStreamConfigManager, metadata::store::PdStore, observer::BackupStreamObserver, + BackupStreamResolver, }; +use causal_ts::CausalTsProviderImpl; use cdc::{CdcConfigManager, MemoryQuota}; use concurrency_manager::ConcurrencyManager; -use encryption_export::{data_key_manager_from_config, DataKeyManager}; -use engine_rocks::{ - from_rocks_compression_type, - raw::{Cache, Env}, - FlowInfo, RocksEngine, -}; +use engine_rocks::{from_rocks_compression_type, RocksEngine, RocksStatistics}; use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; use engine_traits::{ - CFOptionsExt, ColumnFamilyOptions, Engines, FlowControlFactorsExt, KvEngine, MiscExt, - RaftEngine, TabletFactory, CF_DEFAULT, CF_LOCK, CF_WRITE, -}; -use error_code::ErrorCodeExt; -use file_system::{ - get_io_rate_limiter, set_io_rate_limiter, BytesFetcher, File, IOBudgetAdjustor, - MetricsManager as IOMetricsManager, + Engines, KvEngine, MiscExt, RaftEngine, SingletonFactory, TabletContext, 
TabletRegistry, + CF_DEFAULT, CF_WRITE, }; +use file_system::{get_io_rate_limiter, BytesFetcher, MetricsManager as IoMetricsManager}; use futures::executor::block_on; use grpcio::{EnvBuilder, Environment}; use grpcio_health::HealthService; use kvproto::{ brpb::create_backup, cdcpb::create_change_data, deadlock::create_deadlock, debugpb::create_debug, diagnosticspb::create_diagnostics, import_sstpb::create_import_sst, - kvrpcpb::ApiVersion, resource_usage_agent::create_resource_metering_pub_sub, + kvrpcpb::ApiVersion, logbackuppb::create_log_backup, recoverdatapb::create_recover_data, + resource_usage_agent::create_resource_metering_pub_sub, +}; +use pd_client::{ + meta_storage::{Checked, Sourced}, + PdClient, RpcClient, }; -use pd_client::{PdClient, RpcClient}; use raft_log_engine::RaftLogEngine; use raftstore::{ coprocessor::{ config::SplitCheckConfigManager, BoxConsistencyCheckObserver, ConsistencyCheckMethod, CoprocessorHost, RawConsistencyCheckObserver, RegionInfoAccessor, }, - router::ServerRaftStoreRouter, + router::{CdcRaftRouter, ServerRaftStoreRouter}, store::{ config::RaftstoreConfigManager, fsm, @@ -72,64 +64,83 @@ use raftstore::{ RaftBatchSystem, RaftRouter, StoreMeta, MULTI_FILES_SNAPSHOT_FEATURE, PENDING_MSG_CAP, }, memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, - AutoSplitController, CheckLeaderRunner, GlobalReplicationState, LocalReader, SnapManager, - SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, + AutoSplitController, CheckLeaderRunner, LocalReader, SnapManager, SnapManagerBuilder, + SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, }, + RaftRouterCompactedEventSender, +}; +use resolved_ts::LeadershipResolver; +use resource_control::{ + ResourceGroupManager, ResourceManagerService, MIN_PRIORITY_UPDATE_INTERVAL, }; use security::SecurityManager; +use snap_recovery::RecoveryService; use tikv::{ - config::{ConfigController, DBConfigManger, DBType, TiKvConfig}, + config::{ConfigController, DbConfigManger, DbType, LogConfigManager, 
TikvConfig}, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, import::{ImportSstService, SstImporter}, - read_pool::{build_yatp_read_pool, ReadPool, ReadPoolConfigManager}, + read_pool::{ + build_yatp_read_pool, ReadPool, ReadPoolConfigManager, UPDATE_EWMA_TIME_SLICE_INTERVAL, + }, server::{ config::{Config as ServerConfig, ServerConfigManager}, - create_raft_storage, gc_worker::{AutoGcConfig, GcWorker}, lock_manager::LockManager, raftkv::ReplicaReadLockChecker, resolve, service::{DebugService, DiagnosticsService}, status_server::StatusServer, + tablet_snap::NoSnapshotCache, ttl::TtlChecker, - KvEngineFactoryBuilder, Node, RaftKv, Server, CPU_CORES_QUOTA_GAUGE, DEFAULT_CLUSTER_ID, - GRPC_THREAD_PREFIX, + KvEngineFactoryBuilder, Node, RaftKv, Server, CPU_CORES_QUOTA_GAUGE, GRPC_THREAD_PREFIX, }, storage::{ - self, config_manager::StorageConfigManger, mvcc::MvccConsistencyCheckObserver, - txn::flow_controller::FlowController, Engine, + self, + config::EngineType, + config_manager::StorageConfigManger, + kv::LocalTablets, + mvcc::MvccConsistencyCheckObserver, + txn::flow_controller::{EngineFlowController, FlowController}, + Engine, Storage, }, }; use tikv_util::{ check_environment_variables, - config::{ensure_dir_exist, RaftDataStateMachine, VersionTrack}, - math::MovingAvgU32, + config::VersionTrack, quota_limiter::{QuotaLimitConfigManager, QuotaLimiter}, - sys::{disk, register_memory_usage_high_water, SysQuota}, + sys::{disk, path_in_diff_mount_point, register_memory_usage_high_water, SysQuota}, thread_group::GroupProperties, time::{Instant, Monitor}, worker::{Builder as WorkerBuilder, LazyWorker, Scheduler, Worker}, + yatp_pool::CleanupMethod, + Either, }; use tokio::runtime::Builder; -use crate::{memory::*, raft_engine_switch::*, setup::*, signal_handler}; +use crate::{ + common::{ConfiguredRaftEngine, EngineMetricsManager, EnginesResourceInfo, TikvServerCore}, + memory::*, + setup::*, + signal_handler, + 
tikv_util::sys::thread::ThreadBuildWrapper, +}; #[inline] -fn run_impl(config: TiKvConfig) { - let mut tikv = TiKvServer::::init(config); +fn run_impl(config: TikvConfig) { + let mut tikv = TikvServer::::init::(config); - // Must be called after `TiKvServer::init`. - let memory_limit = tikv.config.memory_usage_limit.unwrap().0; - let high_water = (tikv.config.memory_usage_high_water * memory_limit as f64) as u64; + // Must be called after `TikvServer::init`. + let memory_limit = tikv.core.config.memory_usage_limit.unwrap().0; + let high_water = (tikv.core.config.memory_usage_high_water * memory_limit as f64) as u64; register_memory_usage_high_water(high_water); - tikv.check_conflict_addr(); - tikv.init_fs(); - tikv.init_yatp(); - tikv.init_encryption(); - let fetcher = tikv.init_io_utility(); - let listener = tikv.init_flow_receiver(); + tikv.core.check_conflict_addr(); + tikv.core.init_fs(); + tikv.core.init_yatp(); + tikv.core.init_encryption(); + let fetcher = tikv.core.init_io_utility(); + let listener = tikv.core.init_flow_receiver(); let (engines, engines_info) = tikv.init_raw_engines(listener); tikv.init_engines(engines.clone()); let server_config = tikv.init_servers::(); @@ -138,14 +149,19 @@ fn run_impl(config: TiKvConfig) { tikv.init_storage_stats_task(engines); tikv.run_server(server_config); tikv.run_status_server(); + tikv.core.init_quota_tuning_task(tikv.quota_limiter.clone()); - signal_handler::wait_for_signal(Some(tikv.engines.take().unwrap().engines)); + signal_handler::wait_for_signal( + Some(tikv.engines.take().unwrap().engines), + tikv.kv_statistics.clone(), + tikv.raft_statistics.clone(), + ); tikv.stop(); } /// Run a TiKV server. Returns when the server is shutdown by the user, in which /// case the server will be properly stopped. -pub fn run_tikv(config: TiKvConfig) { +pub fn run_tikv(config: TikvConfig) { // Sets the global logger ASAP. 
// It is okay to use the config w/o `validate()`, // because `initial_logger()` handles various conditions. @@ -173,42 +189,38 @@ pub fn run_tikv(config: TiKvConfig) { }) } -const RESERVED_OPEN_FDS: u64 = 1000; - const DEFAULT_METRICS_FLUSH_INTERVAL: Duration = Duration::from_millis(10_000); const DEFAULT_MEMTRACE_FLUSH_INTERVAL: Duration = Duration::from_millis(1_000); -const DEFAULT_ENGINE_METRICS_RESET_INTERVAL: Duration = Duration::from_millis(60_000); const DEFAULT_STORAGE_STATS_INTERVAL: Duration = Duration::from_secs(1); /// A complete TiKV server. -struct TiKvServer { - config: TiKvConfig, +struct TikvServer { + core: TikvServerCore, cfg_controller: Option, security_mgr: Arc, pd_client: Arc, router: RaftRouter, - flow_info_sender: Option>, - flow_info_receiver: Option>, system: Option>, - resolver: resolve::PdStoreAddrResolver, - state: Arc>, - store_path: PathBuf, + resolver: Option, snap_mgr: Option, // Will be filled in `init_servers`. - encryption_key_manager: Option>, - engines: Option>, + engines: Option>, + kv_statistics: Option>, + raft_statistics: Option>, servers: Option>, region_info_accessor: RegionInfoAccessor, coprocessor_host: Option>, - to_stop: Vec>, - lock_files: Vec, concurrency_manager: ConcurrencyManager, env: Arc, - background_worker: Worker, + check_leader_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, + resource_manager: Option>, + causal_ts_provider: Option>, // used for rawkv apiv2 + tablet_registry: Option>, + br_snap_recovery_mode: bool, // use for br snapshot recovery } -struct TiKvEngines { +struct TikvEngines { engines: Engines, store_meta: Arc>, engine: RaftKv>, @@ -222,14 +234,17 @@ struct Servers { cdc_scheduler: tikv_util::worker::Scheduler, cdc_memory_quota: MemoryQuota, rsmeter_pubsub_service: resource_metering::PubSubService, + backup_stream_scheduler: Option>, } -type LocalServer = - Server, resolve::PdStoreAddrResolver, LocalRaftKv>; +type LocalServer = Server>; type LocalRaftKv = RaftKv>; -impl TiKvServer 
{ - fn init(mut config: TiKvConfig) -> TiKvServer { +impl TikvServer +where + ER: RaftEngine, +{ + fn init(mut config: TikvConfig) -> TikvServer { tikv_util::thread_group::set_properties(Some(GroupProperties::default())); // It is okay use pd config and security config before `init_config`, // because these configs must be provided by command line, and only @@ -244,24 +259,68 @@ impl TiKvServer { .name_prefix(thd_name!(GRPC_THREAD_PREFIX)) .build(), ); - let pd_client = - Self::connect_to_pd_cluster(&mut config, env.clone(), Arc::clone(&security_mgr)); + let pd_client = TikvServerCore::connect_to_pd_cluster( + &mut config, + env.clone(), + Arc::clone(&security_mgr), + ); + // check if TiKV need to run in snapshot recovery mode + let is_recovering_marked = match pd_client.is_recovering_marked() { + Err(e) => { + warn!( + "failed to get recovery mode from PD"; + "error" => ?e, + ); + false + } + Ok(marked) => marked, + }; + + if is_recovering_marked { + // Run a TiKV server in recovery modeß + info!("TiKV running in Snapshot Recovery Mode"); + snap_recovery::init_cluster::enter_snap_recovery_mode(&mut config); + // connect_to_pd_cluster retreived the cluster id from pd + let cluster_id = config.server.cluster_id; + snap_recovery::init_cluster::start_recovery( + config.clone(), + cluster_id, + pd_client.clone(), + ); + } // Initialize and check config - let cfg_controller = Self::init_config(config); + let cfg_controller = TikvServerCore::init_config(config); let config = cfg_controller.get_current(); let store_path = Path::new(&config.storage.data_dir).to_owned(); - // Initialize raftstore channels. 
- let (router, system) = fsm::create_raft_batch_system(&config.raft_store); - let thread_count = config.server.background_thread_count; let background_worker = WorkerBuilder::new("background") .thread_count(thread_count) .create(); - let (resolver, state) = - resolve::new_resolver(Arc::clone(&pd_client), &background_worker, router.clone()); + + let resource_manager = if config.resource_control.enabled { + let mgr = Arc::new(ResourceGroupManager::default()); + let mut resource_mgr_service = + ResourceManagerService::new(mgr.clone(), pd_client.clone()); + // spawn a task to periodically update the minimal virtual time of all resource + // groups. + let resource_mgr = mgr.clone(); + background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { + resource_mgr.advance_min_virtual_time(); + }); + // spawn a task to watch all resource groups update. + background_worker.spawn_async_task(async move { + resource_mgr_service.watch_resource_groups().await; + }); + Some(mgr) + } else { + None + }; + + // Initialize raftstore channels. 
+ let (router, system) = fsm::create_raft_batch_system(&config.raft_store, &resource_manager); let mut coprocessor_host = Some(CoprocessorHost::new( router.clone(), @@ -273,248 +332,90 @@ impl TiKvServer { let latest_ts = block_on(pd_client.get_tso()).expect("failed to get timestamp from PD"); let concurrency_manager = ConcurrencyManager::new(latest_ts); + // use different quota for front-end and back-end requests let quota_limiter = Arc::new(QuotaLimiter::new( config.quota.foreground_cpu_time, config.quota.foreground_write_bandwidth, config.quota.foreground_read_bandwidth, + config.quota.background_cpu_time, + config.quota.background_write_bandwidth, + config.quota.background_read_bandwidth, config.quota.max_delay_duration, + config.quota.enable_auto_tune, )); - TiKvServer { - config, + let mut causal_ts_provider = None; + if let ApiVersion::V2 = F::TAG { + let tso = block_on(causal_ts::BatchTsoProvider::new_opt( + pd_client.clone(), + config.causal_ts.renew_interval.0, + config.causal_ts.alloc_ahead_buffer.0, + config.causal_ts.renew_batch_min_size, + config.causal_ts.renew_batch_max_size, + )); + if let Err(e) = tso { + fatal!("Causal timestamp provider initialize failed: {:?}", e); + } + causal_ts_provider = Some(Arc::new(tso.unwrap().into())); + info!("Causal timestamp provider startup."); + } + + // Run check leader in a dedicate thread, because it is time sensitive + // and crucial to TiCDC replication lag. 
+ let check_leader_worker = WorkerBuilder::new("check-leader").thread_count(1).create(); + + TikvServer { + core: TikvServerCore { + config, + store_path, + lock_files: vec![], + encryption_key_manager: None, + flow_info_sender: None, + flow_info_receiver: None, + to_stop: vec![], + background_worker, + }, cfg_controller: Some(cfg_controller), security_mgr, pd_client, router, system: Some(system), - resolver, - state, - store_path, + resolver: None, snap_mgr: None, - encryption_key_manager: None, engines: None, + kv_statistics: None, + raft_statistics: None, servers: None, region_info_accessor, coprocessor_host, - to_stop: vec![], - lock_files: vec![], concurrency_manager, env, - background_worker, - flow_info_sender: None, - flow_info_receiver: None, + check_leader_worker, sst_worker: None, quota_limiter, + resource_manager, + causal_ts_provider, + tablet_registry: None, + br_snap_recovery_mode: is_recovering_marked, } } - /// Initialize and check the config - /// - /// Warnings are logged and fatal errors exist. - /// - /// # Fatal errors - /// - /// - If `dynamic config` feature is enabled and failed to register config to PD - /// - If some critical configs (like data dir) are differrent from last run - /// - If the config can't pass `validate()` - /// - If the max open file descriptor limit is not high enough to support - /// the main database and the raft database. 
- fn init_config(mut config: TiKvConfig) -> ConfigController { - validate_and_persist_config(&mut config, true); - - ensure_dir_exist(&config.storage.data_dir).unwrap(); - if !config.rocksdb.wal_dir.is_empty() { - ensure_dir_exist(&config.rocksdb.wal_dir).unwrap(); - } - if config.raft_engine.enable { - ensure_dir_exist(&config.raft_engine.config().dir).unwrap(); - } else { - ensure_dir_exist(&config.raft_store.raftdb_path).unwrap(); - if !config.raftdb.wal_dir.is_empty() { - ensure_dir_exist(&config.raftdb.wal_dir).unwrap(); - } - } - - check_system_config(&config); - - tikv_util::set_panic_hook(config.abort_on_panic, &config.storage.data_dir); - - info!( - "using config"; - "config" => serde_json::to_string(&config).unwrap(), - ); - if config.panic_when_unexpected_key_or_data { - info!("panic-when-unexpected-key-or-data is on"); - tikv_util::set_panic_when_unexpected_key_or_data(true); - } - - config.write_into_metrics(); - - ConfigController::new(config) - } - - fn connect_to_pd_cluster( - config: &mut TiKvConfig, - env: Arc, - security_mgr: Arc, - ) -> Arc { - let pd_client = Arc::new( - RpcClient::new(&config.pd, Some(env), security_mgr) - .unwrap_or_else(|e| fatal!("failed to create rpc client: {}", e)), - ); - - let cluster_id = pd_client - .get_cluster_id() - .unwrap_or_else(|e| fatal!("failed to get cluster id: {}", e)); - if cluster_id == DEFAULT_CLUSTER_ID { - fatal!("cluster id can't be {}", DEFAULT_CLUSTER_ID); - } - config.server.cluster_id = cluster_id; - info!( - "connect to PD cluster"; - "cluster_id" => cluster_id - ); - - pd_client - } - - fn check_conflict_addr(&mut self) { - let cur_addr: SocketAddr = self - .config - .server - .addr - .parse() - .expect("failed to parse into a socket address"); - let cur_ip = cur_addr.ip(); - let cur_port = cur_addr.port(); - let lock_dir = get_lock_dir(); - - let search_base = env::temp_dir().join(&lock_dir); - file_system::create_dir_all(&search_base) - .unwrap_or_else(|_| panic!("create {} failed", 
search_base.display())); - - for entry in file_system::read_dir(&search_base).unwrap().flatten() { - if !entry.file_type().unwrap().is_file() { - continue; - } - let file_path = entry.path(); - let file_name = file_path.file_name().unwrap().to_str().unwrap(); - if let Ok(addr) = file_name.replace('_', ":").parse::() { - let ip = addr.ip(); - let port = addr.port(); - if cur_port == port - && (cur_ip == ip || cur_ip.is_unspecified() || ip.is_unspecified()) - { - let _ = try_lock_conflict_addr(file_path); - } - } - } - - let cur_path = search_base.join(cur_addr.to_string().replace(':', "_")); - let cur_file = try_lock_conflict_addr(cur_path); - self.lock_files.push(cur_file); - } - - fn init_fs(&mut self) { - let lock_path = self.store_path.join(Path::new("LOCK")); - - let f = File::create(lock_path.as_path()) - .unwrap_or_else(|e| fatal!("failed to create lock at {}: {}", lock_path.display(), e)); - if f.try_lock_exclusive().is_err() { - fatal!( - "lock {} failed, maybe another instance is using this directory.", - self.store_path.display() - ); - } - self.lock_files.push(f); - - if tikv_util::panic_mark_file_exists(&self.config.storage.data_dir) { - fatal!( - "panic_mark_file {} exists, there must be something wrong with the db. \ - Do not remove the panic_mark_file and force the TiKV node to restart. \ - Please contact TiKV maintainers to investigate the issue. \ - If needed, use scale in and scale out to replace the TiKV node. \ - https://docs.pingcap.com/tidb/stable/scale-tidb-using-tiup", - tikv_util::panic_mark_file_path(&self.config.storage.data_dir).display() - ); - } - - // We truncate a big file to make sure that both raftdb and kvdb of TiKV have enough space - // to do compaction and region migration when TiKV recover. This file is created in - // data_dir rather than db_path, because we must not increase store size of db_path. 
- let disk_stats = fs2::statvfs(&self.config.storage.data_dir).unwrap(); - let mut capacity = disk_stats.total_space(); - if self.config.raft_store.capacity.0 > 0 { - capacity = cmp::min(capacity, self.config.raft_store.capacity.0); - } - let mut reserve_space = self.config.storage.reserve_space.0; - if self.config.storage.reserve_space.0 != 0 { - reserve_space = cmp::max( - (capacity as f64 * 0.05) as u64, - self.config.storage.reserve_space.0, - ); - } - disk::set_disk_reserved_space(reserve_space); - let path = - Path::new(&self.config.storage.data_dir).join(file_system::SPACE_PLACEHOLDER_FILE); - if let Err(e) = file_system::remove_file(&path) { - warn!("failed to remove space holder on starting: {}", e); - } - - let available = disk_stats.available_space(); - // place holder file size is 20% of total reserved space. - if available > reserve_space { - file_system::reserve_space_for_recover( - &self.config.storage.data_dir, - reserve_space / 5, - ) - .map_err(|e| panic!("Failed to reserve space for recovery: {}.", e)) - .unwrap(); - } else { - warn!("no enough disk space left to create the place holder file"); - } - } - - fn init_yatp(&self) { - yatp::metrics::set_namespace(Some("tikv")); - prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL0_CHANCE.clone())).unwrap(); - prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL_ELAPSED.clone())).unwrap(); - } - - fn init_encryption(&mut self) { - self.encryption_key_manager = data_key_manager_from_config( - &self.config.security.encryption, - &self.config.storage.data_dir, - ) - .map_err(|e| { - panic!( - "Encryption failed to initialize: {}. 
code: {}", - e, - e.error_code() - ) - }) - .unwrap() - .map(Arc::new); - } - - fn init_flow_receiver(&mut self) -> engine_rocks::FlowListener { - let (tx, rx) = mpsc::channel(); - self.flow_info_sender = Some(tx.clone()); - self.flow_info_receiver = Some(rx); - engine_rocks::FlowListener::new(tx) - } - fn init_engines(&mut self, engines: Engines) { let store_meta = Arc::new(Mutex::new(StoreMeta::new(PENDING_MSG_CAP))); let engine = RaftKv::new( ServerRaftStoreRouter::new( self.router.clone(), - LocalReader::new(engines.kv.clone(), store_meta.clone(), self.router.clone()), + LocalReader::new( + engines.kv.clone(), + StoreMetaDelegate::new(store_meta.clone(), engines.kv.clone()), + self.router.clone(), + ), ), engines.kv.clone(), + self.region_info_accessor.region_leaders(), ); - self.engines = Some(TiKvEngines { + self.engines = Some(TikvEngines { engines, store_meta, engine, @@ -523,27 +424,15 @@ impl TiKvServer { fn init_gc_worker( &mut self, - ) -> GcWorker< - RaftKv>, - RaftRouter, - > { + ) -> GcWorker>> { let engines = self.engines.as_ref().unwrap(); - let mut gc_worker = GcWorker::new( + let gc_worker = GcWorker::new( engines.engine.clone(), - self.router.clone(), - self.flow_info_sender.take().unwrap(), - self.config.gc.clone(), + self.core.flow_info_sender.take().unwrap(), + self.core.config.gc.clone(), self.pd_client.feature_gate().clone(), + Arc::new(self.region_info_accessor.clone()), ); - gc_worker - .start() - .unwrap_or_else(|e| fatal!("failed to start gc worker: {}", e)); - gc_worker - .start_observe_lock_apply( - self.coprocessor_host.as_mut().unwrap(), - self.concurrency_manager.clone(), - ) - .unwrap_or_else(|e| fatal!("gc worker failed to observe lock apply: {}", e)); let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( @@ -555,12 +444,12 @@ impl TiKvServer { } fn init_servers(&mut self) -> Arc> { - let flow_controller = Arc::new(FlowController::new( - &self.config.storage.flow_control, - 
self.engines.as_ref().unwrap().engine.kv_engine(), - self.flow_info_receiver.take().unwrap(), - )); - let gc_worker = self.init_gc_worker(); + let flow_controller = Arc::new(FlowController::Singleton(EngineFlowController::new( + &self.core.config.storage.flow_control, + self.engines.as_ref().unwrap().engine.kv_engine().unwrap(), + self.core.flow_info_receiver.take().unwrap(), + ))); + let mut gc_worker = self.init_gc_worker(); let mut ttl_checker = Box::new(LazyWorker::new("ttl-checker")); let ttl_scheduler = ttl_checker.scheduler(); @@ -573,6 +462,8 @@ impl TiKvServer { ))), ); + cfg_controller.register(tikv::config::Module::Log, Box::new(LogConfigManager)); + // Create cdc. let mut cdc_worker = Box::new(LazyWorker::new("cdc")); let cdc_scheduler = cdc_worker.scheduler(); @@ -584,7 +475,7 @@ impl TiKvServer { .engine .set_txn_extra_scheduler(Arc::new(txn_extra_scheduler)); - let lock_mgr = LockManager::new(&self.config.pessimistic_txn); + let lock_mgr = LockManager::new(&self.core.config.pessimistic_txn); cfg_controller.register( tikv::config::Module::PessimisticTxn, Box::new(lock_mgr.config_manager()), @@ -598,23 +489,42 @@ impl TiKvServer { if let Some(sst_worker) = &mut self.sst_worker { let sst_runner = RecoveryRunner::new( - engines.engines.kv.get_sync_db(), + engines.engines.kv.clone(), engines.store_meta.clone(), - self.config.storage.background_error_recovery_window.into(), + self.core + .config + .storage + .background_error_recovery_window + .into(), DEFAULT_CHECK_INTERVAL, ); sst_worker.start_with_timer(sst_runner); } - let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { + let unified_read_pool = if self.core.config.readpool.is_unified_pool_enabled() { + let resource_ctl = self + .resource_manager + .as_ref() + .map(|m| m.derive_controller("unified-read-pool".into(), true)); Some(build_yatp_read_pool( - &self.config.readpool.unified, + &self.core.config.readpool.unified, pd_sender.clone(), engines.engine.clone(), + resource_ctl, 
+ CleanupMethod::Remote(self.core.background_worker.remote()), )) } else { None }; + if let Some(unified_read_pool) = &unified_read_pool { + let handle = unified_read_pool.handle(); + self.core.background_worker.spawn_interval_task( + UPDATE_EWMA_TIME_SLICE_INTERVAL, + move || { + handle.update_ewma_time_slice(); + }, + ); + } // The `DebugService` and `DiagnosticsService` will share the same thread pool let props = tikv_util::thread_group::current_properties(); @@ -622,35 +532,37 @@ impl TiKvServer { Builder::new_multi_thread() .thread_name(thd_name!("debugger")) .worker_threads(1) - .on_thread_start(move || { + .after_start_wrapper(move || { tikv_alloc::add_thread_memory_accessor(); tikv_util::thread_group::set_properties(props.clone()); }) - .on_thread_stop(tikv_alloc::remove_thread_memory_accessor) + .before_stop_wrapper(tikv_alloc::remove_thread_memory_accessor) .build() .unwrap(), ); // Start resource metering. let (recorder_notifier, collector_reg_handle, resource_tag_factory, recorder_worker) = - resource_metering::init_recorder(self.config.resource_metering.precision.as_millis()); - self.to_stop.push(recorder_worker); + resource_metering::init_recorder( + self.core.config.resource_metering.precision.as_millis(), + ); + self.core.to_stop.push(recorder_worker); let (reporter_notifier, data_sink_reg_handle, reporter_worker) = resource_metering::init_reporter( - self.config.resource_metering.clone(), + self.core.config.resource_metering.clone(), collector_reg_handle.clone(), ); - self.to_stop.push(reporter_worker); + self.core.to_stop.push(reporter_worker); let (address_change_notifier, single_target_worker) = resource_metering::init_single_target( - self.config.resource_metering.receiver_address.clone(), + self.core.config.resource_metering.receiver_address.clone(), self.env.clone(), data_sink_reg_handle.clone(), ); - self.to_stop.push(single_target_worker); + self.core.to_stop.push(single_target_worker); let rsmeter_pubsub_service = 
resource_metering::PubSubService::new(data_sink_reg_handle); let cfg_manager = resource_metering::ConfigManager::new( - self.config.resource_metering.clone(), + self.core.config.resource_metering.clone(), recorder_notifier, reporter_notifier, address_change_notifier, @@ -660,20 +572,20 @@ impl TiKvServer { Box::new(cfg_manager), ); - let storage_read_pool_handle = if self.config.readpool.storage.use_unified_pool() { + let storage_read_pool_handle = if self.core.config.readpool.storage.use_unified_pool() { unified_read_pool.as_ref().unwrap().handle() } else { let storage_read_pools = ReadPool::from(storage::build_read_pool( - &self.config.readpool.storage, + &self.core.config.readpool.storage, pd_sender.clone(), engines.engine.clone(), )); storage_read_pools.handle() }; - let storage = create_raft_storage::<_, _, _, F>( + let storage = Storage::<_, _, F>::from_engine( engines.engine.clone(), - &self.config.storage, + &self.core.config.storage, storage_read_pool_handle, lock_mgr.clone(), self.concurrency_manager.clone(), @@ -683,81 +595,85 @@ impl TiKvServer { resource_tag_factory.clone(), Arc::clone(&self.quota_limiter), self.pd_client.feature_gate().clone(), + self.causal_ts_provider.clone(), + self.resource_manager + .as_ref() + .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), ) .unwrap_or_else(|e| fatal!("failed to create raft storage: {}", e)); cfg_controller.register( tikv::config::Module::Storage, Box::new(StorageConfigManger::new( - self.engines.as_ref().unwrap().engine.kv_engine(), - self.config.storage.block_cache.shared, + self.tablet_registry.as_ref().unwrap().clone(), ttl_scheduler, flow_controller, storage.get_scheduler(), )), ); + let (resolver, state) = resolve::new_resolver( + self.pd_client.clone(), + &self.core.background_worker, + storage.get_engine().raft_extension(), + ); + self.resolver = Some(resolver); + ReplicaReadLockChecker::new(self.concurrency_manager.clone()) .register(self.coprocessor_host.as_mut().unwrap()); // 
Create snapshot manager, server. let snap_path = self + .core .store_path .join(Path::new("snap")) .to_str() .unwrap() .to_owned(); - let bps = i64::try_from(self.config.server.snap_max_write_bytes_per_sec.0) - .unwrap_or_else(|_| fatal!("snap_max_write_bytes_per_sec > i64::max_value")); + let bps = i64::try_from(self.core.config.server.snap_io_max_bytes_per_sec.0) + .unwrap_or_else(|_| fatal!("snap_io_max_bytes_per_sec > i64::max_value")); let snap_mgr = SnapManagerBuilder::default() .max_write_bytes_per_sec(bps) - .max_total_size(self.config.server.snap_max_total_size.0) - .encryption_key_manager(self.encryption_key_manager.clone()) - .max_per_file_size(self.config.raft_store.max_snapshot_file_raw_size.0) + .max_total_size(self.core.config.server.snap_max_total_size.0) + .encryption_key_manager(self.core.encryption_key_manager.clone()) + .max_per_file_size(self.core.config.raft_store.max_snapshot_file_raw_size.0) .enable_multi_snapshot_files( self.pd_client .feature_gate() .can_enable(MULTI_FILES_SNAPSHOT_FEATURE), ) + .enable_receive_tablet_snapshot( + self.core.config.raft_store.enable_v2_compatible_learner, + ) .build(snap_path); // Create coprocessor endpoint. 
- let cop_read_pool_handle = if self.config.readpool.coprocessor.use_unified_pool() { + let cop_read_pool_handle = if self.core.config.readpool.coprocessor.use_unified_pool() { unified_read_pool.as_ref().unwrap().handle() } else { let cop_read_pools = ReadPool::from(coprocessor::readpool_impl::build_read_pool( - &self.config.readpool.coprocessor, + &self.core.config.readpool.coprocessor, pd_sender, engines.engine.clone(), )); cop_read_pools.handle() }; - if self.config.readpool.is_unified_pool_enabled() { + let mut unified_read_pool_scale_receiver = None; + if self.core.config.readpool.is_unified_pool_enabled() { + let (unified_read_pool_scale_notifier, rx) = mpsc::sync_channel(10); cfg_controller.register( tikv::config::Module::Readpool, - Box::new(ReadPoolConfigManager( + Box::new(ReadPoolConfigManager::new( unified_read_pool.as_ref().unwrap().handle(), + unified_read_pool_scale_notifier, + &self.core.background_worker, + self.core.config.readpool.unified.max_thread_count, + self.core.config.readpool.unified.auto_adjust_pool_size, )), ); - } - - // Register causal observer for RawKV API V2 - if let ApiVersion::V2 = F::TAG { - let tso = block_on(causal_ts::BatchTsoProvider::new_opt( - self.pd_client.clone(), - self.config.causal_ts.renew_interval.0, - self.config.causal_ts.renew_batch_min_size, - )); - if let Err(e) = tso { - panic!("Causal timestamp provider initialize failed: {:?}", e); - } - let causal_ts_provider = Arc::new(tso.unwrap()); - info!("Causal timestamp provider startup."); - - let causal_ob = causal_ts::CausalObserver::new(causal_ts_provider); - causal_ob.register_to(self.coprocessor_host.as_mut().unwrap()); + unified_read_pool_scale_receiver = Some(rx); } // Register cdc. @@ -765,12 +681,12 @@ impl TiKvServer { cdc_ob.register_to(self.coprocessor_host.as_mut().unwrap()); // Register cdc config manager. 
cfg_controller.register( - tikv::config::Module::CDC, + tikv::config::Module::Cdc, Box::new(CdcConfigManager(cdc_worker.scheduler())), ); // Create resolved ts worker - let rts_worker = if self.config.resolved_ts.enable { + let rts_worker = if self.core.config.resolved_ts.enable { let worker = Box::new(LazyWorker::new("resolved-ts")); // Register the resolved ts observer let resolved_ts_ob = resolved_ts::Observer::new(worker.scheduler()); @@ -787,32 +703,37 @@ impl TiKvServer { None }; - let check_leader_runner = CheckLeaderRunner::new(engines.store_meta.clone()); + let check_leader_runner = CheckLeaderRunner::new( + engines.store_meta.clone(), + self.coprocessor_host.clone().unwrap(), + ); let check_leader_scheduler = self - .background_worker + .check_leader_worker .start("check-leader", check_leader_runner); - let server_config = Arc::new(VersionTrack::new(self.config.server.clone())); + let server_config = Arc::new(VersionTrack::new(self.core.config.server.clone())); - self.config + self.core + .config .raft_store .validate( - self.config.coprocessor.region_split_size, - self.config.coprocessor.enable_region_bucket, - self.config.coprocessor.region_bucket_size, + self.core.config.coprocessor.region_split_size(), + self.core.config.coprocessor.enable_region_bucket(), + self.core.config.coprocessor.region_bucket_size, ) .unwrap_or_else(|e| fatal!("failed to validate raftstore config {}", e)); - let raft_store = Arc::new(VersionTrack::new(self.config.raft_store.clone())); + let raft_store = Arc::new(VersionTrack::new(self.core.config.raft_store.clone())); let health_service = HealthService::default(); let mut node = Node::new( self.system.take().unwrap(), &server_config.value().clone(), raft_store.clone(), - self.config.storage.api_version(), + self.core.config.storage.api_version(), self.pd_client.clone(), - self.state.clone(), - self.background_worker.clone(), + state, + self.core.background_worker.clone(), Some(health_service.clone()), + None, ); 
node.try_bootstrap_store(engines.engines.clone()) .unwrap_or_else(|e| fatal!("failed to bootstrap node id: {}", e)); @@ -829,18 +750,18 @@ impl TiKvServer { cop_read_pool_handle, self.concurrency_manager.clone(), resource_tag_factory, - Arc::clone(&self.quota_limiter), + self.quota_limiter.clone(), ), - coprocessor_v2::Endpoint::new(&self.config.coprocessor_v2), - self.router.clone(), - self.resolver.clone(), - snap_mgr.clone(), + coprocessor_v2::Endpoint::new(&self.core.config.coprocessor_v2), + self.resolver.clone().unwrap(), + Either::Left(snap_mgr.clone()), gc_worker.clone(), check_leader_scheduler, self.env.clone(), unified_read_pool, debug_thread_pool, health_service, + self.resource_manager.clone(), ) .unwrap_or_else(|e| fatal!("failed to create server: {}", e)); cfg_controller.register( @@ -853,7 +774,7 @@ impl TiKvServer { ); // Start backup stream - if self.config.backup_stream.enable { + let backup_stream_scheduler = if self.core.config.log_backup.enable { // Create backup stream. let mut backup_stream_worker = Box::new(LazyWorker::new("backup-stream")); let backup_stream_scheduler = backup_stream_worker.scheduler(); @@ -864,48 +785,73 @@ impl TiKvServer { // Register config manager. 
cfg_controller.register( tikv::config::Module::BackupStream, - Box::new(BackupStreamConfigManager(backup_stream_worker.scheduler())), + Box::new(BackupStreamConfigManager::new( + backup_stream_worker.scheduler(), + self.core.config.log_backup.clone(), + )), ); - let etcd_cli = LazyEtcdClient::new( - self.config.pd.endpoints.as_slice(), - ConnectionConfig { - keep_alive_interval: self.config.server.grpc_keepalive_time.0, - keep_alive_timeout: self.config.server.grpc_keepalive_timeout.0, - tls: self.security_mgr.tonic_tls_config(), - }, + let region_read_progress = engines + .store_meta + .lock() + .unwrap() + .region_read_progress + .clone(); + let leadership_resolver = LeadershipResolver::new( + node.id(), + self.pd_client.clone(), + self.env.clone(), + self.security_mgr.clone(), + region_read_progress, + Duration::from_secs(60), ); + let backup_stream_endpoint = backup_stream::Endpoint::new( node.id(), - etcd_cli, - self.config.backup_stream.clone(), - backup_stream_scheduler, + PdStore::new(Checked::new(Sourced::new( + Arc::clone(&self.pd_client), + pd_client::meta_storage::Source::LogBackup, + ))), + self.core.config.log_backup.clone(), + backup_stream_scheduler.clone(), backup_stream_ob, self.region_info_accessor.clone(), - self.router.clone(), + CdcRaftRouter(self.router.clone()), self.pd_client.clone(), self.concurrency_manager.clone(), + BackupStreamResolver::V1(leadership_resolver), ); backup_stream_worker.start(backup_stream_endpoint); - self.to_stop.push(backup_stream_worker); - } + self.core.to_stop.push(backup_stream_worker); + Some(backup_stream_scheduler) + } else { + None + }; - let import_path = self.store_path.join("import"); + let import_path = self.core.store_path.join("import"); let mut importer = SstImporter::new( - &self.config.import, + &self.core.config.import, import_path, - self.encryption_key_manager.clone(), - self.config.storage.api_version(), + self.core.encryption_key_manager.clone(), + self.core.config.storage.api_version(), ) 
.unwrap(); for (cf_name, compression_type) in &[ ( CF_DEFAULT, - self.config.rocksdb.defaultcf.bottommost_level_compression, + self.core + .config + .rocksdb + .defaultcf + .bottommost_level_compression, ), ( CF_WRITE, - self.config.rocksdb.writecf.bottommost_level_compression, + self.core + .config + .rocksdb + .writecf + .bottommost_level_compression, ), ] { importer.set_compression_type(cf_name, from_rocks_compression_type(*compression_type)); @@ -918,6 +864,7 @@ impl TiKvServer { self.coprocessor_host.clone().unwrap(), ); let split_check_scheduler = self + .core .background_worker .start("split-check", split_check_runner); cfg_controller.register( @@ -926,17 +873,22 @@ impl TiKvServer { ); let split_config_manager = - SplitConfigManager::new(Arc::new(VersionTrack::new(self.config.split.clone()))); + SplitConfigManager::new(Arc::new(VersionTrack::new(self.core.config.split.clone()))); cfg_controller.register( tikv::config::Module::Split, Box::new(split_config_manager.clone()), ); - let auto_split_controller = AutoSplitController::new(split_config_manager); + let auto_split_controller = AutoSplitController::new( + split_config_manager, + self.core.config.server.grpc_concurrency, + self.core.config.readpool.unified.max_thread_count, + unified_read_pool_scale_receiver, + ); // `ConsistencyCheckObserver` must be registered before `Node::start`. let safe_point = Arc::new(AtomicU64::new(0)); - let observer = match self.config.coprocessor.consistency_check_method { + let observer = match self.core.config.coprocessor.consistency_check_method { ConsistencyCheckMethod::Mvcc => BoxConsistencyCheckObserver::new( MvccConsistencyCheckObserver::new(safe_point.clone()), ), @@ -962,66 +914,71 @@ impl TiKvServer { auto_split_controller, self.concurrency_manager.clone(), collector_reg_handle, + self.causal_ts_provider.clone(), ) .unwrap_or_else(|e| fatal!("failed to start node: {}", e)); - // Start auto gc. Must after `Node::start` because `node_id` is initialized there. 
+ // Start auto gc. Must after `Node::start` because `node_id` is initialized + // there. assert!(node.id() > 0); // Node id should never be 0. let auto_gc_config = AutoGcConfig::new( self.pd_client.clone(), self.region_info_accessor.clone(), node.id(), ); + gc_worker + .start(node.id()) + .unwrap_or_else(|e| fatal!("failed to start gc worker: {}", e)); if let Err(e) = gc_worker.start_auto_gc(auto_gc_config, safe_point) { fatal!("failed to start auto_gc on storage, error: {}", e); } - initial_metric(&self.config.metric); - if self.config.storage.enable_ttl { + initial_metric(&self.core.config.metric); + if self.core.config.storage.enable_ttl { ttl_checker.start_with_timer(TtlChecker::new( - self.engines.as_ref().unwrap().engine.kv_engine(), + self.engines.as_ref().unwrap().engine.kv_engine().unwrap(), self.region_info_accessor.clone(), - self.config.storage.ttl_check_poll_interval.into(), + self.core.config.storage.ttl_check_poll_interval.into(), )); - self.to_stop.push(ttl_checker); + self.core.to_stop.push(ttl_checker); } // Start CDC. 
- let cdc_memory_quota = MemoryQuota::new(self.config.cdc.sink_memory_quota.0 as _); + let cdc_memory_quota = MemoryQuota::new(self.core.config.cdc.sink_memory_quota.0 as _); let cdc_endpoint = cdc::Endpoint::new( - self.config.server.cluster_id, - &self.config.cdc, - self.config.storage.api_version(), + self.core.config.server.cluster_id, + &self.core.config.cdc, + self.core.config.storage.engine == EngineType::RaftKv2, + self.core.config.storage.api_version(), self.pd_client.clone(), cdc_scheduler.clone(), - self.router.clone(), - self.engines.as_ref().unwrap().engines.kv.clone(), + CdcRaftRouter(self.router.clone()), + LocalTablets::Singleton(self.engines.as_ref().unwrap().engines.kv.clone()), cdc_ob, engines.store_meta.clone(), self.concurrency_manager.clone(), server.env(), self.security_mgr.clone(), cdc_memory_quota.clone(), + self.causal_ts_provider.clone(), ); cdc_worker.start_with_timer(cdc_endpoint); - self.to_stop.push(cdc_worker); + self.core.to_stop.push(cdc_worker); // Start resolved ts if let Some(mut rts_worker) = rts_worker { let rts_endpoint = resolved_ts::Endpoint::new( - &self.config.resolved_ts, + &self.core.config.resolved_ts, rts_worker.scheduler(), - self.router.clone(), + CdcRaftRouter(self.router.clone()), engines.store_meta.clone(), self.pd_client.clone(), self.concurrency_manager.clone(), server.env(), self.security_mgr.clone(), - // TODO: replace to the cdc sinker - resolved_ts::DummySinker::new(), ); rts_worker.start_with_timer(rts_endpoint); - self.to_stop.push(rts_worker); + self.core.to_stop.push(rts_worker); } cfg_controller.register( @@ -1040,6 +997,7 @@ impl TiKvServer { cdc_scheduler, cdc_memory_quota, rsmeter_pubsub_service, + backup_stream_scheduler, }); server_config @@ -1051,12 +1009,14 @@ impl TiKvServer { // Import SST service. 
let import_service = ImportSstService::new( - self.config.import.clone(), - self.config.raft_store.raft_entry_max_size, - self.router.clone(), - engines.engines.kv.clone(), + self.core.config.import.clone(), + self.core.config.raft_store.raft_entry_max_size, + engines.engine.clone(), + LocalTablets::Singleton(engines.engines.kv.clone()), servers.importer.clone(), ); + let import_cfg_mgr = import_service.get_config_manager(); + if servers .server .register_service(create_import_sst(import_service)) @@ -1065,11 +1025,18 @@ impl TiKvServer { fatal!("failed to register import service"); } + self.cfg_controller + .as_mut() + .unwrap() + .register(tikv::config::Module::Import, Box::new(import_cfg_mgr)); + // Debug service. let debug_service = DebugService::new( engines.engines.clone(), + self.kv_statistics.clone(), + self.raft_statistics.clone(), servers.server.get_debug_thread_pool().clone(), - self.router.clone(), + engines.engine.raft_extension(), self.cfg_controller.as_ref().unwrap().clone(), ); if servers @@ -1083,8 +1050,8 @@ impl TiKvServer { // Create Diagnostics service let diag_service = DiagnosticsService::new( servers.server.get_debug_thread_pool().clone(), - self.config.log.file.filename.clone(), - self.config.slow_log_file.clone(), + self.core.config.log.file.filename.clone(), + self.core.config.slow_log_file.clone(), ); if servers .server @@ -1108,16 +1075,17 @@ impl TiKvServer { .start( servers.node.id(), self.pd_client.clone(), - self.resolver.clone(), + self.resolver.clone().unwrap(), self.security_mgr.clone(), - &self.config.pessimistic_txn, + &self.core.config.pessimistic_txn, ) .unwrap_or_else(|e| fatal!("failed to start lock manager: {}", e)); // Backup service. 
- let mut backup_worker = Box::new(self.background_worker.lazy_build("backup-endpoint")); + let mut backup_worker = Box::new(self.core.background_worker.lazy_build("backup-endpoint")); let backup_scheduler = backup_worker.scheduler(); - let backup_service = backup::Service::new(backup_scheduler); + let backup_service = + backup::Service::::with_router(backup_scheduler, self.router.clone()); if servers .server .register_service(create_backup(backup_service)) @@ -1130,10 +1098,11 @@ impl TiKvServer { servers.node.id(), engines.engine.clone(), self.region_info_accessor.clone(), - engines.engines.kv.as_inner().clone(), - self.config.backup.clone(), + LocalTablets::Singleton(engines.engines.kv.clone()), + self.core.config.backup.clone(), self.concurrency_manager.clone(), - self.config.storage.api_version(), + self.core.config.storage.api_version(), + self.causal_ts_provider.clone(), ); self.cfg_controller.as_mut().unwrap().register( tikv::config::Module::Backup, @@ -1161,28 +1130,31 @@ impl TiKvServer { { warn!("failed to register resource metering pubsub service"); } - } - fn init_io_utility(&mut self) -> BytesFetcher { - let stats_collector_enabled = file_system::init_io_stats_collector() - .map_err(|e| warn!("failed to init I/O stats collector: {}", e)) - .is_ok(); + if let Some(sched) = servers.backup_stream_scheduler.take() { + let pitr_service = backup_stream::Service::new(sched); + if servers + .server + .register_service(create_log_backup(pitr_service)) + .is_some() + { + fatal!("failed to register log backup service"); + } + } - let limiter = Arc::new( - self.config - .storage - .io_rate_limit - .build(!stats_collector_enabled /*enable_statistics*/), - ); - let fetcher = if stats_collector_enabled { - BytesFetcher::FromIOStatsCollector() - } else { - BytesFetcher::FromRateLimiter(limiter.statistics().unwrap()) - }; - // Set up IO limiter even when rate limit is disabled, so that rate limits can be - // dynamically applied later on. 
- set_io_rate_limiter(Some(limiter)); - fetcher + // the present tikv in recovery mode, start recovery service + if self.br_snap_recovery_mode { + let recovery_service = + RecoveryService::new(engines.engines.clone(), self.router.clone()); + + if servers + .server + .register_service(create_recover_data(recovery_service)) + .is_some() + { + fatal!("failed to register recovery service"); + } + } } fn init_metrics_flusher( @@ -1191,17 +1163,30 @@ impl TiKvServer { engines_info: Arc, ) { let mut engine_metrics = EngineMetricsManager::::new( - self.engines.as_ref().unwrap().engines.clone(), + self.tablet_registry.clone().unwrap(), + self.kv_statistics.clone(), + self.core.config.rocksdb.titan.enabled, + self.engines.as_ref().unwrap().engines.raft.clone(), + self.raft_statistics.clone(), ); - let mut io_metrics = IOMetricsManager::new(fetcher); + let mut io_metrics = IoMetricsManager::new(fetcher); let engines_info_clone = engines_info.clone(); - self.background_worker - .spawn_interval_task(DEFAULT_METRICS_FLUSH_INTERVAL, move || { + + // region_id -> (suffix, tablet) + // `update` of EnginesResourceInfo is called perodically which needs this map + // for recording the latest tablet for each region. + // `cached_latest_tablets` is passed to `update` to avoid memory + // allocation each time when calling `update`. 
+ let mut cached_latest_tablets = HashMap::default(); + self.core.background_worker.spawn_interval_task( + DEFAULT_METRICS_FLUSH_INTERVAL, + move || { let now = Instant::now(); engine_metrics.flush(now); io_metrics.flush(now); - engines_info_clone.update(now); - }); + engines_info_clone.update(now, &mut cached_latest_tablets); + }, + ); if let Some(limiter) = get_io_rate_limiter() { limiter.set_low_priority_io_adjustor_if_needed(Some(engines_info)); } @@ -1209,27 +1194,44 @@ impl TiKvServer { let mut mem_trace_metrics = MemoryTraceManager::default(); mem_trace_metrics.register_provider(MEMTRACE_RAFTSTORE.clone()); mem_trace_metrics.register_provider(MEMTRACE_COPROCESSOR.clone()); - self.background_worker - .spawn_interval_task(DEFAULT_MEMTRACE_FLUSH_INTERVAL, move || { + self.core.background_worker.spawn_interval_task( + DEFAULT_MEMTRACE_FLUSH_INTERVAL, + move || { let now = Instant::now(); mem_trace_metrics.flush(now); - }); + }, + ); } fn init_storage_stats_task(&self, engines: Engines) { - let config_disk_capacity: u64 = self.config.raft_store.capacity.0; - let data_dir = self.config.storage.data_dir.clone(); - let store_path = self.store_path.clone(); + let config_disk_capacity: u64 = self.core.config.raft_store.capacity.0; + let data_dir = self.core.config.storage.data_dir.clone(); + let store_path = self.core.store_path.clone(); let snap_mgr = self.snap_mgr.clone().unwrap(); let reserve_space = disk::get_disk_reserved_space(); - if reserve_space == 0 { + let reserve_raft_space = disk::get_raft_disk_reserved_space(); + if reserve_space == 0 && reserve_raft_space == 0 { info!("disk space checker not enabled"); return; } + let raft_path = engines.raft.get_engine_path().to_string(); + let separated_raft_mount_path = + path_in_diff_mount_point(raft_path.as_str(), engines.kv.path()); + let raft_almost_full_threshold = reserve_raft_space; + let raft_already_full_threshold = reserve_raft_space / 2; let almost_full_threshold = reserve_space; let already_full_threshold 
= reserve_space / 2; - self.background_worker + fn calculate_disk_usage(a: disk::DiskUsage, b: disk::DiskUsage) -> disk::DiskUsage { + match (a, b) { + (disk::DiskUsage::AlreadyFull, _) => disk::DiskUsage::AlreadyFull, + (_, disk::DiskUsage::AlreadyFull) => disk::DiskUsage::AlreadyFull, + (disk::DiskUsage::AlmostFull, _) => disk::DiskUsage::AlmostFull, + (_, disk::DiskUsage::AlmostFull) => disk::DiskUsage::AlmostFull, + (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal, + } + } + self.core.background_worker .spawn_interval_task(DEFAULT_STORAGE_STATS_INTERVAL, move || { let disk_stats = match fs2::statvfs(&store_path) { Err(e) => { @@ -1255,14 +1257,45 @@ impl TiKvServer { .get_engine_size() .expect("get raft engine size"); + let mut raft_disk_status = disk::DiskUsage::Normal; + if separated_raft_mount_path && reserve_raft_space != 0 { + let raft_disk_stats = match fs2::statvfs(&raft_path) { + Err(e) => { + error!( + "get disk stat for raft engine failed"; + "raft engine path" => raft_path.clone(), + "err" => ?e + ); + return; + } + Ok(stats) => stats, + }; + let raft_disk_cap = raft_disk_stats.total_space(); + let mut raft_disk_available = + raft_disk_cap.checked_sub(raft_size).unwrap_or_default(); + raft_disk_available = cmp::min(raft_disk_available, raft_disk_stats.available_space()); + raft_disk_status = if raft_disk_available <= raft_already_full_threshold + { + disk::DiskUsage::AlreadyFull + } else if raft_disk_available <= raft_almost_full_threshold + { + disk::DiskUsage::AlmostFull + } else { + disk::DiskUsage::Normal + }; + } let placeholer_file_path = PathBuf::from_str(&data_dir) .unwrap() .join(Path::new(file_system::SPACE_PLACEHOLDER_FILE)); let placeholder_size: u64 = - file_system::get_file_size(&placeholer_file_path).unwrap_or(0); + file_system::get_file_size(placeholer_file_path).unwrap_or(0); - let used_size = snap_size + kv_size + raft_size + placeholder_size; + let used_size = if !separated_raft_mount_path { + snap_size 
+ kv_size + raft_size + placeholder_size + } else { + snap_size + kv_size + placeholder_size + }; let capacity = if config_disk_capacity == 0 || disk_cap < config_disk_capacity { disk_cap } else { @@ -1273,18 +1306,22 @@ impl TiKvServer { available = cmp::min(available, disk_stats.available_space()); let prev_disk_status = disk::get_disk_status(0); //0 no need care about failpoint. - let cur_disk_status = if available <= already_full_threshold { + let cur_kv_disk_status = if available <= already_full_threshold { disk::DiskUsage::AlreadyFull } else if available <= almost_full_threshold { disk::DiskUsage::AlmostFull } else { disk::DiskUsage::Normal }; + let cur_disk_status = calculate_disk_usage(raft_disk_status, cur_kv_disk_status); if prev_disk_status != cur_disk_status { warn!( - "disk usage {:?}->{:?}, available={},snap={},kv={},raft={},capacity={}", + "disk usage {:?}->{:?} (raft engine usage: {:?}, kv engine usage: {:?}), seperated raft mount={}, kv available={}, snap={}, kv={}, raft={}, capacity={}", prev_disk_status, cur_disk_status, + raft_disk_status, + cur_kv_disk_status, + separated_raft_mount_path, available, snap_size, kv_size, @@ -1298,6 +1335,7 @@ impl TiKvServer { fn init_sst_recovery_sender(&mut self) -> Option> { if !self + .core .config .storage .background_error_recovery_window @@ -1320,20 +1358,21 @@ impl TiKvServer { .unwrap_or_else(|e| fatal!("failed to build server: {}", e)); server .server - .start(server_config, self.security_mgr.clone()) + .start(server_config, self.security_mgr.clone(), NoSnapshotCache) .unwrap_or_else(|e| fatal!("failed to start server: {}", e)); } fn run_status_server(&mut self) { // Create a status server. 
- let status_enabled = !self.config.server.status_addr.is_empty(); + let status_enabled = !self.core.config.server.status_addr.is_empty(); if status_enabled { let mut status_server = match StatusServer::new( - self.config.server.status_thread_pool_size, + self.core.config.server.status_thread_pool_size, self.cfg_controller.take().unwrap(), - Arc::new(self.config.security.clone()), - self.router.clone(), - self.store_path.clone(), + Arc::new(self.core.config.security.clone()), + self.engines.as_ref().unwrap().engine.raft_extension(), + self.core.store_path.clone(), + self.resource_manager.clone(), ) { Ok(status_server) => Box::new(status_server), Err(e) => { @@ -1342,10 +1381,10 @@ impl TiKvServer { } }; // Start the status server. - if let Err(e) = status_server.start(self.config.server.status_addr.clone()) { + if let Err(e) = status_server.start(self.core.config.server.status_addr.clone()) { error_unknown!(%e; "failed to bind addr for status service"); } else { - self.to_stop.push(status_server); + self.core.to_stop.push(status_server); } } } @@ -1367,157 +1406,74 @@ impl TiKvServer { sst_worker.stop_worker(); } - self.to_stop.into_iter().for_each(|s| s.stop()); - } -} - -pub trait ConfiguredRaftEngine: RaftEngine { - fn build( - _: &TiKvConfig, - _: &Arc, - _: &Option>, - _: &Option, - ) -> Self; - fn as_rocks_engine(&self) -> Option<&RocksEngine> { - None + self.core.to_stop.into_iter().for_each(|s| s.stop()); } - fn register_config(&self, _cfg_controller: &mut ConfigController, _share_cache: bool) {} } -impl ConfiguredRaftEngine for RocksEngine { - fn build( - config: &TiKvConfig, - env: &Arc, - key_manager: &Option>, - block_cache: &Option, - ) -> Self { - let mut raft_data_state_machine = RaftDataStateMachine::new( - &config.storage.data_dir, - &config.raft_engine.config().dir, - &config.raft_store.raftdb_path, - ); - let should_dump = raft_data_state_machine.before_open_target(); - - let raft_db_path = &config.raft_store.raftdb_path; - let config_raftdb = 
&config.raftdb; - let mut raft_db_opts = config_raftdb.build_opt(); - raft_db_opts.set_env(env.clone()); - let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); - let raftdb = - engine_rocks::raw_util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) - .expect("failed to open raftdb"); - let mut raftdb = RocksEngine::from_db(Arc::new(raftdb)); - raftdb.set_shared_block_cache(block_cache.is_some()); - - if should_dump { - let raft_engine = - RaftLogEngine::new(config.raft_engine.config(), key_manager.clone(), None) - .expect("failed to open raft engine for migration"); - dump_raft_engine_to_raftdb(&raft_engine, &raftdb, 8 /*threads*/); - raft_data_state_machine.after_dump_data(); - } - raftdb - } - - fn as_rocks_engine(&self) -> Option<&RocksEngine> { - Some(self) - } - - fn register_config(&self, cfg_controller: &mut ConfigController, share_cache: bool) { - cfg_controller.register( - tikv::config::Module::Raftdb, - Box::new(DBConfigManger::new(self.clone(), DBType::Raft, share_cache)), - ); - } -} - -impl ConfiguredRaftEngine for RaftLogEngine { - fn build( - config: &TiKvConfig, - env: &Arc, - key_manager: &Option>, - block_cache: &Option, - ) -> Self { - let mut raft_data_state_machine = RaftDataStateMachine::new( - &config.storage.data_dir, - &config.raft_store.raftdb_path, - &config.raft_engine.config().dir, - ); - let should_dump = raft_data_state_machine.before_open_target(); - - let raft_config = config.raft_engine.config(); - let raft_engine = - RaftLogEngine::new(raft_config, key_manager.clone(), get_io_rate_limiter()) - .expect("failed to open raft engine"); - - if should_dump { - let config_raftdb = &config.raftdb; - let mut raft_db_opts = config_raftdb.build_opt(); - raft_db_opts.set_env(env.clone()); - let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); - let raftdb = engine_rocks::raw_util::new_engine_opt( - &config.raft_store.raftdb_path, - raft_db_opts, - raft_cf_opts, - ) - .expect("failed to open raftdb for migration"); - 
let raftdb = RocksEngine::from_db(Arc::new(raftdb)); - dump_raftdb_to_raft_engine(&raftdb, &raft_engine, 8 /*threads*/); - raft_data_state_machine.after_dump_data(); - } - raft_engine - } -} - -impl TiKvServer { +impl TikvServer { fn init_raw_engines( &mut self, flow_listener: engine_rocks::FlowListener, ) -> (Engines, Arc) { - let block_cache = self.config.storage.block_cache.build_shared_cache(); + let block_cache = self + .core + .config + .storage + .block_cache + .build_shared_cache(self.core.config.storage.engine); let env = self + .core .config - .build_shared_rocks_env(self.encryption_key_manager.clone(), get_io_rate_limiter()) + .build_shared_rocks_env( + self.core.encryption_key_manager.clone(), + get_io_rate_limiter(), + ) .unwrap(); // Create raft engine - let raft_engine = CER::build( - &self.config, + let (raft_engine, raft_statistics) = CER::build( + &self.core.config, &env, - &self.encryption_key_manager, + &self.core.encryption_key_manager, &block_cache, ); + self.raft_statistics = raft_statistics; // Create kv engine. 
- let mut builder = KvEngineFactoryBuilder::new(env, &self.config, &self.store_path) - .compaction_filter_router(self.router.clone()) + let builder = KvEngineFactoryBuilder::new(env, &self.core.config, block_cache) + .compaction_event_sender(Arc::new(RaftRouterCompactedEventSender { + router: Mutex::new(self.router.clone()), + })) .region_info_accessor(self.region_info_accessor.clone()) .sst_recovery_sender(self.init_sst_recovery_sender()) .flow_listener(flow_listener); - if let Some(cache) = block_cache { - builder = builder.block_cache(cache); - } - let factory = builder.build(); + let factory = Box::new(builder.build()); let kv_engine = factory - .create_tablet() + .create_shared_db(&self.core.store_path) .unwrap_or_else(|s| fatal!("failed to create kv engine: {}", s)); - let engines = Engines::new(kv_engine, raft_engine); + self.kv_statistics = Some(factory.rocks_statistics()); + let engines = Engines::new(kv_engine.clone(), raft_engine); let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( tikv::config::Module::Rocksdb, - Box::new(DBConfigManger::new( - engines.kv.clone(), - DBType::Kv, - self.config.storage.block_cache.shared, - )), + Box::new(DbConfigManger::new(kv_engine.clone(), DbType::Kv)), ); - engines - .raft - .register_config(cfg_controller, self.config.storage.block_cache.shared); + let reg = TabletRegistry::new( + Box::new(SingletonFactory::new(kv_engine)), + &self.core.store_path, + ) + .unwrap(); + // It always use the singleton kv_engine, use arbitrary id and suffix. 
+ let ctx = TabletContext::with_infinite_region(0, Some(0)); + reg.load(ctx, false).unwrap(); + self.tablet_registry = Some(reg.clone()); + engines.raft.register_config(cfg_controller); let engines_info = Arc::new(EnginesResourceInfo::new( - &engines, 180, /*max_samples_to_preserve*/ + reg, + engines.raft.as_rocks_engine().cloned(), + 180, // max_samples_to_preserve )); (engines, engines_info) @@ -1553,190 +1509,89 @@ fn pre_start() { } } -fn check_system_config(config: &TiKvConfig) { - info!("beginning system configuration check"); - let mut rocksdb_max_open_files = config.rocksdb.max_open_files; - if config.rocksdb.titan.enabled { - // Titan engine maintains yet another pool of blob files and uses the same max - // number of open files setup as rocksdb does. So we double the max required - // open files here - rocksdb_max_open_files *= 2; - } - if let Err(e) = tikv_util::config::check_max_open_fds( - RESERVED_OPEN_FDS + (rocksdb_max_open_files + config.raftdb.max_open_files) as u64, - ) { - fatal!("{}", e); - } - - // Check RocksDB data dir - if let Err(e) = tikv_util::config::check_data_dir(&config.storage.data_dir) { - warn!( - "check: rocksdb-data-dir"; - "path" => &config.storage.data_dir, - "err" => %e - ); - } - // Check raft data dir - if let Err(e) = tikv_util::config::check_data_dir(&config.raft_store.raftdb_path) { - warn!( - "check: raftdb-path"; - "path" => &config.raft_store.raftdb_path, - "err" => %e - ); - } -} - -fn try_lock_conflict_addr>(path: P) -> File { - let f = File::create(path.as_ref()).unwrap_or_else(|e| { - fatal!( - "failed to create lock at {}: {}", - path.as_ref().display(), - e - ) - }); - - if f.try_lock_exclusive().is_err() { - fatal!( - "{} already in use, maybe another instance is binding with this address.", - path.as_ref().file_name().unwrap().to_str().unwrap() - ); - } - f -} - -#[cfg(unix)] -fn get_lock_dir() -> String { - format!("{}_TIKV_LOCK_FILES", unsafe { libc::getuid() }) -} - -#[cfg(not(unix))] -fn get_lock_dir() -> 
String { - "TIKV_LOCK_FILES".to_owned() -} - -/// A small trait for components which can be trivially stopped. Lets us keep -/// a list of these in `TiKV`, rather than storing each component individually. -trait Stop { - fn stop(self: Box); -} +#[cfg(test)] +mod test { + use std::{collections::HashMap, sync::Arc}; + + use engine_rocks::raw::Env; + use engine_traits::{ + FlowControlFactorsExt, MiscExt, SyncMutable, TabletContext, TabletRegistry, CF_DEFAULT, + }; + use tempfile::Builder; + use tikv::{config::TikvConfig, server::KvEngineFactoryBuilder}; + use tikv_util::{config::ReadableSize, time::Instant}; + + use super::EnginesResourceInfo; + + #[test] + fn test_engines_resource_info_update() { + let mut config = TikvConfig::default(); + config.rocksdb.defaultcf.disable_auto_compactions = true; + config.rocksdb.defaultcf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); + config.rocksdb.writecf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); + config.rocksdb.lockcf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); + let env = Arc::new(Env::default()); + let path = Builder::new().prefix("test-update").tempdir().unwrap(); + let cache = config + .storage + .block_cache + .build_shared_cache(config.storage.engine); -impl Stop for StatusServer -where - E: 'static, - R: 'static + Send, -{ - fn stop(self: Box) { - (*self).stop() - } -} + let factory = KvEngineFactoryBuilder::new(env, &config, cache).build(); + let reg = TabletRegistry::new(Box::new(factory), path.path().join("tablets")).unwrap(); -impl Stop for Worker { - fn stop(self: Box) { - Worker::stop(&self); - } -} + for i in 1..6 { + let ctx = TabletContext::with_infinite_region(i, Some(10)); + reg.load(ctx, true).unwrap(); + } -impl Stop for LazyWorker { - fn stop(self: Box) { - self.stop_worker(); - } -} + let mut cached = reg.get(1).unwrap(); + let mut tablet = cached.latest().unwrap(); + // Prepare some data for two tablets of the same region. 
So we can test whether + // we fetch the bytes from the latest one. + for i in 1..21 { + tablet.put_cf(CF_DEFAULT, b"key", b"val").unwrap(); + if i % 2 == 0 { + tablet.flush_cf(CF_DEFAULT, true).unwrap(); + } + } + let old_pending_compaction_bytes = tablet + .get_cf_pending_compaction_bytes(CF_DEFAULT) + .unwrap() + .unwrap(); -pub struct EngineMetricsManager { - engines: Engines, - last_reset: Instant, -} + let ctx = TabletContext::with_infinite_region(1, Some(20)); + reg.load(ctx, true).unwrap(); + tablet = cached.latest().unwrap(); -impl EngineMetricsManager { - pub fn new(engines: Engines) -> Self { - EngineMetricsManager { - engines, - last_reset: Instant::now(), + for i in 1..11 { + tablet.put_cf(CF_DEFAULT, b"key", b"val").unwrap(); + if i % 2 == 0 { + tablet.flush_cf(CF_DEFAULT, true).unwrap(); + } } - } + let new_pending_compaction_bytes = tablet + .get_cf_pending_compaction_bytes(CF_DEFAULT) + .unwrap() + .unwrap(); - pub fn flush(&mut self, now: Instant) { - KvEngine::flush_metrics(&self.engines.kv, "kv"); - self.engines.raft.flush_metrics("raft"); - if now.saturating_duration_since(self.last_reset) >= DEFAULT_ENGINE_METRICS_RESET_INTERVAL { - KvEngine::reset_statistics(&self.engines.kv); - self.engines.raft.reset_statistics(); - self.last_reset = now; - } - } -} + assert!(old_pending_compaction_bytes > new_pending_compaction_bytes); -pub struct EnginesResourceInfo { - kv_engine: RocksEngine, - raft_engine: Option, - latest_normalized_pending_bytes: AtomicU32, - normalized_pending_bytes_collector: MovingAvgU32, -} + let engines_info = Arc::new(EnginesResourceInfo::new(reg, None, 10)); -impl EnginesResourceInfo { - const SCALE_FACTOR: u64 = 100; - - fn new( - engines: &Engines, - max_samples_to_preserve: usize, - ) -> Self { - let raft_engine = engines.raft.as_rocks_engine().cloned(); - EnginesResourceInfo { - kv_engine: engines.kv.clone(), - raft_engine, - latest_normalized_pending_bytes: AtomicU32::new(0), - normalized_pending_bytes_collector: 
MovingAvgU32::new(max_samples_to_preserve), - } - } + let mut cached_latest_tablets = HashMap::default(); + engines_info.update(Instant::now(), &mut cached_latest_tablets); - pub fn update(&self, _now: Instant) { - let mut normalized_pending_bytes = 0; - - fn fetch_engine_cf(engine: &RocksEngine, cf: &str, normalized_pending_bytes: &mut u32) { - if let Ok(cf_opts) = engine.get_options_cf(cf) { - if let Ok(Some(b)) = engine.get_cf_pending_compaction_bytes(cf) { - if cf_opts.get_soft_pending_compaction_bytes_limit() > 0 { - *normalized_pending_bytes = std::cmp::max( - *normalized_pending_bytes, - (b * EnginesResourceInfo::SCALE_FACTOR - / cf_opts.get_soft_pending_compaction_bytes_limit()) - as u32, - ); - } - } - } - } + // The memory allocation should be reserved + assert!(cached_latest_tablets.capacity() >= 5); + // The tablet cache should be cleared + assert!(cached_latest_tablets.is_empty()); - if let Some(raft_engine) = &self.raft_engine { - fetch_engine_cf(raft_engine, CF_DEFAULT, &mut normalized_pending_bytes); - } - for cf in &[CF_DEFAULT, CF_WRITE, CF_LOCK] { - fetch_engine_cf(&self.kv_engine, cf, &mut normalized_pending_bytes); - } - let (_, avg) = self - .normalized_pending_bytes_collector - .add(normalized_pending_bytes); - self.latest_normalized_pending_bytes.store( - std::cmp::max(normalized_pending_bytes, avg), - Ordering::Relaxed, + // The latest_normalized_pending_bytes should be equal to the pending compaction + // bytes of tablet_1_20 + assert_eq!( + (new_pending_compaction_bytes * 100) as u32, + engines_info.latest_normalized_pending_bytes() ); } } - -impl IOBudgetAdjustor for EnginesResourceInfo { - fn adjust(&self, total_budgets: usize) -> usize { - let score = self.latest_normalized_pending_bytes.load(Ordering::Relaxed) as f32 - / Self::SCALE_FACTOR as f32; - // Two reasons for adding `sqrt` on top: - // 1) In theory the convergence point is independent of the value of pending - // bytes (as long as backlog generating rate equals consuming 
rate, which is - // determined by compaction budgets), a convex helps reach that point while - // maintaining low level of pending bytes. - // 2) Variance of compaction pending bytes grows with its magnitude, a filter - // with decreasing derivative can help balance such trend. - let score = score.sqrt(); - // The target global write flow slides between Bandwidth / 2 and Bandwidth. - let score = 0.5 + score / 2.0; - (total_budgets as f32 * score) as usize - } -} diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs new file mode 100644 index 00000000000..4d1a9f2daf6 --- /dev/null +++ b/components/server/src/server2.rs @@ -0,0 +1,1436 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module startups all the components of a TiKV server. +//! +//! It is responsible for reading from configs, starting up the various server +//! components, and handling errors (mostly by aborting and reporting to the +//! user). +//! +//! The entry point is `run_tikv`. +//! +//! Components are often used to initialize other components, and/or must be +//! explicitly stopped. We keep these components in the `TikvServer` struct. 
+ +use std::{ + cmp, + collections::HashMap, + marker::PhantomData, + path::{Path, PathBuf}, + str::FromStr, + sync::{atomic::AtomicU64, mpsc, Arc}, + time::Duration, + u64, +}; + +use api_version::{dispatch_api_version, KvFormat}; +use backup_stream::{ + config::BackupStreamConfigManager, metadata::store::PdStore, observer::BackupStreamObserver, + BackupStreamResolver, +}; +use causal_ts::CausalTsProviderImpl; +use cdc::{CdcConfigManager, MemoryQuota}; +use concurrency_manager::ConcurrencyManager; +use engine_rocks::{from_rocks_compression_type, RocksEngine, RocksStatistics}; +use engine_traits::{Engines, KvEngine, MiscExt, RaftEngine, TabletRegistry, CF_DEFAULT, CF_WRITE}; +use file_system::{get_io_rate_limiter, BytesFetcher, MetricsManager as IoMetricsManager}; +use futures::executor::block_on; +use grpcio::{EnvBuilder, Environment}; +use grpcio_health::HealthService; +use kvproto::{ + brpb::create_backup, cdcpb_grpc::create_change_data, deadlock::create_deadlock, + diagnosticspb::create_diagnostics, import_sstpb_grpc::create_import_sst, kvrpcpb::ApiVersion, + logbackuppb::create_log_backup, resource_usage_agent::create_resource_metering_pub_sub, +}; +use pd_client::{ + meta_storage::{Checked, Sourced}, + PdClient, RpcClient, +}; +use raft_log_engine::RaftLogEngine; +use raftstore::{ + coprocessor::{ + BoxConsistencyCheckObserver, ConsistencyCheckMethod, CoprocessorHost, + RawConsistencyCheckObserver, + }, + store::{ + memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, AutoSplitController, CheckLeaderRunner, + SplitConfigManager, TabletSnapManager, + }, + RegionInfoAccessor, +}; +use raftstore_v2::{router::RaftRouter, StateStorage}; +use resource_control::{ + ResourceGroupManager, ResourceManagerService, MIN_PRIORITY_UPDATE_INTERVAL, +}; +use security::SecurityManager; +use tikv::{ + config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, + coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, + coprocessor_v2, + 
import::{ImportSstService, SstImporter}, + read_pool::{ + build_yatp_read_pool, ReadPool, ReadPoolConfigManager, UPDATE_EWMA_TIME_SLICE_INTERVAL, + }, + server::{ + config::{Config as ServerConfig, ServerConfigManager}, + gc_worker::{AutoGcConfig, GcWorker}, + lock_manager::LockManager, + raftkv::ReplicaReadLockChecker, + resolve, + service::DiagnosticsService, + status_server::StatusServer, + KvEngineFactoryBuilder, NodeV2, RaftKv2, Server, CPU_CORES_QUOTA_GAUGE, GRPC_THREAD_PREFIX, + }, + storage::{ + self, + config::EngineType, + config_manager::StorageConfigManger, + kv::LocalTablets, + mvcc::MvccConsistencyCheckObserver, + txn::flow_controller::{FlowController, TabletFlowController}, + Engine, Storage, + }, +}; +use tikv_util::{ + check_environment_variables, + config::VersionTrack, + quota_limiter::{QuotaLimitConfigManager, QuotaLimiter}, + sys::{disk, path_in_diff_mount_point, register_memory_usage_high_water, SysQuota}, + thread_group::GroupProperties, + time::{Instant, Monitor}, + worker::{Builder as WorkerBuilder, LazyWorker, Scheduler}, + yatp_pool::CleanupMethod, + Either, +}; +use tokio::runtime::Builder; + +use crate::{ + common::{ConfiguredRaftEngine, EngineMetricsManager, EnginesResourceInfo, TikvServerCore}, + memory::*, + setup::*, + signal_handler, + tikv_util::sys::thread::ThreadBuildWrapper, +}; + +#[inline] +fn run_impl(config: TikvConfig) { + let mut tikv = TikvServer::::init::(config); + + // Must be called after `TikvServer::init`. 
+ let memory_limit = tikv.core.config.memory_usage_limit.unwrap().0; + let high_water = (tikv.core.config.memory_usage_high_water * memory_limit as f64) as u64; + register_memory_usage_high_water(high_water); + + tikv.core.check_conflict_addr(); + tikv.core.init_fs(); + tikv.core.init_yatp(); + tikv.core.init_encryption(); + let fetcher = tikv.core.init_io_utility(); + let listener = tikv.core.init_flow_receiver(); + let engines_info = tikv.init_engines(listener); + let server_config = tikv.init_servers::(); + tikv.register_services(); + tikv.init_metrics_flusher(fetcher, engines_info); + tikv.init_storage_stats_task(); + tikv.run_server(server_config); + tikv.run_status_server(); + tikv.core.init_quota_tuning_task(tikv.quota_limiter.clone()); + + // TODO: support signal dump stats + signal_handler::wait_for_signal( + None as Option>, + tikv.kv_statistics.clone(), + tikv.raft_statistics.clone(), + ); + tikv.stop(); +} + +/// Run a TiKV server. Returns when the server is shutdown by the user, in which +/// case the server will be properly stopped. +pub fn run_tikv(config: TikvConfig) { + // Sets the global logger ASAP. + // It is okay to use the config w/o `validate()`, + // because `initial_logger()` handles various conditions. + initial_logger(&config); + + // Print version information. + let build_timestamp = option_env!("TIKV_BUILD_TIME"); + tikv::log_tikv_info(build_timestamp); + + // Print resource quota. + SysQuota::log_quota(); + CPU_CORES_QUOTA_GAUGE.set(SysQuota::cpu_cores_quota()); + + // Do some prepare works before start. 
+ pre_start(); + + let _m = Monitor::default(); + + dispatch_api_version!(config.storage.api_version(), { + if !config.raft_engine.enable { + run_impl::(config) + } else { + run_impl::(config) + } + }) +} + +const DEFAULT_METRICS_FLUSH_INTERVAL: Duration = Duration::from_millis(10_000); +const DEFAULT_MEMTRACE_FLUSH_INTERVAL: Duration = Duration::from_millis(1_000); +const DEFAULT_STORAGE_STATS_INTERVAL: Duration = Duration::from_secs(1); + +/// A complete TiKV server. +struct TikvServer { + core: TikvServerCore, + cfg_controller: Option, + security_mgr: Arc, + pd_client: Arc, + router: Option>, + node: Option>, + resolver: Option, + snap_mgr: Option, // Will be filled in `init_servers`. + engines: Option>, + kv_statistics: Option>, + raft_statistics: Option>, + servers: Option>, + region_info_accessor: Option, + coprocessor_host: Option>, + concurrency_manager: ConcurrencyManager, + env: Arc, + cdc_worker: Option>>, + cdc_scheduler: Option>, + cdc_memory_quota: Option, + backup_stream_scheduler: Option>, + sst_worker: Option>>, + quota_limiter: Arc, + resource_manager: Option>, + causal_ts_provider: Option>, // used for rawkv apiv2 + tablet_registry: Option>, +} + +struct TikvEngines { + raft_engine: ER, + engine: RaftKv2, +} + +struct Servers { + lock_mgr: LockManager, + server: LocalServer, + importer: Arc, + rsmeter_pubsub_service: resource_metering::PubSubService, +} + +type LocalServer = Server>; + +impl TikvServer +where + ER: RaftEngine, +{ + fn init(mut config: TikvConfig) -> TikvServer { + tikv_util::thread_group::set_properties(Some(GroupProperties::default())); + // It is okay use pd config and security config before `init_config`, + // because these configs must be provided by command line, and only + // used during startup process. 
+ let security_mgr = Arc::new( + SecurityManager::new(&config.security) + .unwrap_or_else(|e| fatal!("failed to create security manager: {}", e)), + ); + let env = Arc::new( + EnvBuilder::new() + .cq_count(config.server.grpc_concurrency) + .name_prefix(thd_name!(GRPC_THREAD_PREFIX)) + .build(), + ); + let pd_client = TikvServerCore::connect_to_pd_cluster( + &mut config, + env.clone(), + Arc::clone(&security_mgr), + ); + + // Initialize and check config + let cfg_controller = TikvServerCore::init_config(config); + let config = cfg_controller.get_current(); + + let store_path = Path::new(&config.storage.data_dir).to_owned(); + + let thread_count = config.server.background_thread_count; + let background_worker = WorkerBuilder::new("background") + .thread_count(thread_count) + .create(); + + // Initialize concurrency manager + let latest_ts = block_on(pd_client.get_tso()).expect("failed to get timestamp from PD"); + let concurrency_manager = ConcurrencyManager::new(latest_ts); + + // use different quota for front-end and back-end requests + let quota_limiter = Arc::new(QuotaLimiter::new( + config.quota.foreground_cpu_time, + config.quota.foreground_write_bandwidth, + config.quota.foreground_read_bandwidth, + config.quota.background_cpu_time, + config.quota.background_write_bandwidth, + config.quota.background_read_bandwidth, + config.quota.max_delay_duration, + config.quota.enable_auto_tune, + )); + + let resource_manager = if config.resource_control.enabled { + let mgr = Arc::new(ResourceGroupManager::default()); + let mut resource_mgr_service = + ResourceManagerService::new(mgr.clone(), pd_client.clone()); + // spawn a task to periodically update the minimal virtual time of all resource + // groups. + let resource_mgr = mgr.clone(); + background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { + resource_mgr.advance_min_virtual_time(); + }); + // spawn a task to watch all resource groups update. 
+ background_worker.spawn_async_task(async move { + resource_mgr_service.watch_resource_groups().await; + }); + Some(mgr) + } else { + None + }; + + let mut causal_ts_provider = None; + if let ApiVersion::V2 = F::TAG { + let tso = block_on(causal_ts::BatchTsoProvider::new_opt( + pd_client.clone(), + config.causal_ts.renew_interval.0, + config.causal_ts.alloc_ahead_buffer.0, + config.causal_ts.renew_batch_min_size, + config.causal_ts.renew_batch_max_size, + )); + if let Err(e) = tso { + fatal!("Causal timestamp provider initialize failed: {:?}", e); + } + causal_ts_provider = Some(Arc::new(tso.unwrap().into())); + info!("Causal timestamp provider startup."); + } + + TikvServer { + core: TikvServerCore { + config, + store_path, + lock_files: vec![], + encryption_key_manager: None, + flow_info_sender: None, + flow_info_receiver: None, + to_stop: vec![], + background_worker, + }, + cfg_controller: Some(cfg_controller), + security_mgr, + pd_client, + router: None, + node: None, + resolver: None, + snap_mgr: None, + engines: None, + kv_statistics: None, + raft_statistics: None, + servers: None, + region_info_accessor: None, + coprocessor_host: None, + concurrency_manager, + env, + cdc_worker: None, + cdc_scheduler: None, + cdc_memory_quota: None, + backup_stream_scheduler: None, + sst_worker: None, + quota_limiter, + resource_manager, + causal_ts_provider, + tablet_registry: None, + } + } + + fn init_gc_worker(&mut self) -> GcWorker> { + let engines = self.engines.as_ref().unwrap(); + let gc_worker = GcWorker::new( + engines.engine.clone(), + self.core.flow_info_sender.take().unwrap(), + self.core.config.gc.clone(), + self.pd_client.feature_gate().clone(), + Arc::new(self.region_info_accessor.clone().unwrap()), + ); + + let cfg_controller = self.cfg_controller.as_mut().unwrap(); + cfg_controller.register( + tikv::config::Module::Gc, + Box::new(gc_worker.get_config_manager()), + ); + + gc_worker + } + + fn init_servers(&mut self) -> Arc> { + let flow_controller = 
Arc::new(FlowController::Tablet(TabletFlowController::new( + &self.core.config.storage.flow_control, + self.tablet_registry.clone().unwrap(), + self.core.flow_info_receiver.take().unwrap(), + ))); + let mut gc_worker = self.init_gc_worker(); + let ttl_checker = Box::new(LazyWorker::new("ttl-checker")); + let ttl_scheduler = ttl_checker.scheduler(); + + let cfg_controller = self.cfg_controller.as_mut().unwrap(); + + cfg_controller.register( + tikv::config::Module::Quota, + Box::new(QuotaLimitConfigManager::new(Arc::clone( + &self.quota_limiter, + ))), + ); + + cfg_controller.register(tikv::config::Module::Log, Box::new(LogConfigManager)); + + let lock_mgr = LockManager::new(&self.core.config.pessimistic_txn); + cfg_controller.register( + tikv::config::Module::PessimisticTxn, + Box::new(lock_mgr.config_manager()), + ); + lock_mgr.register_detector_role_change_observer(self.coprocessor_host.as_mut().unwrap()); + + let engines = self.engines.as_mut().unwrap(); + + let pd_worker = LazyWorker::new("pd-worker"); + let pd_sender = raftstore_v2::PdReporter::new( + pd_worker.scheduler(), + slog_global::borrow_global().new(slog::o!()), + ); + + let unified_read_pool = if self.core.config.readpool.is_unified_pool_enabled() { + let resource_ctl = self + .resource_manager + .as_ref() + .map(|m| m.derive_controller("unified-read-pool".into(), true)); + Some(build_yatp_read_pool( + &self.core.config.readpool.unified, + pd_sender.clone(), + engines.engine.clone(), + resource_ctl, + CleanupMethod::Remote(self.core.background_worker.remote()), + )) + } else { + None + }; + if let Some(unified_read_pool) = &unified_read_pool { + let handle = unified_read_pool.handle(); + self.core.background_worker.spawn_interval_task( + UPDATE_EWMA_TIME_SLICE_INTERVAL, + move || { + handle.update_ewma_time_slice(); + }, + ); + } + + // The `DebugService` and `DiagnosticsService` will share the same thread pool + let props = tikv_util::thread_group::current_properties(); + let debug_thread_pool = 
Arc::new( + Builder::new_multi_thread() + .thread_name(thd_name!("debugger")) + .worker_threads(1) + .after_start_wrapper(move || { + tikv_alloc::add_thread_memory_accessor(); + tikv_util::thread_group::set_properties(props.clone()); + }) + .before_stop_wrapper(tikv_alloc::remove_thread_memory_accessor) + .build() + .unwrap(), + ); + + // Start resource metering. + let (recorder_notifier, collector_reg_handle, resource_tag_factory, recorder_worker) = + resource_metering::init_recorder( + self.core.config.resource_metering.precision.as_millis(), + ); + self.core.to_stop.push(recorder_worker); + let (reporter_notifier, data_sink_reg_handle, reporter_worker) = + resource_metering::init_reporter( + self.core.config.resource_metering.clone(), + collector_reg_handle.clone(), + ); + self.core.to_stop.push(reporter_worker); + let (address_change_notifier, single_target_worker) = resource_metering::init_single_target( + self.core.config.resource_metering.receiver_address.clone(), + self.env.clone(), + data_sink_reg_handle.clone(), + ); + self.core.to_stop.push(single_target_worker); + let rsmeter_pubsub_service = resource_metering::PubSubService::new(data_sink_reg_handle); + + let cfg_manager = resource_metering::ConfigManager::new( + self.core.config.resource_metering.clone(), + recorder_notifier, + reporter_notifier, + address_change_notifier, + ); + cfg_controller.register( + tikv::config::Module::ResourceMetering, + Box::new(cfg_manager), + ); + + let storage_read_pool_handle = if self.core.config.readpool.storage.use_unified_pool() { + unified_read_pool.as_ref().unwrap().handle() + } else { + let storage_read_pools = ReadPool::from(storage::build_read_pool( + &self.core.config.readpool.storage, + pd_sender.clone(), + engines.engine.clone(), + )); + storage_read_pools.handle() + }; + + let storage = Storage::<_, _, F>::from_engine( + engines.engine.clone(), + &self.core.config.storage, + storage_read_pool_handle, + lock_mgr.clone(), + self.concurrency_manager.clone(), + 
lock_mgr.get_storage_dynamic_configs(), + flow_controller.clone(), + pd_sender.clone(), + resource_tag_factory.clone(), + Arc::clone(&self.quota_limiter), + self.pd_client.feature_gate().clone(), + self.causal_ts_provider.clone(), + self.resource_manager + .as_ref() + .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), + ) + .unwrap_or_else(|e| fatal!("failed to create raft storage: {}", e)); + cfg_controller.register( + tikv::config::Module::Storage, + Box::new(StorageConfigManger::new( + self.tablet_registry.as_ref().unwrap().clone(), + ttl_scheduler, + flow_controller, + storage.get_scheduler(), + )), + ); + + let (resolver, state) = resolve::new_resolver( + self.pd_client.clone(), + &self.core.background_worker, + storage.get_engine().raft_extension(), + ); + self.resolver = Some(resolver); + + ReplicaReadLockChecker::new(self.concurrency_manager.clone()) + .register(self.coprocessor_host.as_mut().unwrap()); + + // Create snapshot manager, server. + let snap_path = self + .core + .store_path + .join(Path::new("tablet_snap")) + .to_str() + .unwrap() + .to_owned(); + + let snap_mgr = + match TabletSnapManager::new(&snap_path, self.core.encryption_key_manager.clone()) { + Ok(mgr) => mgr, + Err(e) => fatal!("failed to create snapshot manager at {}: {}", snap_path, e), + }; + + // Create coprocessor endpoint. 
+ let cop_read_pool_handle = if self.core.config.readpool.coprocessor.use_unified_pool() { + unified_read_pool.as_ref().unwrap().handle() + } else { + let cop_read_pools = ReadPool::from(coprocessor::readpool_impl::build_read_pool( + &self.core.config.readpool.coprocessor, + pd_sender, + engines.engine.clone(), + )); + cop_read_pools.handle() + }; + + let mut unified_read_pool_scale_receiver = None; + if self.core.config.readpool.is_unified_pool_enabled() { + let (unified_read_pool_scale_notifier, rx) = mpsc::sync_channel(10); + cfg_controller.register( + tikv::config::Module::Readpool, + Box::new(ReadPoolConfigManager::new( + unified_read_pool.as_ref().unwrap().handle(), + unified_read_pool_scale_notifier, + &self.core.background_worker, + self.core.config.readpool.unified.max_thread_count, + self.core.config.readpool.unified.auto_adjust_pool_size, + )), + ); + unified_read_pool_scale_receiver = Some(rx); + } + + // Run check leader in a dedicate thread, because it is time sensitive + // and crucial to TiCDC replication lag. + let check_leader_worker = + Box::new(WorkerBuilder::new("check-leader").thread_count(1).create()); + // Create check leader runer. + let check_leader_runner = CheckLeaderRunner::new( + self.router.as_ref().unwrap().store_meta().clone(), + self.coprocessor_host.clone().unwrap(), + ); + let check_leader_scheduler = check_leader_worker.start("check-leader", check_leader_runner); + self.core.to_stop.push(check_leader_worker); + + // Create cdc worker. + let mut cdc_worker = self.cdc_worker.take().unwrap(); + let cdc_scheduler = self.cdc_scheduler.clone().unwrap(); + // Register cdc observer. + let cdc_ob = cdc::CdcObserver::new(cdc_scheduler.clone()); + cdc_ob.register_to(self.coprocessor_host.as_mut().unwrap()); + // Register cdc config manager. + cfg_controller.register( + tikv::config::Module::Cdc, + Box::new(CdcConfigManager(cdc_scheduler.clone())), + ); + // Start cdc endpoint. 
+ let cdc_memory_quota = MemoryQuota::new(self.core.config.cdc.sink_memory_quota.0 as _); + let cdc_endpoint = cdc::Endpoint::new( + self.core.config.server.cluster_id, + &self.core.config.cdc, + self.core.config.storage.engine == EngineType::RaftKv2, + self.core.config.storage.api_version(), + self.pd_client.clone(), + cdc_scheduler, + self.router.clone().unwrap(), + LocalTablets::Registry(self.tablet_registry.as_ref().unwrap().clone()), + cdc_ob, + self.router.as_ref().unwrap().store_meta().clone(), + self.concurrency_manager.clone(), + self.env.clone(), + self.security_mgr.clone(), + cdc_memory_quota.clone(), + self.causal_ts_provider.clone(), + ); + cdc_worker.start_with_timer(cdc_endpoint); + self.core.to_stop.push(cdc_worker); + self.cdc_memory_quota = Some(cdc_memory_quota); + + // Create resolved ts. + if self.core.config.resolved_ts.enable { + let mut rts_worker = Box::new(LazyWorker::new("resolved-ts")); + // Register the resolved ts observer + let resolved_ts_ob = resolved_ts::Observer::new(rts_worker.scheduler()); + resolved_ts_ob.register_to(self.coprocessor_host.as_mut().unwrap()); + // Register config manager for resolved ts worker + cfg_controller.register( + tikv::config::Module::ResolvedTs, + Box::new(resolved_ts::ResolvedTsConfigManager::new( + rts_worker.scheduler(), + )), + ); + let rts_endpoint = resolved_ts::Endpoint::new( + &self.core.config.resolved_ts, + rts_worker.scheduler(), + self.router.clone().unwrap(), + self.router.as_ref().unwrap().store_meta().clone(), + self.pd_client.clone(), + self.concurrency_manager.clone(), + self.env.clone(), + self.security_mgr.clone(), + ); + rts_worker.start_with_timer(rts_endpoint); + self.core.to_stop.push(rts_worker); + } + + // Start backup stream + self.backup_stream_scheduler = if self.core.config.log_backup.enable { + // Create backup stream. 
+ let mut backup_stream_worker = Box::new(LazyWorker::new("backup-stream")); + let backup_stream_scheduler = backup_stream_worker.scheduler(); + + // Register backup-stream observer. + let backup_stream_ob = BackupStreamObserver::new(backup_stream_scheduler.clone()); + backup_stream_ob.register_to(self.coprocessor_host.as_mut().unwrap()); + // Register config manager. + cfg_controller.register( + tikv::config::Module::BackupStream, + Box::new(BackupStreamConfigManager::new( + backup_stream_worker.scheduler(), + self.core.config.log_backup.clone(), + )), + ); + + let backup_stream_endpoint = backup_stream::Endpoint::new( + self.node.as_ref().unwrap().id(), + PdStore::new(Checked::new(Sourced::new( + Arc::clone(&self.pd_client), + pd_client::meta_storage::Source::LogBackup, + ))), + self.core.config.log_backup.clone(), + backup_stream_scheduler.clone(), + backup_stream_ob, + self.region_info_accessor.as_ref().unwrap().clone(), + self.router.clone().unwrap(), + self.pd_client.clone(), + self.concurrency_manager.clone(), + BackupStreamResolver::V2(self.router.clone().unwrap(), PhantomData), + ); + backup_stream_worker.start(backup_stream_endpoint); + self.core.to_stop.push(backup_stream_worker); + Some(backup_stream_scheduler) + } else { + None + }; + + let server_config = Arc::new(VersionTrack::new(self.core.config.server.clone())); + + self.core + .config + .raft_store + .validate( + self.core.config.coprocessor.region_split_size(), + self.core.config.coprocessor.enable_region_bucket(), + self.core.config.coprocessor.region_bucket_size, + ) + .unwrap_or_else(|e| fatal!("failed to validate raftstore config {}", e)); + let raft_store = Arc::new(VersionTrack::new(self.core.config.raft_store.clone())); + let health_service = HealthService::default(); + + let node = self.node.as_ref().unwrap(); + + self.snap_mgr = Some(snap_mgr.clone()); + // Create server + let server = Server::new( + node.id(), + &server_config, + &self.security_mgr, + storage, + 
coprocessor::Endpoint::new( + &server_config.value(), + cop_read_pool_handle, + self.concurrency_manager.clone(), + resource_tag_factory, + self.quota_limiter.clone(), + ), + coprocessor_v2::Endpoint::new(&self.core.config.coprocessor_v2), + self.resolver.clone().unwrap(), + Either::Right(snap_mgr.clone()), + gc_worker.clone(), + check_leader_scheduler, + self.env.clone(), + unified_read_pool, + debug_thread_pool, + health_service, + self.resource_manager.clone(), + ) + .unwrap_or_else(|e| fatal!("failed to create server: {}", e)); + cfg_controller.register( + tikv::config::Module::Server, + Box::new(ServerConfigManager::new( + server.get_snap_worker_scheduler(), + server_config.clone(), + server.get_grpc_mem_quota().clone(), + )), + ); + + let import_path = self.core.store_path.join("import"); + let mut importer = SstImporter::new( + &self.core.config.import, + import_path, + self.core.encryption_key_manager.clone(), + self.core.config.storage.api_version(), + ) + .unwrap(); + for (cf_name, compression_type) in &[ + ( + CF_DEFAULT, + self.core + .config + .rocksdb + .defaultcf + .bottommost_level_compression, + ), + ( + CF_WRITE, + self.core + .config + .rocksdb + .writecf + .bottommost_level_compression, + ), + ] { + importer.set_compression_type(cf_name, from_rocks_compression_type(*compression_type)); + } + let importer = Arc::new(importer); + + // V2 starts split-check worker within raftstore. + + let split_config_manager = + SplitConfigManager::new(Arc::new(VersionTrack::new(self.core.config.split.clone()))); + cfg_controller.register( + tikv::config::Module::Split, + Box::new(split_config_manager.clone()), + ); + + let auto_split_controller = AutoSplitController::new( + split_config_manager, + self.core.config.server.grpc_concurrency, + self.core.config.readpool.unified.max_thread_count, + unified_read_pool_scale_receiver, + ); + + // `ConsistencyCheckObserver` must be registered before `Node::start`. 
+ let safe_point = Arc::new(AtomicU64::new(0)); + let observer = match self.core.config.coprocessor.consistency_check_method { + ConsistencyCheckMethod::Mvcc => BoxConsistencyCheckObserver::new( + MvccConsistencyCheckObserver::new(safe_point.clone()), + ), + ConsistencyCheckMethod::Raw => { + BoxConsistencyCheckObserver::new(RawConsistencyCheckObserver::default()) + } + }; + self.coprocessor_host + .as_mut() + .unwrap() + .registry + .register_consistency_check_observer(100, observer); + + self.node + .as_mut() + .unwrap() + .start( + engines.raft_engine.clone(), + self.tablet_registry.clone().unwrap(), + self.router.as_ref().unwrap(), + server.transport(), + snap_mgr, + self.concurrency_manager.clone(), + self.causal_ts_provider.clone(), + self.coprocessor_host.clone().unwrap(), + auto_split_controller, + collector_reg_handle, + self.core.background_worker.clone(), + pd_worker, + raft_store, + &state, + importer.clone(), + self.core.encryption_key_manager.clone(), + ) + .unwrap_or_else(|e| fatal!("failed to start node: {}", e)); + + // Start auto gc. Must after `Node::start` because `node_id` is initialized + // there. + let store_id = self.node.as_ref().unwrap().id(); + let auto_gc_config = AutoGcConfig::new( + self.pd_client.clone(), + self.region_info_accessor.clone().unwrap(), + store_id, + ); + gc_worker + .start(store_id) + .unwrap_or_else(|e| fatal!("failed to start gc worker: {}", e)); + if let Err(e) = gc_worker.start_auto_gc(auto_gc_config, safe_point) { + fatal!("failed to start auto_gc on storage, error: {}", e); + } + + initial_metric(&self.core.config.metric); + + self.servers = Some(Servers { + lock_mgr, + server, + importer, + rsmeter_pubsub_service, + }); + + server_config + } + + fn register_services(&mut self) { + let servers = self.servers.as_mut().unwrap(); + let engines = self.engines.as_ref().unwrap(); + + // Backup service. 
+ let mut backup_worker = Box::new(self.core.background_worker.lazy_build("backup-endpoint")); + let backup_scheduler = backup_worker.scheduler(); + let backup_service = backup::Service::::new(backup_scheduler); + if servers + .server + .register_service(create_backup(backup_service)) + .is_some() + { + fatal!("failed to register backup service"); + } + + let backup_endpoint = backup::Endpoint::new( + self.node.as_ref().unwrap().id(), + engines.engine.clone(), + self.region_info_accessor.clone().unwrap(), + LocalTablets::Registry(self.tablet_registry.as_ref().unwrap().clone()), + self.core.config.backup.clone(), + self.concurrency_manager.clone(), + self.core.config.storage.api_version(), + self.causal_ts_provider.clone(), + ); + self.cfg_controller.as_mut().unwrap().register( + tikv::config::Module::Backup, + Box::new(backup_endpoint.get_config_manager()), + ); + backup_worker.start(backup_endpoint); + + // Import SST service. + let import_service = ImportSstService::new( + self.core.config.import.clone(), + self.core.config.raft_store.raft_entry_max_size, + engines.engine.clone(), + LocalTablets::Registry(self.tablet_registry.as_ref().unwrap().clone()), + servers.importer.clone(), + ); + let import_cfg_mgr = import_service.get_config_manager(); + + if servers + .server + .register_service(create_import_sst(import_service)) + .is_some() + { + fatal!("failed to register import service"); + } + + if let Some(sched) = self.backup_stream_scheduler.take() { + let pitr_service = backup_stream::Service::new(sched); + if servers + .server + .register_service(create_log_backup(pitr_service)) + .is_some() + { + fatal!("failed to register log backup service"); + } + } + + self.cfg_controller + .as_mut() + .unwrap() + .register(tikv::config::Module::Import, Box::new(import_cfg_mgr)); + + let cdc_service = cdc::Service::new( + self.cdc_scheduler.as_ref().unwrap().clone(), + self.cdc_memory_quota.as_ref().unwrap().clone(), + ); + if servers + .server + 
.register_service(create_change_data(cdc_service)) + .is_some() + { + fatal!("failed to register cdc service"); + } + + // Create Diagnostics service + let diag_service = DiagnosticsService::new( + servers.server.get_debug_thread_pool().clone(), + self.core.config.log.file.filename.clone(), + self.core.config.slow_log_file.clone(), + ); + if servers + .server + .register_service(create_diagnostics(diag_service)) + .is_some() + { + fatal!("failed to register diagnostics service"); + } + + // Lock manager. + if servers + .server + .register_service(create_deadlock(servers.lock_mgr.deadlock_service())) + .is_some() + { + fatal!("failed to register deadlock service"); + } + + servers + .lock_mgr + .start( + self.node.as_ref().unwrap().id(), + self.pd_client.clone(), + self.resolver.clone().unwrap(), + self.security_mgr.clone(), + &self.core.config.pessimistic_txn, + ) + .unwrap_or_else(|e| fatal!("failed to start lock manager: {}", e)); + + if servers + .server + .register_service(create_resource_metering_pub_sub( + servers.rsmeter_pubsub_service.clone(), + )) + .is_some() + { + warn!("failed to register resource metering pubsub service"); + } + } + + fn init_metrics_flusher( + &mut self, + fetcher: BytesFetcher, + engines_info: Arc, + ) { + let mut engine_metrics = EngineMetricsManager::::new( + self.tablet_registry.clone().unwrap(), + self.kv_statistics.clone(), + self.core.config.rocksdb.titan.enabled, + self.engines.as_ref().unwrap().raft_engine.clone(), + self.raft_statistics.clone(), + ); + let mut io_metrics = IoMetricsManager::new(fetcher); + let engines_info_clone = engines_info.clone(); + + // region_id -> (suffix, tablet) + // `update` of EnginesResourceInfo is called perodically which needs this map + // for recording the latest tablet for each region. + // `cached_latest_tablets` is passed to `update` to avoid memory + // allocation each time when calling `update`. 
+ let mut cached_latest_tablets = HashMap::default(); + self.core.background_worker.spawn_interval_task( + DEFAULT_METRICS_FLUSH_INTERVAL, + move || { + let now = Instant::now(); + engine_metrics.flush(now); + io_metrics.flush(now); + engines_info_clone.update(now, &mut cached_latest_tablets); + }, + ); + if let Some(limiter) = get_io_rate_limiter() { + limiter.set_low_priority_io_adjustor_if_needed(Some(engines_info)); + } + + let mut mem_trace_metrics = MemoryTraceManager::default(); + mem_trace_metrics.register_provider(MEMTRACE_RAFTSTORE.clone()); + mem_trace_metrics.register_provider(MEMTRACE_COPROCESSOR.clone()); + self.core.background_worker.spawn_interval_task( + DEFAULT_MEMTRACE_FLUSH_INTERVAL, + move || { + let now = Instant::now(); + mem_trace_metrics.flush(now); + }, + ); + } + + fn init_storage_stats_task(&self) { + let config_disk_capacity: u64 = self.core.config.raft_store.capacity.0; + let data_dir = self.core.config.storage.data_dir.clone(); + let store_path = self.core.store_path.clone(); + let snap_mgr = self.snap_mgr.clone().unwrap(); + let reserve_space = disk::get_disk_reserved_space(); + let reserve_raft_space = disk::get_raft_disk_reserved_space(); + if reserve_space == 0 && reserve_raft_space == 0 { + info!("disk space checker not enabled"); + return; + } + let raft_engine = self.engines.as_ref().unwrap().raft_engine.clone(); + let tablet_registry = self.tablet_registry.clone().unwrap(); + let raft_path = raft_engine.get_engine_path().to_string(); + let separated_raft_mount_path = + path_in_diff_mount_point(raft_path.as_str(), tablet_registry.tablet_root()); + let raft_almost_full_threshold = reserve_raft_space; + let raft_already_full_threshold = reserve_raft_space / 2; + + let almost_full_threshold = reserve_space; + let already_full_threshold = reserve_space / 2; + fn calculate_disk_usage(a: disk::DiskUsage, b: disk::DiskUsage) -> disk::DiskUsage { + match (a, b) { + (disk::DiskUsage::AlreadyFull, _) => disk::DiskUsage::AlreadyFull, + 
(_, disk::DiskUsage::AlreadyFull) => disk::DiskUsage::AlreadyFull, + (disk::DiskUsage::AlmostFull, _) => disk::DiskUsage::AlmostFull, + (_, disk::DiskUsage::AlmostFull) => disk::DiskUsage::AlmostFull, + (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal, + } + } + self.core.background_worker + .spawn_interval_task(DEFAULT_STORAGE_STATS_INTERVAL, move || { + let disk_stats = match fs2::statvfs(&store_path) { + Err(e) => { + error!( + "get disk stat for kv store failed"; + "kv path" => store_path.to_str(), + "err" => ?e + ); + return; + } + Ok(stats) => stats, + }; + let disk_cap = disk_stats.total_space(); + let snap_size = snap_mgr.total_snap_size().unwrap(); + + let mut kv_size = 0; + tablet_registry.for_each_opened_tablet(|_, cached| { + if let Some(tablet) = cached.latest() { + kv_size += tablet.get_engine_used_size().unwrap_or(0); + } + true + }); + + let raft_size = raft_engine + .get_engine_size() + .expect("get raft engine size"); + + let mut raft_disk_status = disk::DiskUsage::Normal; + if separated_raft_mount_path && reserve_raft_space != 0 { + let raft_disk_stats = match fs2::statvfs(&raft_path) { + Err(e) => { + error!( + "get disk stat for raft engine failed"; + "raft engine path" => raft_path.clone(), + "err" => ?e + ); + return; + } + Ok(stats) => stats, + }; + let raft_disk_cap = raft_disk_stats.total_space(); + let mut raft_disk_available = + raft_disk_cap.checked_sub(raft_size).unwrap_or_default(); + raft_disk_available = cmp::min(raft_disk_available, raft_disk_stats.available_space()); + raft_disk_status = if raft_disk_available <= raft_already_full_threshold + { + disk::DiskUsage::AlreadyFull + } else if raft_disk_available <= raft_almost_full_threshold + { + disk::DiskUsage::AlmostFull + } else { + disk::DiskUsage::Normal + }; + } + let placeholer_file_path = PathBuf::from_str(&data_dir) + .unwrap() + .join(Path::new(file_system::SPACE_PLACEHOLDER_FILE)); + + let placeholder_size: u64 = + 
file_system::get_file_size(placeholer_file_path).unwrap_or(0); + + let used_size = if !separated_raft_mount_path { + snap_size + kv_size + raft_size + placeholder_size + } else { + snap_size + kv_size + placeholder_size + }; + let capacity = if config_disk_capacity == 0 || disk_cap < config_disk_capacity { + disk_cap + } else { + config_disk_capacity + }; + + let mut available = capacity.checked_sub(used_size).unwrap_or_default(); + available = cmp::min(available, disk_stats.available_space()); + + let prev_disk_status = disk::get_disk_status(0); //0 no need care about failpoint. + let cur_kv_disk_status = if available <= already_full_threshold { + disk::DiskUsage::AlreadyFull + } else if available <= almost_full_threshold { + disk::DiskUsage::AlmostFull + } else { + disk::DiskUsage::Normal + }; + let cur_disk_status = calculate_disk_usage(raft_disk_status, cur_kv_disk_status); + if prev_disk_status != cur_disk_status { + warn!( + "disk usage {:?}->{:?} (raft engine usage: {:?}, kv engine usage: {:?}), seperated raft mount={}, kv available={}, snap={}, kv={}, raft={}, capacity={}", + prev_disk_status, + cur_disk_status, + raft_disk_status, + cur_kv_disk_status, + separated_raft_mount_path, + available, + snap_size, + kv_size, + raft_size, + capacity + ); + } + disk::set_disk_status(cur_disk_status); + }) + } + + fn init_sst_recovery_sender(&mut self) -> Option> { + if !self + .core + .config + .storage + .background_error_recovery_window + .is_zero() + { + let sst_worker = Box::new(LazyWorker::new("sst-recovery")); + let scheduler = sst_worker.scheduler(); + self.sst_worker = Some(sst_worker); + Some(scheduler) + } else { + None + } + } + + fn run_server(&mut self, server_config: Arc>) { + let server = self.servers.as_mut().unwrap(); + server + .server + .build_and_bind() + .unwrap_or_else(|e| fatal!("failed to build server: {}", e)); + server + .server + .start( + server_config, + self.security_mgr.clone(), + self.tablet_registry.clone().unwrap(), + ) + 
.unwrap_or_else(|e| fatal!("failed to start server: {}", e)); + } + + fn run_status_server(&mut self) { + // Create a status server. + let status_enabled = !self.core.config.server.status_addr.is_empty(); + if status_enabled { + let mut status_server = match StatusServer::new( + self.core.config.server.status_thread_pool_size, + self.cfg_controller.take().unwrap(), + Arc::new(self.core.config.security.clone()), + self.engines.as_ref().unwrap().engine.raft_extension(), + self.core.store_path.clone(), + self.resource_manager.clone(), + ) { + Ok(status_server) => Box::new(status_server), + Err(e) => { + error_unknown!(%e; "failed to start runtime for status service"); + return; + } + }; + // Start the status server. + if let Err(e) = status_server.start(self.core.config.server.status_addr.clone()) { + error_unknown!(%e; "failed to bind addr for status service"); + } else { + self.core.to_stop.push(status_server); + } + } + } + + fn stop(mut self) { + tikv_util::thread_group::mark_shutdown(); + let mut servers = self.servers.unwrap(); + servers + .server + .stop() + .unwrap_or_else(|e| fatal!("failed to stop server: {}", e)); + + self.node.as_mut().unwrap().stop(); + self.region_info_accessor.as_mut().unwrap().stop(); + + servers.lock_mgr.stop(); + + if let Some(sst_worker) = self.sst_worker { + sst_worker.stop_worker(); + } + + self.core.to_stop.into_iter().for_each(|s| s.stop()); + } +} + +impl TikvServer { + fn init_engines( + &mut self, + flow_listener: engine_rocks::FlowListener, + ) -> Arc { + let block_cache = self + .core + .config + .storage + .block_cache + .build_shared_cache(self.core.config.storage.engine); + let env = self + .core + .config + .build_shared_rocks_env( + self.core.encryption_key_manager.clone(), + get_io_rate_limiter(), + ) + .unwrap(); + + // Create raft engine + let (raft_engine, raft_statistics) = CER::build( + &self.core.config, + &env, + &self.core.encryption_key_manager, + &block_cache, + ); + self.raft_statistics = raft_statistics; + 
+ // Create kv engine. + let builder = KvEngineFactoryBuilder::new(env, &self.core.config, block_cache) + .sst_recovery_sender(self.init_sst_recovery_sender()) + .flow_listener(flow_listener); + + let mut node = NodeV2::new(&self.core.config.server, self.pd_client.clone(), None); + node.try_bootstrap_store(&self.core.config.raft_store, &raft_engine) + .unwrap_or_else(|e| fatal!("failed to bootstrap store: {:?}", e)); + assert_ne!(node.id(), 0); + + let router = node.router().clone(); + + // Create kv engine. + let builder = builder.state_storage(Arc::new(StateStorage::new( + raft_engine.clone(), + router.clone(), + ))); + let factory = Box::new(builder.build()); + self.kv_statistics = Some(factory.rocks_statistics()); + let registry = TabletRegistry::new(factory, self.core.store_path.join("tablets")) + .unwrap_or_else(|e| fatal!("failed to create tablet registry {:?}", e)); + let cfg_controller = self.cfg_controller.as_mut().unwrap(); + cfg_controller.register( + tikv::config::Module::Rocksdb, + Box::new(DbConfigManger::new(registry.clone(), DbType::Kv)), + ); + self.tablet_registry = Some(registry.clone()); + raft_engine.register_config(cfg_controller); + + let engines_info = Arc::new(EnginesResourceInfo::new( + registry, + raft_engine.as_rocks_engine().cloned(), + 180, // max_samples_to_preserve + )); + + let router = RaftRouter::new(node.id(), router); + let mut coprocessor_host: CoprocessorHost = CoprocessorHost::new( + router.store_router().clone(), + self.core.config.coprocessor.clone(), + ); + let region_info_accessor = RegionInfoAccessor::new(&mut coprocessor_host); + + let cdc_worker = Box::new(LazyWorker::new("cdc")); + let cdc_scheduler = cdc_worker.scheduler(); + let txn_extra_scheduler = cdc::CdcTxnExtraScheduler::new(cdc_scheduler.clone()); + + let mut engine = RaftKv2::new(router.clone(), region_info_accessor.region_leaders()); + // Set txn extra scheduler immediately to make sure every clone has the + // scheduler. 
+ engine.set_txn_extra_scheduler(Arc::new(txn_extra_scheduler)); + + self.engines = Some(TikvEngines { + raft_engine, + engine, + }); + self.router = Some(router); + self.node = Some(node); + self.coprocessor_host = Some(coprocessor_host); + self.region_info_accessor = Some(region_info_accessor); + self.cdc_worker = Some(cdc_worker); + self.cdc_scheduler = Some(cdc_scheduler); + + engines_info + } +} + +/// Various sanity-checks and logging before running a server. +/// +/// Warnings are logged. +/// +/// # Logs +/// +/// The presence of these environment variables that affect the database +/// behavior is logged. +/// +/// - `GRPC_POLL_STRATEGY` +/// - `http_proxy` and `https_proxy` +/// +/// # Warnings +/// +/// - if `net.core.somaxconn` < 32768 +/// - if `net.ipv4.tcp_syncookies` is not 0 +/// - if `vm.swappiness` is not 0 +/// - if data directories are not on SSDs +/// - if the "TZ" environment variable is not set on unix +fn pre_start() { + check_environment_variables(); + for e in tikv_util::config::check_kernel() { + warn!( + "check: kernel"; + "err" => %e + ); + } +} + +#[cfg(test)] +mod test { + use std::{collections::HashMap, sync::Arc}; + + use engine_rocks::raw::Env; + use engine_traits::{ + FlowControlFactorsExt, MiscExt, SyncMutable, TabletContext, TabletRegistry, CF_DEFAULT, + }; + use tempfile::Builder; + use tikv::{config::TikvConfig, server::KvEngineFactoryBuilder}; + use tikv_util::{config::ReadableSize, time::Instant}; + + use super::EnginesResourceInfo; + + #[test] + fn test_engines_resource_info_update() { + let mut config = TikvConfig::default(); + config.rocksdb.defaultcf.disable_auto_compactions = true; + config.rocksdb.defaultcf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); + config.rocksdb.writecf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); + config.rocksdb.lockcf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); + let env = Arc::new(Env::default()); + let path = 
Builder::new().prefix("test-update").tempdir().unwrap(); + let cache = config + .storage + .block_cache + .build_shared_cache(config.storage.engine); + + let factory = KvEngineFactoryBuilder::new(env, &config, cache).build(); + let reg = TabletRegistry::new(Box::new(factory), path.path().join("tablets")).unwrap(); + + for i in 1..6 { + let ctx = TabletContext::with_infinite_region(i, Some(10)); + reg.load(ctx, true).unwrap(); + } + + let mut cached = reg.get(1).unwrap(); + let mut tablet = cached.latest().unwrap(); + // Prepare some data for two tablets of the same region. So we can test whether + // we fetch the bytes from the latest one. + for i in 1..21 { + tablet.put_cf(CF_DEFAULT, b"key", b"val").unwrap(); + if i % 2 == 0 { + tablet.flush_cf(CF_DEFAULT, true).unwrap(); + } + } + let old_pending_compaction_bytes = tablet + .get_cf_pending_compaction_bytes(CF_DEFAULT) + .unwrap() + .unwrap(); + + let ctx = TabletContext::with_infinite_region(1, Some(20)); + reg.load(ctx, true).unwrap(); + tablet = cached.latest().unwrap(); + + for i in 1..11 { + tablet.put_cf(CF_DEFAULT, b"key", b"val").unwrap(); + if i % 2 == 0 { + tablet.flush_cf(CF_DEFAULT, true).unwrap(); + } + } + let new_pending_compaction_bytes = tablet + .get_cf_pending_compaction_bytes(CF_DEFAULT) + .unwrap() + .unwrap(); + + assert!(old_pending_compaction_bytes > new_pending_compaction_bytes); + + let engines_info = Arc::new(EnginesResourceInfo::new(reg, None, 10)); + + let mut cached_latest_tablets = HashMap::default(); + engines_info.update(Instant::now(), &mut cached_latest_tablets); + + // The memory allocation should be reserved + assert!(cached_latest_tablets.capacity() >= 5); + // The tablet cache should be cleared + assert!(cached_latest_tablets.is_empty()); + + // The latest_normalized_pending_bytes should be equal to the pending compaction + // bytes of tablet_1_20 + assert_eq!( + (new_pending_compaction_bytes * 100) as u32, + engines_info.latest_normalized_pending_bytes() + ); + } +} diff 
--git a/components/server/src/setup.rs b/components/server/src/setup.rs index 0c657733f54..5742eda8bc8 100644 --- a/components/server/src/setup.rs +++ b/components/server/src/setup.rs @@ -10,13 +10,14 @@ use std::{ use chrono::Local; use clap::ArgMatches; use collections::HashMap; -use tikv::config::{check_critical_config, persist_config, MetricConfig, TiKvConfig}; +use tikv::config::{check_critical_config, persist_config, MetricConfig, TikvConfig}; use tikv_util::{self, config, logger}; // A workaround for checking if log is initialized. pub static LOG_INITIALIZED: AtomicBool = AtomicBool::new(false); -// The info log file names does not end with ".log" since it conflict with rocksdb WAL files. +// The info log file names does not end with ".log" since it conflict with +// rocksdb WAL files. pub const DEFAULT_ROCKSDB_LOG_FILE: &str = "rocksdb.info"; pub const DEFAULT_RAFTDB_LOG_FILE: &str = "raftdb.info"; @@ -33,11 +34,12 @@ macro_rules! fatal { }) } -// TODO: There is a very small chance that duplicate files will be generated if there are -// a lot of logs written in a very short time. Consider rename the rotated file with a version -// number while rotate by size. +// TODO: There is a very small chance that duplicate files will be generated if +// there are a lot of logs written in a very short time. Consider rename the +// rotated file with a version number while rotate by size. 
// -// The file name format after rotated is as follows: "{original name}.{"%Y-%m-%dT%H-%M-%S%.3f"}" +// The file name format after rotated is as follows: +// "{original name}.{"%Y-%m-%dT%H-%M-%S%.3f"}" fn rename_by_timestamp(path: &Path) -> io::Result { let mut new_path = path.parent().unwrap().to_path_buf(); let mut new_fname = path.file_stem().unwrap().to_os_string(); @@ -72,11 +74,12 @@ fn make_engine_log_path(path: &str, sub_path: &str, filename: &str) -> String { } #[allow(dead_code)] -pub fn initial_logger(config: &TiKvConfig) { +pub fn initial_logger(config: &TikvConfig) { let rocksdb_info_log_path = if !config.rocksdb.info_log_dir.is_empty() { make_engine_log_path(&config.rocksdb.info_log_dir, "", DEFAULT_ROCKSDB_LOG_FILE) } else { - // Don't use `DEFAULT_ROCKSDB_SUB_DIR`, because of the logic of `RocksEngine::exists`. + // Don't use `DEFAULT_ROCKSDB_SUB_DIR`, because of the logic of + // `RocksEngine::exists`. make_engine_log_path(&config.storage.data_dir, "", DEFAULT_ROCKSDB_LOG_FILE) }; let raftdb_info_log_path = if !config.raftdb.info_log_dir.is_empty() { @@ -139,7 +142,7 @@ pub fn initial_logger(config: &TiKvConfig) { rocksdb: R, raftdb: T, slow: Option, - config: &TiKvConfig, + config: &TikvConfig, ) where N: slog::Drain + Send + 'static, R: slog::Drain + Send + 'static, @@ -150,9 +153,11 @@ pub fn initial_logger(config: &TiKvConfig) { let drainer = logger::LogDispatcher::new(normal, rocksdb, raftdb, slow); let level = config.log.level; let slow_threshold = config.slow_log_threshold.as_millis(); - logger::init_log(drainer, level, true, true, vec![], slow_threshold).unwrap_or_else(|e| { - fatal!("failed to initialize log: {}", e); - }); + logger::init_log(drainer, level.into(), true, true, vec![], slow_threshold).unwrap_or_else( + |e| { + fatal!("failed to initialize log: {}", e); + }, + ); } macro_rules! 
do_build { @@ -233,10 +238,10 @@ pub fn initial_metric(cfg: &MetricConfig) { } #[allow(dead_code)] -pub fn overwrite_config_with_cmd_args(config: &mut TiKvConfig, matches: &ArgMatches<'_>) { +pub fn overwrite_config_with_cmd_args(config: &mut TikvConfig, matches: &ArgMatches<'_>) { if let Some(level) = matches.value_of("log-level") { - config.log.level = logger::get_level_by_string(level).unwrap(); - config.log_level = slog::Level::Info; + config.log.level = logger::get_level_by_string(level).unwrap().into(); + config.log_level = slog::Level::Info.into(); } if let Some(file) = matches.value_of("log-file") { @@ -298,7 +303,7 @@ pub fn overwrite_config_with_cmd_args(config: &mut TiKvConfig, matches: &ArgMatc } #[allow(dead_code)] -pub fn validate_and_persist_config(config: &mut TiKvConfig, persist: bool) { +pub fn validate_and_persist_config(config: &mut TikvConfig, persist: bool) { config.compatible_adjust(); if let Err(e) = config.validate() { fatal!("invalid configuration: {}", e); diff --git a/components/server/src/signal_handler.rs b/components/server/src/signal_handler.rs index 5b73154241b..0977a1ed814 100644 --- a/components/server/src/signal_handler.rs +++ b/components/server/src/signal_handler.rs @@ -1,21 +1,34 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
+use std::sync::Arc; + +use engine_rocks::RocksStatistics; +use engine_traits::{Engines, KvEngine, RaftEngine}; + pub use self::imp::wait_for_signal; #[cfg(unix)] mod imp { - use engine_traits::{Engines, KvEngine, MiscExt, RaftEngine}; - use libc::c_int; - use signal::{trap::Trap, Signal::*}; + use engine_traits::MiscExt; + use signal_hook::{ + consts::{SIGHUP, SIGINT, SIGTERM, SIGUSR1, SIGUSR2}, + iterator::Signals, + }; use tikv_util::metrics; + use super::*; + #[allow(dead_code)] - pub fn wait_for_signal(engines: Option>) { - let trap = Trap::trap(&[SIGTERM, SIGINT, SIGHUP, SIGUSR1, SIGUSR2]); - for sig in trap { - match sig { + pub fn wait_for_signal( + engines: Option>, + kv_statistics: Option>, + raft_statistics: Option>, + ) { + let mut signals = Signals::new([SIGTERM, SIGINT, SIGHUP, SIGUSR1, SIGUSR2]).unwrap(); + for signal in &mut signals { + match signal { SIGTERM | SIGINT | SIGHUP => { - info!("receive signal {}, stopping server...", sig as c_int); + info!("receive signal {}, stopping server...", signal); break; } SIGUSR1 => { @@ -23,7 +36,13 @@ mod imp { info!("{}", metrics::dump(false)); if let Some(ref engines) = engines { info!("{:?}", MiscExt::dump_stats(&engines.kv)); + if let Some(s) = kv_statistics.as_ref() && let Some(s) = s.to_string() { + info!("{:?}", s); + } info!("{:?}", RaftEngine::dump_stats(&engines.raft)); + if let Some(s) = raft_statistics.as_ref() && let Some(s) = s.to_string() { + info!("{:?}", s); + } } } // TODO: handle more signal @@ -35,7 +54,12 @@ mod imp { #[cfg(not(unix))] mod imp { - use engine_traits::{Engines, KvEngine, RaftEngine}; + use super::*; - pub fn wait_for_signal(_: Option>) {} + pub fn wait_for_signal( + _: Option>, + _: Option>, + _: Option>, + ) { + } } diff --git a/components/snap_recovery/Cargo.toml b/components/snap_recovery/Cargo.toml new file mode 100644 index 00000000000..4768759b852 --- /dev/null +++ b/components/snap_recovery/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "snap_recovery" +version = 
"0.1.0" +edition = "2021" +publish = false +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +chrono = "0.4" +encryption = { workspace = true } +encryption_export = { workspace = true } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } +futures = { version = "0.3", features = ["executor"] } +grpcio = { workspace = true } +keys = { workspace = true } +kvproto = { workspace = true } +log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } +pd_client = { workspace = true } +protobuf = { version = "2.8", features = ["bytes"] } +raft_log_engine = { workspace = true } +raftstore = { workspace = true } +slog = { workspace = true } +slog-global = { workspace = true } +structopt = "0.3" +tempfile = "3.0" +thiserror = "1.0" +tikv = { workspace = true } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } +toml = "0.5" +txn_types = { workspace = true } diff --git a/components/snap_recovery/src/data_resolver.rs b/components/snap_recovery/src/data_resolver.rs new file mode 100644 index 00000000000..4ef8e7a6410 --- /dev/null +++ b/components/snap_recovery/src/data_resolver.rs @@ -0,0 +1,456 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + error::Error as StdError, + ops::Bound, + result, + sync::{Arc, Mutex}, + thread::JoinHandle, + time::Instant, +}; + +use engine_rocks::{RocksEngine, RocksEngineIterator, RocksWriteBatchVec}; +use engine_traits::{ + IterOptions, Iterable, Iterator, Mutable, WriteBatch, WriteBatchExt, WriteOptions, CF_DEFAULT, + CF_LOCK, CF_WRITE, +}; +use futures::channel::mpsc::UnboundedSender; +use kvproto::recoverdatapb::ResolveKvDataResponse; +use thiserror::Error; +use tikv_util::sys::thread::StdThreadBuildWrapper; +use txn_types::{Key, TimeStamp, Write, WriteRef}; + +pub type Result = result::Result; + +#[allow(dead_code)] +#[derive(Debug, Error)] +pub enum Error { + #[error("Invalid Argument {0:?}")] + InvalidArgument(String), + + #[error("Not Found {0:?}")] + NotFound(String), + + #[error("Engine {0:?}")] + Engine(#[from] engine_traits::Error), + + #[error("{0:?}")] + Other(#[from] Box), +} + +/// `DataResolverManager` is the manager that manages the resolve kv data +/// process. +/// currently, we do not support retry the data resolver, tidb-operator does not +/// support apply a restore twice TODO: in future, BR may able to retry if some +/// accident +pub struct DataResolverManager { + /// The engine we are working on + engine: RocksEngine, + /// progress info + tx: UnboundedSender, + /// Current working workers + workers: Arc>>>, + resolved_ts: TimeStamp, +} + +impl Clone for DataResolverManager { + fn clone(&self) -> Self { + Self { + engine: self.engine.clone(), + tx: self.tx.clone(), + workers: Arc::new(Mutex::new(Vec::new())), + resolved_ts: self.resolved_ts, + } + } +} + +#[allow(dead_code)] +impl DataResolverManager { + pub fn new( + engine: RocksEngine, + tx: UnboundedSender, + resolved_ts: TimeStamp, + ) -> Self { + DataResolverManager { + engine, + tx, + workers: Arc::new(Mutex::new(Vec::new())), + resolved_ts, + } + } + /// Start a delete kv data process which delete all data by resolved_ts. 
+ pub fn start(&self) { + self.resolve_lock(); + self.resolve_write(); + } + + fn resolve_lock(&self) { + let mut readopts = IterOptions::new(None, None, false); + readopts.set_hint_min_ts(Bound::Excluded(self.resolved_ts.into_inner())); + let lock_iter = self.engine.iterator_opt(CF_LOCK, readopts).unwrap(); + let mut worker = LockResolverWorker::new(lock_iter, self.tx.clone()); + let mut wb = self.engine.write_batch(); + let props = tikv_util::thread_group::current_properties(); + + let handle = std::thread::Builder::new() + .name("cleanup_lock".to_string()) + .spawn_wrapper(move || { + tikv_util::thread_group::set_properties(props); + tikv_alloc::add_thread_memory_accessor(); + + worker + .cleanup_lock(&mut wb) + .expect("cleanup lock failed when delete data from invalid cf"); + + tikv_alloc::remove_thread_memory_accessor(); + }) + .expect("failed to spawn resolve_kv_data thread"); + self.workers.lock().unwrap().push(handle); + } + + fn resolve_write(&self) { + let mut readopts = IterOptions::new(None, None, false); + readopts.set_hint_min_ts(Bound::Excluded(self.resolved_ts.into_inner())); + let write_iter = self + .engine + .iterator_opt(CF_WRITE, readopts.clone()) + .unwrap(); + let mut worker = WriteResolverWorker::new(write_iter, self.resolved_ts, self.tx.clone()); + let mut wb = self.engine.write_batch(); + let props = tikv_util::thread_group::current_properties(); + + let handle = std::thread::Builder::new() + .name("resolve_write".to_string()) + .spawn_wrapper(move || { + tikv_util::thread_group::set_properties(props); + tikv_alloc::add_thread_memory_accessor(); + + if let Err(e) = worker.resolve_write(&mut wb) { + error!("failed to resolve write cf"; + "error" => ?e); + } + + tikv_alloc::remove_thread_memory_accessor(); + }) + .expect("failed to spawn resolve_kv_data thread"); + + self.workers.lock().unwrap().push(handle); + } + + // join and wait until the thread exit + pub fn wait(&self) { + let mut last_error = None; + for h in 
self.workers.lock().unwrap().drain(..) { + info!("waiting for {}", h.thread().name().unwrap()); + if let Err(e) = h.join() { + error!("failed to join manager thread: {:?}", e); + last_error = Some(e); + } + } + if let Some(e) = last_error { + safe_panic!("failed to join manager thread: {:?}", e); + } + } +} +/// `LockResolverWorker` is the worker that does the clean lock cf. +pub struct LockResolverWorker { + lock_iter: RocksEngineIterator, + /// send progress of this task + tx: UnboundedSender, +} + +#[allow(dead_code)] +impl LockResolverWorker { + pub fn new( + mut lock_iter: RocksEngineIterator, + tx: UnboundedSender, + ) -> Self { + lock_iter.seek_to_first().unwrap(); + Self { lock_iter, tx } + } + pub fn cleanup_lock(&mut self, wb: &mut RocksWriteBatchVec) -> Result { + let mut key_count: u64 = 0; + while self.lock_iter.valid().unwrap() { + box_try!(wb.delete_cf(CF_LOCK, self.lock_iter.key())); + self.lock_iter.next().unwrap(); + key_count += 1; + } + info!("clean up lock cf. delete key count {}", key_count); + let mut write_opts = WriteOptions::new(); + write_opts.set_sync(true); + box_try!(wb.write_opt(&write_opts)); + let mut response = ResolveKvDataResponse::default(); + + response.set_resolved_key_count(key_count); + if let Err(e) = self.tx.unbounded_send(response) { + warn!("send the cleanup lock key failure {}", e); + if e.is_disconnected() { + warn!("channel is disconnected."); + return Ok(false); + } + } + Ok(true) + } +} + +// TODO: as we tested, this size may more effective than set to 256 (max write +// batch) a more robust test need to figure out what is best. +const BATCH_SIZE_LIMIT: usize = 1024 * 1024; +/// `WriteResolverWorker` is the worker that does the actual delete data work. +pub struct WriteResolverWorker { + batch_size_limit: usize, + /// `resolved_ts` is the timestamp to data delete to. 
+ resolved_ts: TimeStamp, + write_iter: RocksEngineIterator, + /// send progress of this task + tx: UnboundedSender, +} + +/// `Batch` means a batch of writes load from the engine. +/// We scan writes in batches to prevent huge memory usage. +struct Batch { + writes: Vec<(Vec, Write)>, + has_more: bool, +} + +#[allow(dead_code)] +impl WriteResolverWorker { + pub fn new( + mut write_iter: RocksEngineIterator, + resolved_ts: TimeStamp, + tx: UnboundedSender, + ) -> Self { + write_iter.seek_to_first().unwrap(); + Self { + batch_size_limit: BATCH_SIZE_LIMIT, + write_iter, + resolved_ts, + tx, + } + } + pub fn resolve_write(&mut self, wb: &mut RocksWriteBatchVec) -> Result<()> { + let now = Instant::now(); + while self.batch_resolve_write(wb)? {} + info!("resolve write"; + "spent_time" => now.elapsed().as_secs(), + ); + Ok(()) + } + + fn next_write(&mut self) -> Result, Write)>> { + if self.write_iter.valid().unwrap() { + let write = box_try!(WriteRef::parse(self.write_iter.value())).to_owned(); + let key = self.write_iter.key().to_vec(); + self.write_iter.next().unwrap(); + return Ok(Some((key, write))); + } + Ok(None) + } + + fn scan_next_batch(&mut self) -> Result { + let mut writes = Vec::with_capacity(self.batch_size_limit); + let mut has_more = true; + + for _ in 0..self.batch_size_limit { + if let Some((key, write)) = self.next_write()? 
{ + let commit_ts = box_try!(Key::decode_ts_from(keys::origin_key(&key))); + if commit_ts > self.resolved_ts { + writes.push((key, write)); + } + } else { + has_more = false; + break; + } + } + Ok(Batch { writes, has_more }) + } + + // delete key.commit_ts > resolved-ts in write cf and default cf + fn batch_resolve_write(&mut self, wb: &mut RocksWriteBatchVec) -> Result { + let Batch { writes, has_more } = self.scan_next_batch()?; + if has_more && writes.is_empty() { + return Ok(has_more); + } + + let batch = writes.clone(); + let mut max_ts: TimeStamp = 0.into(); + for (key, write) in writes { + let default_key = Key::from_encoded_slice(&key) + .truncate_ts() + .unwrap() + .append_ts(write.start_ts); + box_try!(wb.delete_cf(CF_WRITE, &key)); + box_try!(wb.delete_cf(CF_DEFAULT, default_key.as_encoded())); + + let commit_ts = box_try!(Key::decode_ts_from(keys::origin_key(&key))); + if commit_ts > max_ts { + max_ts = commit_ts; + } + } + info!( + "flush delete in write/default cf."; + "delete_key_count" => batch.len(), + ); + let mut write_opts = WriteOptions::new(); + write_opts.set_sync(true); + wb.write_opt(&write_opts)?; + + let mut response = ResolveKvDataResponse::default(); + + response.set_resolved_key_count(batch.len().try_into().unwrap()); + response.set_current_commit_ts(max_ts.into_inner()); + if let Err(e) = self.tx.unbounded_send(response) { + warn!("send the resolved key failure {}", e); + if e.is_disconnected() { + warn!("channel is disconnected."); + return Ok(has_more); + } + } + + Ok(has_more) + } +} +#[cfg(test)] +mod tests { + use engine_traits::{WriteBatch, WriteBatchExt, ALL_CFS, CF_LOCK}; + use futures::channel::mpsc; + use tempfile::Builder; + use txn_types::{Lock, LockType, WriteType}; + + use super::*; + + #[test] + fn test_data_resolver() { + let tmp = Builder::new() + .prefix("test_data_resolver") + .tempdir() + .unwrap(); + let path = tmp.path().to_str().unwrap(); + let fake_engine = engine_rocks::util::new_engine(path, 
ALL_CFS).unwrap(); + + // insert some keys, and resolved base on 100 + // write cf will remain one key + let write = vec![ + // key, start_ts, commit_ts + (b"k", 189, 190), + (b"k", 122, 123), + (b"k", 110, 111), + (b"k", 98, 99), + ]; + let default = vec![ + // key, start_ts + (b"k", 189), + (b"k", 122), + (b"k", 110), + (b"k", 98), + ]; + let lock = vec![ + // key, start_ts, for_update_ts, lock_type, short_value, check + (b"k", 100, 0, LockType::Put, false), + (b"k", 100, 0, LockType::Delete, false), + (b"k", 99, 0, LockType::Put, true), + (b"k", 98, 0, LockType::Delete, true), + ]; + let mut kv = vec![]; + for (key, start_ts, commit_ts) in write { + let write = Write::new(WriteType::Put, start_ts.into(), None); + kv.push(( + CF_WRITE, + Key::from_raw(key).append_ts(commit_ts.into()), + write.as_ref().to_bytes(), + )); + } + for (key, ts) in default { + kv.push(( + CF_DEFAULT, + Key::from_raw(key).append_ts(ts.into()), + b"v".to_vec(), + )); + } + for (key, ts, for_update_ts, tp, short_value) in lock { + let v = if short_value { + Some(b"v".to_vec()) + } else { + None + }; + let lock = Lock::new( + tp, + vec![], + ts.into(), + 0, + v, + for_update_ts.into(), + 0, + TimeStamp::zero(), + ); + kv.push((CF_LOCK, Key::from_raw(key), lock.to_bytes())); + } + let mut wb = fake_engine.write_batch(); + for &(cf, ref k, ref v) in &kv { + wb.put_cf(cf, &keys::data_key(k.as_encoded()), v).unwrap(); + } + wb.write().unwrap(); + + let (tx, _) = mpsc::unbounded(); + let resolver = DataResolverManager::new(fake_engine.clone(), tx, 100.into()); + resolver.start(); + // wait to delete finished + resolver.wait(); + + // write cf will remain only one key + let readopts = IterOptions::new(None, None, false); + let mut write_iter = fake_engine + .iterator_opt(CF_WRITE, readopts.clone()) + .unwrap(); + write_iter.seek_to_first().unwrap(); + let mut remaining_writes = vec![]; + while write_iter.valid().unwrap() { + let write = WriteRef::parse(write_iter.value()).unwrap().to_owned(); + 
let key = write_iter.key().to_vec(); + write_iter.next().unwrap(); + remaining_writes.push((key, write)); + } + + // default cf will remain only one key + let mut default_iter = fake_engine + .iterator_opt(CF_DEFAULT, readopts.clone()) + .unwrap(); + default_iter.seek_to_first().unwrap(); + let mut remaining_defaults = vec![]; + while default_iter.valid().unwrap() { + let key = default_iter.key().to_vec(); + let value = default_iter.value().to_vec(); + default_iter.next().unwrap(); + remaining_defaults.push((key, value)); + } + + // lock cf will be clean + let mut lock_iter = fake_engine.iterator_opt(CF_LOCK, readopts).unwrap(); + lock_iter.seek_to_first().unwrap(); + let mut remaining_locks = vec![]; + while lock_iter.valid().unwrap() { + let lock = Lock::parse(lock_iter.value()).unwrap().to_owned(); + let key = lock_iter.key().to_vec(); + lock_iter.next().unwrap(); + remaining_locks.push((key, lock)); + } + + // Writes which start_ts >= 100 should be removed. + assert_eq!(remaining_writes.len(), 1); + let (key, _) = &remaining_writes[0]; + // So the only write left is the one with start_ts = 99 + assert_eq!( + Key::from_encoded(key.clone()).decode_ts().unwrap(), + 99.into() + ); + // Defaults corresponding to the removed writes should be removed. + assert_eq!(remaining_defaults.len(), 1); + let (key, _) = &remaining_defaults[0]; + assert_eq!( + Key::from_encoded(key.clone()).decode_ts().unwrap(), + 98.into() + ); + // All locks should be removed. + assert!(remaining_locks.is_empty()); + } +} diff --git a/components/snap_recovery/src/init_cluster.rs b/components/snap_recovery/src/init_cluster.rs new file mode 100644 index 00000000000..4e72a19d6a6 --- /dev/null +++ b/components/snap_recovery/src/init_cluster.rs @@ -0,0 +1,364 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{cmp, error::Error as StdError, i32, result, sync::Arc, thread, time::Duration}; + +use encryption_export::data_key_manager_from_config; +use engine_rocks::{util::new_engine_opt, RocksEngine}; +use engine_traits::{Engines, Error as EngineError, Peekable, RaftEngine, SyncMutable}; +use kvproto::{metapb, raft_serverpb::StoreIdent}; +use pd_client::{Error as PdError, PdClient}; +use raft_log_engine::RaftLogEngine; +use raftstore::store::initial_region; +use thiserror::Error; +use tikv::{ + config::TikvConfig, + server::{config::Config as ServerConfig, KvEngineFactoryBuilder}, +}; +use tikv_util::{ + config::{ReadableDuration, ReadableSize, VersionTrack}, + sys::SysQuota, +}; + +const CLUSTER_BOOTSTRAPPED_MAX_RETRY: u64 = 60; +const CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL: Duration = Duration::from_secs(3); +pub const LOCK_FILE_ERROR: &str = "IO error: While lock file"; + +#[allow(dead_code)] +// TODO: ERROR need more specific +#[derive(Debug, Error)] +pub enum Error { + #[error("Invalid Argument {0:?}")] + InvalidArgument(String), + + #[error("Not Found {0:?}")] + NotFound(String), + + #[error("{0:?}")] + Other(#[from] Box), +} + +pub type Result = result::Result; + +// snapshot recovery +// recovery mode parameter +const SNAP_MAX_TIMEOUT: usize = 12 * 60 * 60; + +// may deleted after ban the asksplit from PD +const MAX_REGION_SIZE: u64 = 1024; +const MAX_SPLIT_KEY: u64 = 1 << 31; + +/// Run a TiKV server in recovery mode +/// recovery mode include: +/// 1. no election happen between raft group +/// 2. peer valid during a recovery time even without leader in its region +/// 3. PD can not put any peer into tombstone +/// 4. must ensure all region data with ts less than backup ts (below commit +/// index) are safe +pub fn enter_snap_recovery_mode(config: &mut TikvConfig) { + // TOOD: if we do not have to restart TiKV, then, we need exit the recovery mode + // and bring the following parameter back. 
+ info!("adjust the raft configure and rocksdb config."); + let bt = config.raft_store.raft_base_tick_interval.0; + + config.raft_store.raft_election_timeout_ticks = SNAP_MAX_TIMEOUT; + config.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(4 * 60 * 60); + // time to check if peer alive without the leader, will not check peer during + // this time interval + config.raft_store.peer_stale_state_check_interval = + ReadableDuration(bt * 4 * SNAP_MAX_TIMEOUT as _); + + // duration allow a peer alive without leader in region, otherwise report the + // metrics and show peer as abnormal + config.raft_store.abnormal_leader_missing_duration = + ReadableDuration(bt * 4 * SNAP_MAX_TIMEOUT as _); + + // duration allow a peer alive without leader in region, otherwise report the PD + // and delete itself(peer) + config.raft_store.max_leader_missing_duration = + ReadableDuration(bt * 4 * SNAP_MAX_TIMEOUT as _); + + // for optimize the write + config.raft_store.snap_generator_pool_size = 20; + // applied snapshot mem size + config.raft_store.snap_apply_batch_size = ReadableSize::gb(1); + // max snapshot file size, if larger than it, file be splitted. + config.raft_store.max_snapshot_file_raw_size = ReadableSize::gb(1); + config.raft_store.hibernate_regions = false; + + // disable auto compactions during the restore + config.rocksdb.defaultcf.disable_auto_compactions = true; + config.rocksdb.writecf.disable_auto_compactions = true; + config.rocksdb.lockcf.disable_auto_compactions = true; + config.rocksdb.raftcf.disable_auto_compactions = true; + + // for cpu = 1, take a reasonable value min[32, maxValue]. 
+ let limit = (SysQuota::cpu_cores_quota() * 10.0) as i32; + config.rocksdb.max_background_jobs = cmp::min(32, limit); + // disable resolve ts during the recovery + config.resolved_ts.enable = false; + + // ebs volume has very poor performance during restore, it easy to cause the + // raft client timeout, at the same time clean up all message included + // significant message. restore is not memory sensetive, we may keep + // messages as much as possible during the network disturbing in recovery mode + config.server.raft_client_max_backoff = ReadableDuration::secs(20); + + // Disable region split during recovering. + config.coprocessor.region_max_size = Some(ReadableSize::gb(MAX_REGION_SIZE)); + config.coprocessor.region_split_size = Some(ReadableSize::gb(MAX_REGION_SIZE)); + config.coprocessor.region_max_keys = Some(MAX_SPLIT_KEY); + config.coprocessor.region_split_keys = Some(MAX_SPLIT_KEY); +} + +// update the cluster_id and bootcluster in pd before tikv startup +pub fn start_recovery(config: TikvConfig, cluster_id: u64, pd_client: Arc) { + let local_engine_service = create_local_engine_service(&config) + .unwrap_or_else(|e| panic!("create a local engine reader failure, error is {}", e)); + + local_engine_service.set_cluster_id(cluster_id); + info!("update cluster id {} from pd in recovery mode", cluster_id); + let store_id = local_engine_service.get_store_id().unwrap_or_else(|e| { + panic!( + "can not found the store id from boot storage, error is {:?}", + e + ) + }); + + let server_config = Arc::new(VersionTrack::new(config.server.clone())); + let _ = bootcluster( + &server_config.value().clone(), + cluster_id, + store_id, + pd_client, + ); +} + +// since we do not recover pd store meta, we have to bootcluster from pd by +// first region. 
+fn bootcluster( + cfg: &ServerConfig, + cluster_id: u64, + store_id: u64, + pd_client: Arc, +) -> Result<()> { + // build a store from config for bootcluster + let mut store = metapb::Store::default(); + store.set_id(store_id); + if cfg.advertise_addr.is_empty() { + store.set_address(cfg.addr.clone()); + } else { + store.set_address(cfg.advertise_addr.clone()) + } + if cfg.advertise_status_addr.is_empty() { + store.set_status_address(cfg.status_addr.clone()); + } else { + store.set_status_address(cfg.advertise_status_addr.clone()) + } + store.set_version(env!("CARGO_PKG_VERSION").to_string()); + + if let Ok(path) = std::env::current_exe() { + if let Some(path) = path.parent() { + store.set_deploy_path(path.to_string_lossy().to_string()); + } + }; + + store.set_start_timestamp(chrono::Local::now().timestamp()); + store.set_git_hash( + option_env!("TIKV_BUILD_GIT_HASH") + .unwrap_or("Unknown git hash") + .to_string(), + ); + + let mut labels = Vec::new(); + for (k, v) in &cfg.labels { + let mut label = metapb::StoreLabel::default(); + label.set_key(k.to_owned()); + label.set_value(v.to_owned()); + labels.push(label); + } + + store.set_labels(labels.into()); + + // init a region to boot pd cluster.· + let region_id = pd_client + .alloc_id() + .unwrap_or_else(|e| panic!("get allocate id for region failure, error is {:?}", e)); + let peer_id = pd_client + .alloc_id() + .unwrap_or_else(|e| panic!("get allocate id for peer failure, error is {:?}", e)); + debug!( + "alloc first peer id for first region"; + "peer_id" => peer_id, + "region_id" => region_id, + ); + + let region = initial_region(store_id, region_id, peer_id); + + // bootstrap cluster to pd + let mut retry = 0; + while retry < CLUSTER_BOOTSTRAPPED_MAX_RETRY { + match pd_client.bootstrap_cluster(store.clone(), region.clone()) { + Ok(_) => { + info!("bootstrap cluster ok in recovery mode"; "cluster_id" => cluster_id); + return Ok(()); + } + Err(PdError::ClusterBootstrapped(_)) => match pd_client.get_region(b"") 
{ + Ok(first_region) => { + if region == first_region { + return Ok(()); + } else { + info!( + "cluster is already bootstrapped in recovery mode; cluster_id {}", + cluster_id + ); + } + return Ok(()); + } + Err(e) => { + warn!("bootstrap cluster failure; error is {:?}", e); + } + }, + Err(e) => error!( + "bootstrap cluster failure, cluster_id {}, error is {:?}", + cluster_id, e + ), + } + retry += 1; + thread::sleep(CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL); + } + Err(box_err!("bootstrapped cluster failed")) +} +// the service to operator the local engines +pub trait LocalEngineService { + fn set_cluster_id(&self, cluster_id: u64); + fn get_store_id(&self) -> Result; +} + +// init engine and read local engine info +pub struct LocalEngines { + engines: Engines, +} + +impl LocalEngines { + pub fn new(engines: Engines) -> LocalEngines { + LocalEngines { engines } + } + + pub fn get_engine(&self) -> &Engines { + &self.engines + } +} + +impl LocalEngineService for LocalEngines { + fn set_cluster_id(&self, cluster_id: u64) { + let res = self + .get_engine() + .kv + .get_msg::(keys::STORE_IDENT_KEY) + .unwrap_or_else(|e| { + panic!("there is not ident in store, error is {:?}", e); + }); + + if res.is_none() { + return; + } + + let mut ident = res.unwrap(); + ident.set_cluster_id(cluster_id); + + self.get_engine() + .kv + .put_msg::(keys::STORE_IDENT_KEY, &ident) + .unwrap(); + self.engines.sync_kv().unwrap(); + } + + // return cluster id and store id for registry the store to PD + fn get_store_id(&self) -> Result { + let res = self + .engines + .kv + .get_msg::(keys::STORE_IDENT_KEY) + .unwrap_or_else(|e| panic!("get store id failure, error is {:?}", e)); + + let ident = res.unwrap(); + + let store_id = ident.get_store_id(); + if store_id == 0 { + error!("invalid store to report"); + } + + Ok(store_id) + } +} + +fn handle_engine_error(err: EngineError) -> ! 
{ + error!("error while open kvdb: {}", err); + if let EngineError::Engine(msg) = err { + if msg.state().contains(LOCK_FILE_ERROR) { + error!( + "LOCK file conflict indicates TiKV process is running. \ + Do NOT delete the LOCK file and force the command to run. \ + Doing so could cause data corruption." + ); + } + } + + tikv_util::logger::exit_process_gracefully(-1); +} + +// raft log engine could be a raft engine or rocksdb +pub fn create_local_engine_service( + config: &TikvConfig, +) -> std::result::Result, String> { + // init env for init kv db and raft engine + let key_manager = + data_key_manager_from_config(&config.security.encryption, &config.storage.data_dir) + .map_err(|e| format!("init encryption manager: {}", e))? + .map(Arc::new); + let env = config + .build_shared_rocks_env(key_manager.clone(), None) + .map_err(|e| format!("build shared rocks env: {}", e))?; + let block_cache = config + .storage + .block_cache + .build_shared_cache(config.storage.engine); + + // init rocksdb / kv db + let factory = KvEngineFactoryBuilder::new(env.clone(), config, block_cache) + .lite(true) + .build(); + let kv_db = match factory.create_shared_db(&config.storage.data_dir) { + Ok(db) => db, + Err(e) => handle_engine_error(e), + }; + + // init raft engine, either is rocksdb or raft engine + if !config.raft_engine.enable { + // rocksdb + let raft_db_opts = config.raftdb.build_opt(env, None); + let raft_db_cf_opts = config.raftdb.build_cf_opts(factory.block_cache()); + let raft_path = config + .infer_raft_db_path(None) + .map_err(|e| format!("infer raftdb path: {}", e))?; + let raft_db = match new_engine_opt(&raft_path, raft_db_opts, raft_db_cf_opts) { + Ok(db) => db, + Err(e) => handle_engine_error(e), + }; + + let local_engines = LocalEngines::new(Engines::new(kv_db, raft_db)); + Ok(Box::new(local_engines) as Box) + } else { + // raft engine + let mut cfg = config.raft_engine.config(); + cfg.dir = config.infer_raft_engine_path(None).unwrap(); + if 
!RaftLogEngine::exists(&cfg.dir) { + error!("raft engine not exists: {}", cfg.dir); + tikv_util::logger::exit_process_gracefully(-1); + } + let raft_db = RaftLogEngine::new(cfg, key_manager, None /* io_rate_limiter */).unwrap(); + let local_engines = LocalEngines::new(Engines::new(kv_db, raft_db)); + + Ok(Box::new(local_engines) as Box) + } +} diff --git a/components/snap_recovery/src/lib.rs b/components/snap_recovery/src/lib.rs new file mode 100644 index 00000000000..2ed4a97c5d3 --- /dev/null +++ b/components/snap_recovery/src/lib.rs @@ -0,0 +1,12 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +pub mod init_cluster; +pub mod services; +#[macro_use] +extern crate tikv_util; + +pub use init_cluster::{enter_snap_recovery_mode, start_recovery}; +pub use services::RecoveryService; + +mod data_resolver; +mod region_meta_collector; diff --git a/components/snap_recovery/src/region_meta_collector.rs b/components/snap_recovery/src/region_meta_collector.rs new file mode 100644 index 00000000000..16e53b3b88b --- /dev/null +++ b/components/snap_recovery/src/region_meta_collector.rs @@ -0,0 +1,230 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{cell::RefCell, error::Error as StdError, result, thread::JoinHandle}; + +use engine_rocks::RocksEngine; +use engine_traits::{Engines, Iterable, Peekable, RaftEngine, CF_RAFT}; +use futures::channel::mpsc::UnboundedSender; +use kvproto::{ + raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, + recoverdatapb::*, +}; +use thiserror::Error; +use tikv_util::sys::thread::StdThreadBuildWrapper; + +pub type Result = result::Result; + +#[allow(dead_code)] +#[derive(Debug, Error)] +pub enum Error { + #[error("Invalid Argument {0:?}")] + InvalidArgument(String), + + #[error("Not Found {0:?}")] + NotFound(String), + + #[error("{0:?}")] + Other(#[from] Box), +} + +/// `RegionMetaCollector` is the collector that collector all region meta +pub struct RegionMetaCollector { + /// The engine we are working on + engines: Engines, + /// region meta report to br + tx: UnboundedSender, + /// Current working workers + worker_handle: RefCell>>, +} + +#[allow(dead_code)] +impl RegionMetaCollector { + pub fn new(engines: Engines, tx: UnboundedSender) -> Self { + RegionMetaCollector { + engines, + tx, + worker_handle: RefCell::new(None), + } + } + /// Start a collector and region meta report. 
+ pub fn start_report(&self) { + let worker = CollectWorker::new(self.engines.clone(), self.tx.clone()); + let props = tikv_util::thread_group::current_properties(); + *self.worker_handle.borrow_mut() = Some( + std::thread::Builder::new() + .name("collector_region_meta".to_string()) + .spawn_wrapper(move || { + tikv_util::thread_group::set_properties(props); + tikv_alloc::add_thread_memory_accessor(); + + worker + .collect_report() + .expect("collect region meta and report to br failure."); + + tikv_alloc::remove_thread_memory_accessor(); + }) + .expect("failed to spawn collector_region_meta thread"), + ); + } + + // join and wait until the thread exit + pub fn wait(&self) { + if let Err(e) = self.worker_handle.take().unwrap().join() { + error!("failed to join thread: {:?}", e); + } + } +} + +struct CollectWorker { + /// The engine we are working on + engines: Engines, + tx: UnboundedSender, +} + +impl CollectWorker { + pub fn new(engines: Engines, tx: UnboundedSender) -> Self { + CollectWorker { engines, tx } + } + + fn get_local_region(&self, region_id: u64) -> Result { + let raft_state = box_try!(self.engines.raft.get_raft_state(region_id)); + + let apply_state_key = keys::apply_state_key(region_id); + let apply_state = box_try!( + self.engines + .kv + .get_msg_cf::(CF_RAFT, &apply_state_key) + ); + + let region_state_key = keys::region_state_key(region_id); + let region_state = box_try!( + self.engines + .kv + .get_msg_cf::(CF_RAFT, ®ion_state_key) + ); + + match (raft_state, apply_state, region_state) { + (None, None, None) => Err(Error::NotFound(format!("info for region {}", region_id))), + (raft_state, apply_state, region_state) => { + Ok(LocalRegion::new(raft_state, apply_state, region_state)) + } + } + } + + /// collect all region and report to br + pub fn collect_report(&self) -> Result { + let db = &self.engines.kv; + let cf = CF_RAFT; + let start_key = keys::REGION_META_MIN_KEY; + let end_key = keys::REGION_META_MAX_KEY; + let mut regions = 
Vec::with_capacity(1024); + box_try!(db.scan(cf, start_key, end_key, false, |key, _| { + let (id, suffix) = box_try!(keys::decode_region_meta_key(key)); + if suffix != keys::REGION_STATE_SUFFIX { + return Ok(true); + } + regions.push(id); + Ok(true) + })); + + for region_id in regions { + let region_state = self.get_local_region(region_id)?; + + // It's safe to unwrap region_local_state here, since region_id guarantees that + // the region state exists + if region_state.region_local_state.as_ref().unwrap().state == PeerState::Tombstone { + continue; + } + + region_state.raft_local_state.as_ref().ok_or_else(|| { + Error::Other(format!("No RaftLocalState found for region {}", region_id).into()) + })?; + region_state.raft_apply_state.as_ref().ok_or_else(|| { + Error::Other(format!("No RaftApplyState found for region {}", region_id).into()) + })?; + + // send to br + let response = region_state.to_region_meta(); + + if let Err(e) = self.tx.unbounded_send(response) { + warn!("send the region meta failure"; + "err" => ?e); + if e.is_disconnected() { + warn!("channel is disconnected."); + return Ok(false); + } + } + } + Ok(true) + } +} + +#[derive(PartialEq, Debug, Default)] +pub struct LocalRegion { + pub raft_local_state: Option, + pub raft_apply_state: Option, + pub region_local_state: Option, +} + +impl LocalRegion { + fn new( + raft_local: Option, + raft_apply: Option, + region_local: Option, + ) -> Self { + LocalRegion { + raft_local_state: raft_local, + raft_apply_state: raft_apply, + region_local_state: region_local, + } + } + + // fetch local region info into a gRPC message structure RegionMeta + fn to_region_meta(&self) -> RegionMeta { + let mut region_meta = RegionMeta::default(); + region_meta.region_id = self.region_local_state.as_ref().unwrap().get_region().id; + region_meta.peer_id = self + .region_local_state + .as_ref() + .unwrap() + .get_region() + .get_peers() + .to_vec() + .iter() + .max_by_key(|p| p.id) + .unwrap() + .get_id(); + region_meta.version = 
self + .region_local_state + .as_ref() + .unwrap() + .get_region() + .get_region_epoch() + .version; + region_meta.tombstone = + self.region_local_state.as_ref().unwrap().state == PeerState::Tombstone; + region_meta.start_key = self + .region_local_state + .as_ref() + .unwrap() + .get_region() + .get_start_key() + .to_owned(); + region_meta.end_key = self + .region_local_state + .as_ref() + .unwrap() + .get_region() + .get_end_key() + .to_owned(); + region_meta.last_log_term = self + .raft_local_state + .as_ref() + .unwrap() + .get_hard_state() + .term; + region_meta.last_index = self.raft_local_state.as_ref().unwrap().last_index; + + region_meta + } +} diff --git a/components/snap_recovery/src/services.rs b/components/snap_recovery/src/services.rs new file mode 100644 index 00000000000..98d1942c7ff --- /dev/null +++ b/components/snap_recovery/src/services.rs @@ -0,0 +1,399 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + error::Error as StdError, + result, + sync::mpsc::{sync_channel, SyncSender}, + thread::Builder, + time::Instant, +}; + +use engine_rocks::{ + raw::{CompactOptions, DBBottommostLevelCompaction}, + util::get_cf_handle, + RocksEngine, +}; +use engine_traits::{CfNamesExt, CfOptionsExt, Engines, Peekable, RaftEngine}; +use futures::{ + channel::mpsc, + executor::{ThreadPool, ThreadPoolBuilder}, + FutureExt, SinkExt, StreamExt, +}; +use grpcio::{ + ClientStreamingSink, RequestStream, RpcContext, ServerStreamingSink, UnarySink, WriteFlags, +}; +use kvproto::{raft_serverpb::StoreIdent, recoverdatapb::*}; +use raftstore::{ + router::RaftStoreRouter, + store::{ + fsm::RaftRouter, + msg::{Callback, CasualMessage, PeerMsg, SignificantMsg}, + transport::SignificantRouter, + SnapshotRecoveryWaitApplySyncer, + }, +}; +use thiserror::Error; +use tikv_util::sys::thread::{StdThreadBuildWrapper, ThreadBuildWrapper}; + +use crate::{data_resolver::DataResolverManager, region_meta_collector::RegionMetaCollector}; + +pub type 
Result = result::Result; + +#[allow(dead_code)] +#[derive(Debug, Error)] +pub enum Error { + #[error("Invalid Argument {0:?}")] + InvalidArgument(String), + + #[error("{0:?}")] + Grpc(#[from] grpcio::Error), + + #[error("Engine {0:?}")] + Engine(#[from] engine_traits::Error), + + #[error("{0:?}")] + Other(#[from] Box), +} +/// Service handles the recovery messages from backup restore. +#[derive(Clone)] +pub struct RecoveryService { + engines: Engines, + router: RaftRouter, + threads: ThreadPool, +} + +impl RecoveryService { + /// Constructs a new `Service` with `Engines`, a `RaftStoreRouter` and a + /// `thread pool`. + pub fn new( + engines: Engines, + router: RaftRouter, + ) -> RecoveryService { + let props = tikv_util::thread_group::current_properties(); + let threads = ThreadPoolBuilder::new() + .pool_size(4) + .name_prefix("recovery-service") + .after_start_wrapper(move || { + tikv_util::thread_group::set_properties(props.clone()); + tikv_alloc::add_thread_memory_accessor(); + }) + .before_stop_wrapper(|| tikv_alloc::remove_thread_memory_accessor()) + .create() + .unwrap(); + + // config rocksdb l0 to optimize the restore + // also for massive data applied during the restore, it easy to reach the write + // stop + let db = engines.kv.clone(); + for cf_name in db.cf_names() { + Self::set_db_options(cf_name, db.clone()).expect("set db option failure"); + } + + RecoveryService { + engines, + router, + threads, + } + } + + pub fn set_db_options(cf_name: &str, engine: RocksEngine) -> Result<()> { + let level0_stop_writes_trigger: u32 = 1 << 30; + let level0_slowdown_writes_trigger: u32 = 1 << 30; + let opts = [ + ( + "level0_stop_writes_trigger".to_owned(), + level0_stop_writes_trigger.to_string(), + ), + ( + "level0_slowdown_writes_trigger".to_owned(), + level0_slowdown_writes_trigger.to_string(), + ), + ]; + + let tmp_opts: Vec<_> = opts.iter().map(|(k, v)| (k.as_str(), v.as_str())).collect(); + engine.set_options_cf(cf_name, tmp_opts.as_slice()).unwrap(); + 
Ok(()) + } + + // return cluster id and store id for registry the store to PD + fn get_store_id(&self) -> Result { + let res = self + .engines + .kv + .get_msg::(keys::STORE_IDENT_KEY) + .unwrap(); + if res.is_none() { + return Ok(0); + } + + let ident = res.unwrap(); + let store_id = ident.get_store_id(); + if store_id == 0 { + error!("invalid store to report"); + } + Ok(store_id) + } + + // a new wait apply syncer share with all regions, + // when all region reached the target index, share reference decreased to 0, + // trigger closure to send finish info back. + pub fn wait_apply_last(router: RaftRouter, sender: SyncSender) { + let wait_apply = SnapshotRecoveryWaitApplySyncer::new(0, sender); + router.broadcast_normal(|| { + PeerMsg::SignificantMsg(SignificantMsg::SnapshotRecoveryWaitApply( + wait_apply.clone(), + )) + }); + } +} + +/// This may a temp solution, in future, we may move forward to FlashBack +/// delete data Compact the cf[start..end) in the db. +/// purpose of it to resolve compaction filter gc after restore cluster +fn compact(engine: RocksEngine) -> Result<()> { + let mut handles = Vec::new(); + for cf_name in engine.cf_names() { + let cf = cf_name.to_owned().clone(); + let kv_db = engine.clone(); + let h = Builder::new() + .name(format!("compact-{}", cf)) + .spawn_wrapper(move || { + info!("recovery starts manual compact"; "cf" => cf.clone()); + tikv_alloc::add_thread_memory_accessor(); + let db = kv_db.as_inner(); + let handle = get_cf_handle(db, cf.as_str()).unwrap(); + let mut compact_opts = CompactOptions::new(); + compact_opts.set_max_subcompactions(64); + compact_opts.set_exclusive_manual_compaction(false); + compact_opts.set_bottommost_level_compaction(DBBottommostLevelCompaction::Skip); + db.compact_range_cf_opt(handle, &compact_opts, None, None); + tikv_alloc::remove_thread_memory_accessor(); + + info!("recovery finishes manual compact"; "cf" => cf); + }) + .expect("failed to spawn compaction thread"); + handles.push(h); + } + for h in 
handles { + h.join() + .unwrap_or_else(|e| error!("thread handle join error"; "error" => ?e)); + } + Ok(()) +} + +impl RecoverData for RecoveryService { + // 1. br start to ready region meta + fn read_region_meta( + &mut self, + _ctx: RpcContext<'_>, + _req: ReadRegionMetaRequest, + mut sink: ServerStreamingSink, + ) { + let (tx, rx) = mpsc::unbounded(); + // tx only clone once within RegionMetaCollector, so that it drop automatically + // when work thread done + let meta_collector = RegionMetaCollector::new(self.engines.clone(), tx); + info!("start to collect region meta"); + meta_collector.start_report(); + let send_task = async move { + let mut s = rx.map(|resp| Ok((resp, WriteFlags::default()))); + sink.send_all(&mut s).await?; + sink.close().await?; + Ok(()) + } + .map(|res: Result<()>| match res { + Ok(_) => { + info!("collect region meta done"); + } + Err(e) => { + error!("rcollect region meta failure"; "error" => ?e); + } + }); + + self.threads.spawn_ok(send_task); + } + + // 2. 
br start to recover region + // assign region leader and wait leader apply to last log + fn recover_region( + &mut self, + _ctx: RpcContext<'_>, + mut stream: RequestStream, + sink: ClientStreamingSink, + ) { + let raft_router = self.router.clone(); + let store_id = self.get_store_id(); + info!("start to recover the region"); + let task = async move { + let mut leaders = Vec::new(); + while let Some(req) = stream.next().await { + let req = req.map_err(|e| eprintln!("rpc recv fail: {}", e)).unwrap(); + if req.as_leader { + leaders.push(req.region_id); + } + } + + let mut rxs = Vec::with_capacity(leaders.len()); + for ®ion_id in &leaders { + if let Err(e) = raft_router.send_casual_msg(region_id, CasualMessage::Campaign) { + // TODO: retry may necessay + warn!("region fails to campaign: "; + "region_id" => region_id, + "err" => ?e); + continue; + } else { + info!("region starts to campaign"; + "region_id" => region_id); + } + + let (tx, rx) = sync_channel(1); + let callback = Callback::read(Box::new(move |_| { + if tx.send(1).is_err() { + error!("response failed"; "region_id" => region_id); + } + })); + if let Err(e) = raft_router + .significant_send(region_id, SignificantMsg::LeaderCallback(callback)) + { + warn!("LeaderCallback failed"; "err" => ?e, "region_id" => region_id); + } + rxs.push(Some(rx)); + } + + // leader is campaign and be ensured as leader + for (_rid, rx) in leaders.iter().zip(rxs) { + if let Some(rx) = rx { + match rx.recv() { + Ok(_id) => { + info!("leader is assigned for region"); + } + Err(e) => { + error!("check leader failed"; "error" => ?e); + } + } + } + } + + info!("all region leader assigned done"); + + let now = Instant::now(); + // wait apply to the last log + let mut rx_apply = Vec::with_capacity(leaders.len()); + for ®ion_id in &leaders { + let (tx, rx) = sync_channel(1); + let wait_apply = SnapshotRecoveryWaitApplySyncer::new(region_id, tx.clone()); + if let Err(e) = raft_router.significant_send( + region_id, + 
SignificantMsg::SnapshotRecoveryWaitApply(wait_apply.clone()), + ) { + error!( + "failed to send wait apply"; + "region_id" => region_id, + "err" => ?e, + ); + } + rx_apply.push(Some(rx)); + } + + // leader apply to last log + for (_rid, rx) in leaders.iter().zip(rx_apply) { + if let Some(rx) = rx { + match rx.recv() { + Ok(region_id) => { + info!("leader apply to last log"; "error" => region_id); + } + Err(e) => { + error!("leader failed to apply to last log"; "error" => ?e); + } + } + } + } + + info!( + "all region leader apply to last log"; + "spent_time" => now.elapsed().as_secs(), + ); + + let mut resp = RecoverRegionResponse::default(); + match store_id { + Ok(id) => resp.set_store_id(id), + Err(e) => error!("failed to get store id"; "error" => ?e), + }; + + let _ = sink.success(resp).await; + }; + + self.threads.spawn_ok(task); + } + + // 3. ensure all region peer/follower apply to last + fn wait_apply( + &mut self, + _ctx: RpcContext<'_>, + _req: WaitApplyRequest, + sink: UnarySink, + ) { + let router = self.router.clone(); + info!("wait_apply start"); + let task = async move { + let now = Instant::now(); + let (tx, rx) = sync_channel(1); + RecoveryService::wait_apply_last(router, tx.clone()); + match rx.recv() { + Ok(id) => { + info!("follower apply to last log"; "error" => id); + } + Err(e) => { + error!("follower failed to apply to last log"; "error" => ?e); + } + } + info!( + "all region apply to last log"; + "spent_time" => now.elapsed().as_secs(), + ); + let resp = WaitApplyResponse::default(); + let _ = sink.success(resp).await; + }; + + self.threads.spawn_ok(task); + } + + // 4.resolve kv data to a backup resolved-tss + fn resolve_kv_data( + &mut self, + _ctx: RpcContext<'_>, + req: ResolveKvDataRequest, + mut sink: ServerStreamingSink, + ) { + // implement a resolve/delete data funciton + let resolved_ts = req.get_resolved_ts(); + let (tx, rx) = mpsc::unbounded(); + let resolver = DataResolverManager::new(self.engines.kv.clone(), tx, 
resolved_ts.into()); + info!("start to resolve kv data"); + resolver.start(); + let db = self.engines.kv.clone(); + let store_id = self.get_store_id(); + let send_task = async move { + let id = store_id?; + let mut s = rx.map(|mut resp| { + // TODO: a metric need here + resp.set_store_id(id); + Ok((resp, WriteFlags::default())) + }); + sink.send_all(&mut s).await?; + compact(db.clone())?; + sink.close().await?; + Ok(()) + } + .map(|res: Result<()>| match res { + Ok(_) => { + info!("resolve kv data done"); + } + Err(e) => { + error!("resolve kv data error"; "error" => ?e); + } + }); + + self.threads.spawn_ok(send_task); + } +} diff --git a/components/sst_importer/Cargo.toml b/components/sst_importer/Cargo.toml index 887c9df6655..8e2799b7437 100644 --- a/components/sst_importer/Cargo.toml +++ b/components/sst_importer/Cargo.toml @@ -13,36 +13,38 @@ cloud-storage-grpc = ["external_storage_export/cloud-storage-grpc"] cloud-storage-dylib = ["external_storage_export/cloud-storage-dylib"] [dependencies] -api_version = { path = "../api_version", default-features = false } +api_version = { workspace = true } crc32fast = "1.2" dashmap = "5" -encryption = { path = "../encryption", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -error_code = { path = "../error_code", default-features = false } -external_storage_export = { path = "../external_storage/export", default-features = false } -file_system = { path = "../file_system", default-features = false } +encryption = { workspace = true } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } +error_code = { workspace = true } +external_storage_export = { workspace = true } +file_system = { workspace = true } futures = { version = "0.3", features = ["thread-pool"] } futures-util = { version = "0.3", default-features = false, features = ["io"] } -grpcio = { version = "0.10", 
default-features = false, features = ["openssl-vendored", "protobuf-codec"] } -keys = { path = "../keys", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +grpcio = { workspace = true } +keys = { workspace = true } +kvproto = { workspace = true } lazy_static = "1.3" -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } +online_config = { workspace = true } openssl = "0.10" prometheus = { version = "0.13", default-features = false } +rand = "0.8" serde = "1.0" serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time", "rt-multi-thread", "macros"] } -txn_types = { path = "../txn_types", default-features = false } +txn_types = { workspace = true } uuid = { version = "0.8.1", features = ["serde", "v4"] } [dev-dependencies] tempfile = "3.0" -test_sst_importer = { path = "../test_sst_importer", default-features = false } -test_util = { path = "../test_util", default-features = false } +test_sst_importer = { workspace = true } +test_util = { workspace = true } diff --git a/components/sst_importer/src/caching/cache_map.rs b/components/sst_importer/src/caching/cache_map.rs new file mode 100644 index 00000000000..e88e5c3545d --- /dev/null +++ b/components/sst_importer/src/caching/cache_map.rs @@ -0,0 +1,211 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, + time::Duration, +}; + +use dashmap::{mapref::entry::Entry, DashMap}; +use futures::Future; + +use crate::metrics::EXT_STORAGE_CACHE_COUNT; + +#[derive(Clone, Default)] +pub struct CacheMap(Arc>); + +impl CacheMap { + #[cfg(test)] + pub fn with_inner(inner: CacheMapInner) -> Self { + Self(Arc::new(inner)) + } +} + +pub trait ShareOwned { + type Shared: 'static; + + fn share_owned(&self) -> Self::Shared; +} + +impl ShareOwned for T { + type Shared = T; + + fn share_owned(&self) -> Self::Shared { + *self + } +} + +pub trait MakeCache: 'static { + type Cached: std::fmt::Debug + ShareOwned + Send + Sync + 'static; + type Error; + + fn make_cache(&self) -> std::result::Result; +} + +#[derive(Debug)] +pub struct CacheMapInner { + cached: DashMap>, + now: AtomicUsize, + + gc_threshold: usize, +} + +impl Default for CacheMapInner { + fn default() -> Self { + Self { + cached: DashMap::default(), + now: Default::default(), + gc_threshold: 20, + } + } +} + +impl CacheMapInner { + #[cfg(test)] + pub fn with_gc_threshold(n: usize) -> Self { + Self { + gc_threshold: n, + ..Self::default() + } + } +} + +#[derive(Debug)] +struct Cached { + resource: R, + last_used: usize, +} + +impl Cached { + fn new(resource: R) -> Self { + Self { + resource, + last_used: 0, + } + } + + fn resource_owned(&mut self, now: usize) -> ::Shared { + self.last_used = now; + self.resource.share_owned() + } +} + +impl CacheMapInner { + fn now(&self) -> usize { + self.now.load(Ordering::SeqCst) + } + + fn tick(&self) { + let now = self.now.fetch_add(1usize, Ordering::SeqCst); + self.cached.retain(|name, cache| { + let need_hold = now.saturating_sub(cache.last_used) < self.gc_threshold; + if !need_hold { + info!("Removing cache due to expired."; "name" => %name, "entry" => ?cache); + } + need_hold + }); + } +} + +impl CacheMap { + pub fn gc_loop(&self) -> impl Future + Send + 'static { + let this = Arc::downgrade(&self.0); + async move { 
+ loop { + tokio::time::sleep(Duration::from_secs(30)).await; + match this.upgrade() { + Some(inner) => inner.tick(), + None => return, + } + } + } + } + + pub fn cached_or_create( + &self, + cache_key: &str, + backend: &M, + ) -> std::result::Result<::Shared, M::Error> { + let s = self.0.cached.get_mut(cache_key); + match s { + Some(mut s) => { + EXT_STORAGE_CACHE_COUNT.with_label_values(&["hit"]).inc(); + Ok(s.value_mut().resource_owned(self.0.now())) + } + None => { + drop(s); + let e = self.0.cached.entry(cache_key.to_owned()); + match e { + Entry::Occupied(mut v) => { + EXT_STORAGE_CACHE_COUNT.with_label_values(&["hit"]).inc(); + Ok(v.get_mut().resource_owned(self.0.now())) + } + Entry::Vacant(v) => { + EXT_STORAGE_CACHE_COUNT.with_label_values(&["miss"]).inc(); + let pool = backend.make_cache()?; + info!("Insert storage cache."; "name" => %cache_key, "cached" => ?pool); + let shared = pool.share_owned(); + v.insert(Cached::new(pool)); + Ok(shared) + } + } + } + } + } +} + +#[cfg(test)] +mod tests { + use std::{ + convert::Infallible, + sync::atomic::{AtomicBool, Ordering}, + }; + + use super::{CacheMap, CacheMapInner, MakeCache}; + + #[derive(Default)] + struct CacheChecker(AtomicBool); + + impl MakeCache for CacheChecker { + type Cached = (); + type Error = Infallible; + + fn make_cache(&self) -> std::result::Result { + self.0.store(true, Ordering::SeqCst); + Ok(()) + } + } + + impl CacheChecker { + fn made_cache(&self) -> bool { + self.0.load(Ordering::SeqCst) + } + } + + #[test] + fn test_basic() { + let cached = CacheMapInner::with_gc_threshold(1); + let cached = CacheMap::with_inner(cached); + + let check_cache = |key, should_make_cache: bool| { + let c = CacheChecker::default(); + cached.cached_or_create(key, &c).unwrap(); + assert_eq!(c.made_cache(), should_make_cache); + }; + + check_cache("hello", true); + check_cache("hello", false); + check_cache("world", true); + + cached.0.tick(); + check_cache("hello", false); + + cached.0.tick(); + 
check_cache("world", true); + + cached.0.tick(); + check_cache("hello", true); + } +} diff --git a/components/sst_importer/src/caching/mod.rs b/components/sst_importer/src/caching/mod.rs new file mode 100644 index 00000000000..9e55717c601 --- /dev/null +++ b/components/sst_importer/src/caching/mod.rs @@ -0,0 +1,4 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +pub mod cache_map; +pub mod storage_cache; diff --git a/components/sst_importer/src/caching/storage_cache.rs b/components/sst_importer/src/caching/storage_cache.rs new file mode 100644 index 00000000000..23732545b92 --- /dev/null +++ b/components/sst_importer/src/caching/storage_cache.rs @@ -0,0 +1,58 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use external_storage_export::ExternalStorage; +use kvproto::brpb::StorageBackend; + +use super::cache_map::{MakeCache, ShareOwned}; +use crate::{Error, Result}; + +impl ShareOwned for StoragePool { + type Shared = Arc; + + fn share_owned(&self) -> Self::Shared { + self.get() + } +} + +impl MakeCache for StorageBackend { + type Cached = StoragePool; + type Error = Error; + + fn make_cache(&self) -> Result { + StoragePool::create(self, 16) + } +} + +pub struct StoragePool(Box<[Arc]>); + +impl StoragePool { + fn create(backend: &StorageBackend, size: usize) -> Result { + let mut r = Vec::with_capacity(size); + for _ in 0..size { + let s = external_storage_export::create_storage(backend, Default::default())?; + r.push(Arc::from(s)); + } + Ok(Self(r.into_boxed_slice())) + } + + fn get(&self) -> Arc { + use rand::Rng; + let idx = rand::thread_rng().gen_range(0..self.0.len()); + Arc::clone(&self.0[idx]) + } +} + +impl std::fmt::Debug for StoragePool { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let url = self + .get() + .url() + .map(|u| u.to_string()) + .unwrap_or_else(|_| "".to_owned()); + f.debug_tuple("StoragePool") + .field(&format_args!("{}", url)) + .finish() + } 
+} diff --git a/components/sst_importer/src/config.rs b/components/sst_importer/src/config.rs index a25d34ea24b..7434c5cf0cd 100644 --- a/components/sst_importer/src/config.rs +++ b/components/sst_importer/src/config.rs @@ -1,10 +1,15 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. -use std::{error::Error, result::Result}; +use std::{ + error::Error, + result::Result, + sync::{Arc, RwLock}, +}; -use tikv_util::config::ReadableDuration; +use online_config::{self, OnlineConfig}; +use tikv_util::{config::ReadableDuration, HandyRwLock}; -#[derive(Clone, Serialize, Deserialize, PartialEq, Debug)] +#[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)] #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct Config { @@ -14,6 +19,8 @@ pub struct Config { /// /// Default is 10m. pub import_mode_timeout: ReadableDuration, + /// the ratio of system memory used for import. + pub memory_use_ratio: f64, } impl Default for Config { @@ -22,18 +29,71 @@ impl Default for Config { num_threads: 8, stream_channel_window: 128, import_mode_timeout: ReadableDuration::minutes(10), + memory_use_ratio: 0.3, } } } impl Config { - pub fn validate(&self) -> Result<(), Box> { + pub fn validate(&mut self) -> Result<(), Box> { + let default_cfg = Config::default(); if self.num_threads == 0 { - return Err("import.num_threads can not be 0".into()); + warn!( + "import.num_threads can not be 0, change it to {}", + default_cfg.num_threads + ); + self.num_threads = default_cfg.num_threads; } if self.stream_channel_window == 0 { - return Err("import.stream_channel_window can not be 0".into()); + warn!( + "import.stream_channel_window can not be 0, change it to {}", + default_cfg.stream_channel_window + ); + self.stream_channel_window = default_cfg.stream_channel_window; + } + if self.memory_use_ratio > 0.5 || self.memory_use_ratio < 0.0 { + return Err("import.mem_ratio should belong to [0.0, 0.5].".into()); } Ok(()) } } + +#[derive(Clone)] +pub struct 
ConfigManager(pub Arc>); + +impl ConfigManager { + pub fn new(cfg: Config) -> Self { + ConfigManager(Arc::new(RwLock::new(cfg))) + } +} + +impl online_config::ConfigManager for ConfigManager { + fn dispatch(&mut self, change: online_config::ConfigChange) -> online_config::Result<()> { + info!( + "import config changed"; + "change" => ?change, + ); + + let mut cfg = self.rl().clone(); + cfg.update(change)?; + + if let Err(e) = cfg.validate() { + warn!( + "import config changed"; + "change" => ?cfg, + ); + return Err(e); + } + + *self.wl() = cfg; + Ok(()) + } +} + +impl std::ops::Deref for ConfigManager { + type Target = RwLock; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} diff --git a/components/sst_importer/src/errors.rs b/components/sst_importer/src/errors.rs index 3fc229aa6ee..7ff940fff12 100644 --- a/components/sst_importer/src/errors.rs +++ b/components/sst_importer/src/errors.rs @@ -8,7 +8,7 @@ use encryption::Error as EncryptionError; use error_code::{self, ErrorCode, ErrorCodeExt}; use futures::channel::oneshot::Canceled; use grpcio::Error as GrpcError; -use kvproto::{import_sstpb, kvrpcpb::ApiVersion}; +use kvproto::{errorpb, import_sstpb, kvrpcpb::ApiVersion}; use tikv_util::codec::Error as CodecError; use uuid::Error as UuidError; @@ -19,7 +19,7 @@ pub fn error_inc(type_: &str, err: &Error) { Error::Io(..) => "io", Error::Grpc(..) => "grpc", Error::Uuid(..) => "uuid", - Error::RocksDB(..) => "rocksdb", + Error::RocksDb(..) => "rocksdb", Error::EngineTraits(..) => "engine_traits", Error::ParseIntError(..) => "parse_int", Error::FileExists(..) 
=> "file_exists", @@ -52,7 +52,7 @@ pub enum Error { // FIXME: Remove concrete 'rocks' type #[error("RocksDB {0}")] - RocksDB(String), + RocksDb(String), #[error("Engine {0:?}")] EngineTraits(#[from] engine_traits::Error), @@ -122,6 +122,9 @@ pub enum Error { storage_api_version: ApiVersion, key: String, }, + + #[error("resource is not enough {0}")] + ResourceNotEnough(String), } impl Error { @@ -140,7 +143,7 @@ impl Error { impl From for Error { fn from(msg: String) -> Self { - Self::RocksDB(msg) + Self::RocksDb(msg) } } @@ -149,7 +152,19 @@ pub type Result = result::Result; impl From for import_sstpb::Error { fn from(e: Error) -> import_sstpb::Error { let mut err = import_sstpb::Error::default(); - err.set_message(format!("{}", e)); + match e { + Error::ResourceNotEnough(ref msg) => { + let mut import_err = errorpb::Error::default(); + import_err.set_message(msg.clone()); + import_err.set_server_is_busy(errorpb::ServerIsBusy::default()); + err.set_store_error(import_err); + err.set_message(format!("{}", e)); + } + _ => { + err.set_message(format!("{}", e)); + } + } + err } } @@ -161,7 +176,7 @@ impl ErrorCodeExt for Error { Error::Grpc(_) => error_code::sst_importer::GRPC, Error::Uuid(_) => error_code::sst_importer::UUID, Error::Future(_) => error_code::sst_importer::FUTURE, - Error::RocksDB(_) => error_code::sst_importer::ROCKSDB, + Error::RocksDb(_) => error_code::sst_importer::ROCKSDB, Error::EngineTraits(e) => e.error_code(), Error::ParseIntError(_) => error_code::sst_importer::PARSE_INT_ERROR, Error::FileExists(..) => error_code::sst_importer::FILE_EXISTS, @@ -181,6 +196,7 @@ impl ErrorCodeExt for Error { Error::TtlLenNotEqualsToPairs => error_code::sst_importer::TTL_LEN_NOT_EQUALS_TO_PAIRS, Error::IncompatibleApiVersion => error_code::sst_importer::INCOMPATIBLE_API_VERSION, Error::InvalidKeyMode { .. 
} => error_code::sst_importer::INVALID_KEY_MODE, + Error::ResourceNotEnough(_) => error_code::sst_importer::RESOURCE_NOT_ENOUTH, } } } diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index 7c02b058d1e..84d2f67bbab 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -11,8 +11,11 @@ use std::{ use api_version::api_v2::TIDB_RANGES_COMPLEMENT; use encryption::{DataKeyManager, EncrypterWriter}; use engine_rocks::{get_env, RocksSstReader}; -use engine_traits::{EncryptionKeyManager, Iterable, KvEngine, SstMetaInfo, SstReader}; +use engine_traits::{ + iter_option, EncryptionKeyManager, Iterator, KvEngine, RefIterable, SstMetaInfo, SstReader, +}; use file_system::{get_io_rate_limiter, sync_dir, File, OpenOptions}; +use keys::data_key; use kvproto::{import_sstpb::*, kvrpcpb::ApiVersion}; use tikv_util::time::Instant; use uuid::{Builder as UuidBuilder, Uuid}; @@ -50,7 +53,6 @@ pub struct ImportPath { impl ImportPath { // move file from temp to save. 
pub fn save(mut self, key_manager: Option<&DataKeyManager>) -> Result<()> { - file_system::rename(&self.temp, &self.save)?; if let Some(key_manager) = key_manager { let temp_str = self .temp @@ -61,7 +63,15 @@ impl ImportPath { .to_str() .ok_or_else(|| Error::InvalidSstPath(self.save.clone()))?; key_manager.link_file(temp_str, save_str)?; - key_manager.delete_file(temp_str)?; + let r = file_system::rename(&self.temp, &self.save); + let del_file = if r.is_ok() { temp_str } else { save_str }; + if let Err(e) = key_manager.delete_file(del_file) { + warn!("fail to remove encryption metadata during 'save'"; + "file" => ?self, "err" => ?e); + } + r?; + } else { + file_system::rename(&self.temp, &self.save)?; } // sync the directory after rename self.save.pop(); @@ -137,12 +147,19 @@ impl ImportFile { "finalize SST write cache", )); } - file_system::rename(&self.path.temp, &self.path.save)?; if let Some(ref manager) = self.key_manager { let tmp_str = self.path.temp.to_str().unwrap(); let save_str = self.path.save.to_str().unwrap(); manager.link_file(tmp_str, save_str)?; - manager.delete_file(self.path.temp.to_str().unwrap())?; + let r = file_system::rename(&self.path.temp, &self.path.save); + let del_file = if r.is_ok() { tmp_str } else { save_str }; + if let Err(e) = manager.delete_file(del_file) { + warn!("fail to remove encryption metadata during finishing importing files."; + "err" => ?e); + } + r?; + } else { + file_system::rename(&self.path.temp, &self.path.save)?; } Ok(()) } @@ -167,6 +184,10 @@ impl ImportFile { } Ok(()) } + + pub fn get_import_path(&self) -> &ImportPath { + &self.path + } } impl fmt::Debug for ImportFile { @@ -227,9 +248,9 @@ impl ImportDir { /// Make an import path base on the basic path and the file name. 
pub fn get_import_path(&self, file_name: &str) -> Result { - let save_path = self.root_dir.join(&file_name); - let temp_path = self.temp_dir.join(&file_name); - let clone_path = self.clone_dir.join(&file_name); + let save_path = self.root_dir.join(file_name); + let temp_path = self.temp_dir.join(file_name); + let clone_path = self.clone_dir.join(file_name); Ok(ImportPath { save: save_path, temp: temp_path, @@ -256,7 +277,7 @@ impl ImportDir { pub fn delete_file(&self, path: &Path, key_manager: Option<&DataKeyManager>) -> Result<()> { if path.exists() { - file_system::remove_file(&path)?; + file_system::remove_file(path)?; if let Some(manager) = key_manager { manager.delete_file(path.to_str().unwrap())?; } @@ -302,7 +323,8 @@ impl ImportDir { for meta in metas { match (api_version, meta.api_version) { (cur_version, meta_version) if cur_version == meta_version => continue, - // sometimes client do not know whether ttl is enabled, so a general V1 is accepted as V1ttl + // sometimes client do not know whether ttl is enabled, so a general V1 is accepted + // as V1ttl (ApiVersion::V1ttl, ApiVersion::V1) => continue, // import V1ttl as V1 will immediatly be rejected because it is never correct. (ApiVersion::V1, ApiVersion::V1ttl) => return Ok(false), @@ -315,18 +337,14 @@ impl ImportDir { let sst_reader = RocksSstReader::open_with_env(path_str, Some(env))?; for &(start, end) in TIDB_RANGES_COMPLEMENT { - let mut unexpected_data_key = None; - sst_reader.scan(start, end, false, |key, _| { - unexpected_data_key = Some(key.to_vec()); - Ok(false) - })?; - - if let Some(unexpected_data_key) = unexpected_data_key { + let opt = iter_option(&data_key(start), &data_key(end), false); + let mut iter = sst_reader.iter(opt)?; + if iter.seek(start)? 
{ error!( "unable to import: switch api version with non-tidb key"; "sst" => ?meta.api_version, "current" => ?api_version, - "key" => ?log_wrappers::hex_encode_upper(&unexpected_data_key) + "key" => ?log_wrappers::hex_encode_upper(iter.key()) ); return Ok(false); } @@ -450,8 +468,9 @@ pub fn path_to_sst_meta>(path: P) -> Result { meta.mut_region_epoch().set_conf_ver(elems[2].parse()?); meta.mut_region_epoch().set_version(elems[3].parse()?); if elems.len() > 4 { - // If we upgrade TiKV from 3.0.x to 4.0.x and higher version, we can not read cf_name from - // the file path, because TiKV 3.0.x does not encode cf_name to path. + // If we upgrade TiKV from 3.0.x to 4.0.x and higher version, we can not read + // cf_name from the file path, because TiKV 3.0.x does not encode + // cf_name to path. meta.set_cf_name(elems[4].to_owned()); } Ok(meta) @@ -497,7 +516,7 @@ mod test { meta.get_region_epoch().get_version(), SST_SUFFIX, )); - let new_meta = path_to_sst_meta(&path).unwrap(); + let new_meta = path_to_sst_meta(path).unwrap(); assert_eq!(meta, new_meta); } } diff --git a/components/sst_importer/src/import_mode.rs b/components/sst_importer/src/import_mode.rs index 3123ed66da5..5f5b5d1060e 100644 --- a/components/sst_importer/src/import_mode.rs +++ b/components/sst_importer/src/import_mode.rs @@ -8,27 +8,27 @@ use std::{ time::{Duration, Instant}, }; -use engine_traits::{ColumnFamilyOptions, DBOptions, KvEngine}; -use futures::executor::ThreadPool; +use engine_traits::{CfOptions, DbOptions, KvEngine}; use futures_util::compat::Future01CompatExt; use kvproto::import_sstpb::*; use tikv_util::timer::GLOBAL_TIMER_HANDLE; +use tokio::runtime::Handle; use super::{Config, Result}; -pub type RocksDBMetricsFn = fn(cf: &str, name: &str, v: f64); +pub type RocksDbMetricsFn = fn(cf: &str, name: &str, v: f64); struct ImportModeSwitcherInner { is_import: Arc, - backup_db_options: ImportModeDBOptions, - backup_cf_options: Vec<(String, ImportModeCFOptions)>, + backup_db_options: 
ImportModeDbOptions, + backup_cf_options: Vec<(String, ImportModeCfOptions)>, timeout: Duration, next_check: Instant, - metrics_fn: RocksDBMetricsFn, + metrics_fn: RocksDbMetricsFn, } impl ImportModeSwitcherInner { - fn enter_normal_mode(&mut self, db: &E, mf: RocksDBMetricsFn) -> Result { + fn enter_normal_mode(&mut self, db: &E, mf: RocksDbMetricsFn) -> Result { if !self.is_import.load(Ordering::Acquire) { return Ok(false); } @@ -43,18 +43,18 @@ impl ImportModeSwitcherInner { Ok(true) } - fn enter_import_mode(&mut self, db: &E, mf: RocksDBMetricsFn) -> Result { + fn enter_import_mode(&mut self, db: &E, mf: RocksDbMetricsFn) -> Result { if self.is_import.load(Ordering::Acquire) { return Ok(false); } - self.backup_db_options = ImportModeDBOptions::new_options(db); + self.backup_db_options = ImportModeDbOptions::new_options(db); self.backup_cf_options.clear(); let import_db_options = self.backup_db_options.optimized_for_import_mode(); import_db_options.set_options(db)?; for cf_name in db.cf_names() { - let cf_opts = ImportModeCFOptions::new_options(db, cf_name); + let cf_opts = ImportModeCfOptions::new_options(db, cf_name); let import_cf_options = cf_opts.optimized_for_import_mode(); self.backup_cf_options.push((cf_name.to_owned(), cf_opts)); import_cf_options.set_options(db, cf_name, mf)?; @@ -79,7 +79,7 @@ impl ImportModeSwitcher { let is_import = Arc::new(AtomicBool::new(false)); let inner = Arc::new(Mutex::new(ImportModeSwitcherInner { is_import: is_import.clone(), - backup_db_options: ImportModeDBOptions::new(), + backup_db_options: ImportModeDbOptions::new(), backup_cf_options: Vec::new(), timeout, next_check: Instant::now() + timeout, @@ -88,7 +88,7 @@ impl ImportModeSwitcher { ImportModeSwitcher { inner, is_import } } - pub fn start(&self, executor: &ThreadPool, db: E) { + pub fn start(&self, executor: &Handle, db: E) { // spawn a background future to put TiKV back into normal mode after timeout let inner = self.inner.clone(); let switcher = 
Arc::downgrade(&inner); @@ -117,17 +117,17 @@ impl ImportModeSwitcher { } } }; - executor.spawn_ok(timer_loop); + executor.spawn(timer_loop); } - pub fn enter_normal_mode(&self, db: &E, mf: RocksDBMetricsFn) -> Result { + pub fn enter_normal_mode(&self, db: &E, mf: RocksDbMetricsFn) -> Result { if !self.is_import.load(Ordering::Acquire) { return Ok(false); } self.inner.lock().unwrap().enter_normal_mode(db, mf) } - pub fn enter_import_mode(&self, db: &E, mf: RocksDBMetricsFn) -> Result { + pub fn enter_import_mode(&self, db: &E, mf: RocksDbMetricsFn) -> Result { let mut inner = self.inner.lock().unwrap(); let ret = inner.enter_import_mode(db, mf)?; inner.next_check = Instant::now() + inner.timeout; @@ -144,11 +144,11 @@ impl ImportModeSwitcher { } } -struct ImportModeDBOptions { +struct ImportModeDbOptions { max_background_jobs: i32, } -impl ImportModeDBOptions { +impl ImportModeDbOptions { fn new() -> Self { Self { max_background_jobs: 32, @@ -161,9 +161,9 @@ impl ImportModeDBOptions { } } - fn new_options(db: &impl KvEngine) -> ImportModeDBOptions { + fn new_options(db: &impl KvEngine) -> ImportModeDbOptions { let db_opts = db.get_db_options(); - ImportModeDBOptions { + ImportModeDbOptions { max_background_jobs: db_opts.get_max_background_jobs(), } } @@ -179,14 +179,14 @@ impl ImportModeDBOptions { } } -struct ImportModeCFOptions { - level0_stop_writes_trigger: u32, - level0_slowdown_writes_trigger: u32, +struct ImportModeCfOptions { + level0_stop_writes_trigger: i32, + level0_slowdown_writes_trigger: i32, soft_pending_compaction_bytes_limit: u64, hard_pending_compaction_bytes_limit: u64, } -impl ImportModeCFOptions { +impl ImportModeCfOptions { fn optimized_for_import_mode(&self) -> Self { Self { level0_stop_writes_trigger: self.level0_stop_writes_trigger.max(1 << 30), @@ -196,10 +196,10 @@ impl ImportModeCFOptions { } } - fn new_options(db: &impl KvEngine, cf_name: &str) -> ImportModeCFOptions { + fn new_options(db: &impl KvEngine, cf_name: &str) -> 
ImportModeCfOptions { let cf_opts = db.get_options_cf(cf_name).unwrap(); //FIXME unwrap - ImportModeCFOptions { + ImportModeCfOptions { level0_stop_writes_trigger: cf_opts.get_level_zero_stop_writes_trigger(), level0_slowdown_writes_trigger: cf_opts.get_level_zero_slowdown_writes_trigger(), soft_pending_compaction_bytes_limit: cf_opts.get_soft_pending_compaction_bytes_limit(), @@ -207,7 +207,7 @@ impl ImportModeCFOptions { } } - fn set_options(&self, db: &impl KvEngine, cf_name: &str, mf: RocksDBMetricsFn) -> Result<()> { + fn set_options(&self, db: &impl KvEngine, cf_name: &str, mf: RocksDbMetricsFn) -> Result<()> { let opts = [ ( "level0_stop_writes_trigger".to_owned(), @@ -242,8 +242,7 @@ impl ImportModeCFOptions { mod tests { use std::thread; - use engine_traits::KvEngine; - use futures::executor::ThreadPoolBuilder; + use engine_traits::{KvEngine, CF_DEFAULT}; use tempfile::Builder; use test_sst_importer::{new_test_engine, new_test_engine_with_options}; use tikv_util::config::ReadableDuration; @@ -252,8 +251,8 @@ mod tests { fn check_import_options( db: &E, - expected_db_opts: &ImportModeDBOptions, - expected_cf_opts: &ImportModeCFOptions, + expected_db_opts: &ImportModeDbOptions, + expected_cf_opts: &ImportModeCfOptions, ) where E: KvEngine, { @@ -290,11 +289,11 @@ mod tests { .prefix("test_import_mode_switcher") .tempdir() .unwrap(); - let db = new_test_engine(temp_dir.path().to_str().unwrap(), &["a", "b"]); + let db = new_test_engine(temp_dir.path().to_str().unwrap(), &[CF_DEFAULT, "a", "b"]); - let normal_db_options = ImportModeDBOptions::new_options(&db); + let normal_db_options = ImportModeDbOptions::new_options(&db); let import_db_options = normal_db_options.optimized_for_import_mode(); - let normal_cf_options = ImportModeCFOptions::new_options(&db, "default"); + let normal_cf_options = ImportModeCfOptions::new_options(&db, "default"); let import_cf_options = normal_cf_options.optimized_for_import_mode(); assert!( @@ -306,14 +305,13 @@ mod tests { fn 
mf(_cf: &str, _name: &str, _v: f64) {} let cfg = Config::default(); - let threads = ThreadPoolBuilder::new() - .pool_size(cfg.num_threads) - .name_prefix("sst-importer") - .create() + let threads = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() .unwrap(); let switcher = ImportModeSwitcher::new(&cfg); - switcher.start(&threads, db.clone()); + switcher.start(threads.handle(), db.clone()); check_import_options(&db, &normal_db_options, &normal_cf_options); assert!(switcher.enter_import_mode(&db, mf).unwrap()); check_import_options(&db, &import_db_options, &import_cf_options); @@ -331,11 +329,11 @@ mod tests { .prefix("test_import_mode_timeout") .tempdir() .unwrap(); - let db = new_test_engine(temp_dir.path().to_str().unwrap(), &["a", "b"]); + let db = new_test_engine(temp_dir.path().to_str().unwrap(), &[CF_DEFAULT, "a", "b"]); - let normal_db_options = ImportModeDBOptions::new_options(&db); + let normal_db_options = ImportModeDbOptions::new_options(&db); let import_db_options = normal_db_options.optimized_for_import_mode(); - let normal_cf_options = ImportModeCFOptions::new_options(&db, "default"); + let normal_cf_options = ImportModeCfOptions::new_options(&db, "default"); let import_cf_options = normal_cf_options.optimized_for_import_mode(); fn mf(_cf: &str, _name: &str, _v: f64) {} @@ -344,19 +342,20 @@ mod tests { import_mode_timeout: ReadableDuration::millis(300), ..Config::default() }; - let threads = ThreadPoolBuilder::new() - .pool_size(cfg.num_threads) - .name_prefix("sst-importer") - .create() + + let threads = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() .unwrap(); let switcher = ImportModeSwitcher::new(&cfg); - switcher.start(&threads, db.clone()); + switcher.start(threads.handle(), db.clone()); check_import_options(&db, &normal_db_options, &normal_cf_options); switcher.enter_import_mode(&db, mf).unwrap(); check_import_options(&db, &import_db_options, &import_cf_options); 
thread::sleep(Duration::from_secs(1)); + threads.block_on(tokio::task::yield_now()); check_import_options(&db, &normal_db_options, &normal_cf_options); } @@ -374,7 +373,7 @@ mod tests { |_, opt| opt.set_level_zero_stop_writes_trigger(2_000_000_000), ); - let normal_cf_options = ImportModeCFOptions::new_options(&db, "default"); + let normal_cf_options = ImportModeCfOptions::new_options(&db, "default"); assert_eq!(normal_cf_options.level0_stop_writes_trigger, 2_000_000_000); let import_cf_options = normal_cf_options.optimized_for_import_mode(); assert_eq!(import_cf_options.level0_stop_writes_trigger, 2_000_000_000); diff --git a/components/sst_importer/src/lib.rs b/components/sst_importer/src/lib.rs index ec0222d416a..e073ff941ae 100644 --- a/components/sst_importer/src/lib.rs +++ b/components/sst_importer/src/lib.rs @@ -19,11 +19,12 @@ mod sst_writer; mod util; #[macro_use] pub mod import_mode; +mod caching; pub mod metrics; pub mod sst_importer; pub use self::{ - config::Config, + config::{Config, ConfigManager}, errors::{error_inc, Error, Result}, import_file::sst_meta_to_path, sst_importer::SstImporter, diff --git a/components/sst_importer/src/metrics.rs b/components/sst_importer/src/metrics.rs index 08f095078d5..2737d592fc0 100644 --- a/components/sst_importer/src/metrics.rs +++ b/components/sst_importer/src/metrics.rs @@ -55,7 +55,12 @@ lazy_static! { pub static ref IMPORTER_DOWNLOAD_BYTES: Histogram = register_histogram!( "tikv_import_download_bytes", "Bucketed histogram of importer download bytes", - exponential_buckets(1024.0, 2.0, 20).unwrap() + exponential_buckets(16.0, 2.0, 20).unwrap() + ).unwrap(); + pub static ref IMPORTER_APPLY_BYTES: Histogram = register_histogram!( + "tikv_import_apply_bytes", + "Bucketed histogram of importer apply bytes", + exponential_buckets(16.0, 2.0, 20).unwrap() ) .unwrap(); pub static ref IMPORTER_INGEST_DURATION: HistogramVec = register_histogram_vec!( @@ -96,4 +101,30 @@ lazy_static! 
{ "Bucketed histogram of importer apply count", &["type"] ).unwrap(); + pub static ref EXT_STORAGE_CACHE_COUNT: IntCounterVec = register_int_counter_vec!( + "tikv_import_storage_cache", + "The operations over storage cache", + &["operation"] + ).unwrap(); + + pub static ref CACHED_FILE_IN_MEM: IntGauge = register_int_gauge!( + "tikv_import_apply_cached_bytes", + "The files cached by the apply requests of importer." + ).unwrap(); + pub static ref CACHE_EVENT: IntCounterVec = register_int_counter_vec!( + "tikv_import_apply_cache_event", + "The events of caching. event = {add, remove, out-of-quota, hit}", + &["type"] + ).unwrap(); + pub static ref APPLIER_EVENT: IntCounterVec = register_int_counter_vec!( + "tikv_import_applier_event", + "The events of applier event.", + &["type"] + ).unwrap(); + pub static ref APPLIER_ENGINE_REQUEST_DURATION: HistogramVec = register_histogram_vec!( + "tikv_import_engine_request", + "The request lifetime track of requesting the RaftKv.", + &["type"], + exponential_buckets(0.01, 4.0, 8).unwrap() + ).unwrap(); } diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index dc92c405480..907874c6928 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -4,41 +4,150 @@ use std::{ borrow::Cow, collections::HashMap, fs::File, - io::{prelude::*, BufReader}, + io::{self, BufReader, Read}, ops::Bound, path::{Path, PathBuf}, - sync::Arc, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::Duration, }; -use dashmap::DashMap; -use encryption::{encryption_method_to_db_encryption_method, DataKeyManager}; +use dashmap::{mapref::entry::Entry, DashMap}; +use encryption::{to_engine_encryption_method, DataKeyManager}; use engine_rocks::{get_env, RocksSstReader}; use engine_traits::{ name_to_cf, util::check_key_in_range, CfName, EncryptionKeyManager, FileEncryptionInfo, - Iterator, KvEngine, SeekKey, SstCompressionType, SstExt, SstMetaInfo, 
SstReader, SstWriter, - SstWriterBuilder, CF_DEFAULT, CF_WRITE, + IterOptions, Iterator, KvEngine, RefIterable, SstCompressionType, SstExt, SstMetaInfo, + SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, CF_WRITE, }; -use file_system::{get_io_rate_limiter, OpenOptions}; -use futures::executor::ThreadPool; +use external_storage_export::{ + compression_reader_dispatcher, encrypt_wrap_reader, ExternalStorage, RestoreConfig, +}; +use file_system::{get_io_rate_limiter, IoType, OpenOptions}; use kvproto::{ brpb::{CipherInfo, StorageBackend}, import_sstpb::*, kvrpcpb::ApiVersion, }; use tikv_util::{ - codec::stream_event::{EventIterator, Iterator as EIterator}, + codec::{ + bytes::{decode_bytes_in_place, encode_bytes}, + stream_event::{EventEncoder, EventIterator, Iterator as EIterator}, + }, + sys::{thread::ThreadBuildWrapper, SysQuota}, time::{Instant, Limiter}, + HandyRwLock, +}; +use tokio::{ + runtime::{Handle, Runtime}, + sync::OnceCell, }; use txn_types::{Key, TimeStamp, WriteRef}; use crate::{ + caching::cache_map::{CacheMap, ShareOwned}, import_file::{ImportDir, ImportFile}, - import_mode::{ImportModeSwitcher, RocksDBMetricsFn}, + import_mode::{ImportModeSwitcher, RocksDbMetricsFn}, metrics::*, sst_writer::{RawSstWriter, TxnSstWriter}, - Config, Error, Result, + util, Config, ConfigManager as ImportConfigManager, Error, Result, }; +pub struct LoadedFile { + permit: MemUsePermit, + content: Arc<[u8]>, +} + +impl std::fmt::Debug for LoadedFile { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("LoadedFileInner") + .field("permit", &self.permit) + .field("content.len()", &self.content.len()) + .finish() + } +} + +impl ShareOwned for LoadedFile { + type Shared = Arc<[u8]>; + + fn share_owned(&self) -> Self::Shared { + Arc::clone(&self.content) + } +} + +#[derive(Default, Debug, Clone)] +pub struct DownloadExt<'a> { + cache_key: Option<&'a str>, + req_type: DownloadRequestType, +} + +impl<'a> DownloadExt<'a> { + pub fn 
cache_key(mut self, key: &'a str) -> Self { + self.cache_key = Some(key); + self + } + + pub fn req_type(mut self, req_type: DownloadRequestType) -> Self { + self.req_type = req_type; + self + } +} + +#[derive(Debug)] +struct MemUsePermit { + amount: u64, + statistic: Arc, +} + +impl Drop for MemUsePermit { + fn drop(&mut self) { + self.statistic.fetch_sub(self.amount, Ordering::SeqCst); + } +} + +#[derive(Clone, Debug)] +pub enum CacheKvFile { + Mem(Arc>), + Fs(Arc), +} + +/// returns a error indices that we are going to panic in a invalid state. +/// (Rust panic information cannot be send to BR, hence client cannot know +/// what happens, so we pack it into a `Result`.) +fn bug(message: impl std::fmt::Display) -> Error { + Error::Io(std::io::Error::new( + std::io::ErrorKind::Other, + format!("BUG in TiKV: {}", message), + )) +} + +impl CacheKvFile { + // get the ref count of item. + pub fn ref_count(&self) -> usize { + match self { + CacheKvFile::Mem(buff) => { + if let Some(a) = buff.get() { + return Arc::strong_count(&a.content); + } + Arc::strong_count(buff) + } + CacheKvFile::Fs(path) => Arc::strong_count(path), + } + } + + // check the item is expired. + pub fn is_expired(&self, start: &Instant) -> bool { + match self { + // The expired duration for memory is 60s. + CacheKvFile::Mem(_) => start.saturating_elapsed() >= Duration::from_secs(60), + // The expired duration for local file is 10min. + CacheKvFile::Fs(_) => start.saturating_elapsed() >= Duration::from_secs(600), + } + } +} + /// SstImporter manages SST files that are waiting for ingesting. pub struct SstImporter { dir: ImportDir, @@ -47,7 +156,13 @@ pub struct SstImporter { // TODO: lift api_version as a type parameter. api_version: ApiVersion, compression_types: HashMap, - file_locks: Arc>, + + cached_storage: CacheMap, + // We need to keep reference to the runtime so background tasks won't be dropped. 
+ _download_rt: Runtime, + file_locks: Arc>, + mem_use: Arc, + mem_limit: Arc, } impl SstImporter { @@ -58,6 +173,32 @@ impl SstImporter { api_version: ApiVersion, ) -> Result { let switcher = ImportModeSwitcher::new(cfg); + let cached_storage = CacheMap::default(); + // We are going to run some background tasks here, (hyper needs to maintain the + // connection, the cache map needs gc intervally.) so we must create a + // multi-thread runtime, given there isn't blocking, a single thread runtime is + // enough. + let download_rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(1) + .thread_name("sst_import_misc") + .after_start_wrapper(|| { + tikv_alloc::add_thread_memory_accessor(); + file_system::set_io_type(IoType::Import); + }) + .before_stop_wrapper(|| { + tikv_alloc::remove_thread_memory_accessor(); + }) + .enable_all() + .build()?; + download_rt.spawn(cached_storage.gc_loop()); + + let memory_limit = Self::calcualte_usage_mem(cfg.memory_use_ratio); + info!( + "sst importer memory limit when apply"; + "ratio" => cfg.memory_use_ratio, + "size" => ?memory_limit, + ); + Ok(SstImporter { dir: ImportDir::new(root)?, key_manager, @@ -65,9 +206,17 @@ impl SstImporter { api_version, compression_types: HashMap::with_capacity(2), file_locks: Arc::new(DashMap::default()), + cached_storage, + _download_rt: download_rt, + mem_use: Arc::new(AtomicU64::new(0)), + mem_limit: Arc::new(AtomicU64::new(memory_limit)), }) } + fn calcualte_usage_mem(mem_ratio: f64) -> u64 { + ((SysQuota::memory_limit_in_bytes() as f64) * mem_ratio) as u64 + } + pub fn set_compression_type( &mut self, cf_name: CfName, @@ -80,7 +229,7 @@ impl SstImporter { } } - pub fn start_switch_mode_check(&self, executor: &ThreadPool, db: E) { + pub fn start_switch_mode_check(&self, executor: &Handle, db: E) { self.switcher.start(executor, db); } @@ -174,7 +323,7 @@ impl SstImporter { // // This method returns the *inclusive* key range (`[start, end]`) of SST // file created, or returns None if 
the SST is empty. - pub fn download( + pub async fn download_ext( &self, meta: &SstMeta, backend: &StorageBackend, @@ -183,6 +332,7 @@ impl SstImporter { crypter: Option, speed_limiter: Limiter, engine: E, + ext: DownloadExt<'_>, ) -> Result> { debug!("download start"; "meta" => ?meta, @@ -191,7 +341,7 @@ impl SstImporter { "rewrite_rule" => ?rewrite_rule, "speed_limit" => speed_limiter.speed_limit(), ); - match self.do_download::( + let r = self.do_download_ext::( meta, backend, name, @@ -199,7 +349,9 @@ impl SstImporter { crypter, &speed_limiter, engine, - ) { + ext, + ); + match r.await { Ok(r) => { info!("download"; "meta" => ?meta, "name" => name, "range" => ?r); Ok(r) @@ -211,11 +363,11 @@ impl SstImporter { } } - pub fn enter_normal_mode(&self, db: E, mf: RocksDBMetricsFn) -> Result { + pub fn enter_normal_mode(&self, db: E, mf: RocksDbMetricsFn) -> Result { self.switcher.enter_normal_mode(&db, mf) } - pub fn enter_import_mode(&self, db: E, mf: RocksDBMetricsFn) -> Result { + pub fn enter_import_mode(&self, db: E, mf: RocksDbMetricsFn) -> Result { self.switcher.enter_import_mode(&db, mf) } @@ -223,43 +375,86 @@ impl SstImporter { self.switcher.get_mode() } + #[cfg(test)] fn download_file_from_external_storage( &self, file_length: u64, src_file_name: &str, dst_file: std::path::PathBuf, backend: &StorageBackend, - expect_sha256: Option>, - file_crypter: Option, + support_kms: bool, speed_limiter: &Limiter, + restore_config: external_storage_export::RestoreConfig, ) -> Result<()> { - let start_read = Instant::now(); + self._download_rt + .block_on(self.async_download_file_from_external_storage( + file_length, + src_file_name, + dst_file, + backend, + support_kms, + speed_limiter, + "", + restore_config, + )) + } + + /// Create an external storage by the backend, and cache it with the key. + /// If the cache exists, return it directly. 
+ pub fn external_storage_or_cache( + &self, + backend: &StorageBackend, + cache_id: &str, + ) -> Result> { // prepare to download the file from the external_storage // TODO: pass a config to support hdfs - let ext_storage = external_storage_export::create_storage(backend, Default::default())?; - let url = ext_storage.url()?.to_string(); + let ext_storage = if cache_id.is_empty() { + EXT_STORAGE_CACHE_COUNT.with_label_values(&["skip"]).inc(); + let s = external_storage_export::create_storage(backend, Default::default())?; + Arc::from(s) + } else { + self.cached_storage.cached_or_create(cache_id, backend)? + }; + Ok(ext_storage) + } - let ext_storage: Box = - if let Some(key_manager) = &self.key_manager { - Box::new(external_storage_export::EncryptedExternalStorage { - key_manager: (*key_manager).clone(), - storage: ext_storage, - }) as _ - } else { - ext_storage as _ - }; + async fn async_download_file_from_external_storage( + &self, + file_length: u64, + src_file_name: &str, + dst_file: std::path::PathBuf, + backend: &StorageBackend, + support_kms: bool, + speed_limiter: &Limiter, + cache_key: &str, + restore_config: external_storage_export::RestoreConfig, + ) -> Result<()> { + let start_read = Instant::now(); + if let Some(p) = dst_file.parent() { + file_system::create_dir_all(p).or_else(|e| { + if e.kind() == io::ErrorKind::AlreadyExists { + Ok(()) + } else { + Err(e) + } + })?; + } - let result = ext_storage.restore( - src_file_name, - dst_file.clone(), - file_length, - expect_sha256, - speed_limiter, - file_crypter, - ); + let ext_storage = self.external_storage_or_cache(backend, cache_key)?; + let ext_storage = self.wrap_kms(ext_storage, support_kms); + + let result = ext_storage + .restore( + src_file_name, + dst_file.clone(), + file_length, + speed_limiter, + restore_config, + ) + .await; IMPORTER_DOWNLOAD_BYTES.observe(file_length as _); result.map_err(|e| Error::CannotReadExternalStorage { - url: url.to_string(), + url: util::url_for(&ext_storage), name: 
src_file_name.to_owned(), local_path: dst_file.clone(), err: e, @@ -276,19 +471,323 @@ impl SstImporter { debug!("downloaded file succeed"; "name" => src_file_name, - "url" => %url, + "url" => %util::url_for(&ext_storage), ); Ok(()) } - pub fn do_download_kv_file( + pub fn update_config_memory_use_ratio(&self, cfg_mgr: &ImportConfigManager) { + let mem_ratio = cfg_mgr.rl().memory_use_ratio; + let memory_limit = Self::calcualte_usage_mem(mem_ratio); + + if self.mem_limit.load(Ordering::SeqCst) != memory_limit { + self.mem_limit.store(memory_limit, Ordering::SeqCst); + info!("update importer config"; + "memory-use-ratio" => mem_ratio, + "size" => memory_limit, + ) + } + } + + pub fn shrink_by_tick(&self) -> usize { + let mut shrink_buff_size: usize = 0; + let mut retain_buff_size: usize = 0; + let mut shrink_files: Vec = Vec::default(); + let mut retain_file_count = 0_usize; + + self.file_locks.retain(|_, (c, start)| { + let mut need_retain = true; + match c { + CacheKvFile::Mem(buff) => { + let buflen = buff.get().map(|v| v.content.len()).unwrap_or_default(); + // The term of recycle memeory is 60s. + if c.ref_count() == 1 && c.is_expired(start) { + CACHE_EVENT.with_label_values(&["remove"]).inc(); + need_retain = false; + shrink_buff_size += buflen; + } else { + retain_buff_size += buflen; + } + } + CacheKvFile::Fs(path) => { + let p = path.to_path_buf(); + // The term of recycle file is 10min. 
+ if c.ref_count() == 1 && c.is_expired(start) { + need_retain = false; + shrink_files.push(p); + } else { + retain_file_count += 1; + } + } + } + + need_retain + }); + + CACHED_FILE_IN_MEM.set(self.mem_use.load(Ordering::SeqCst) as _); + + if self.import_support_download() { + let shrink_file_count = shrink_files.len(); + if shrink_file_count > 0 || retain_file_count > 0 { + info!("shrink space by tick"; "shrink files count" => shrink_file_count, "retain files count" => retain_file_count); + } + + for f in shrink_files { + if let Err(e) = file_system::remove_file(&f) { + info!("failed to remove file"; "filename" => ?f, "error" => ?e); + } + } + shrink_file_count + } else { + if shrink_buff_size > 0 || retain_buff_size > 0 { + info!("shrink cache by tick"; "shrink size" => shrink_buff_size, "retain size" => retain_buff_size); + } + shrink_buff_size + } + } + + // If mem_limit is 0, which represent download kv-file when import. + // Or read kv-file into buffer directly. + pub fn import_support_download(&self) -> bool { + self.mem_limit.load(Ordering::SeqCst) == 0 + } + + fn request_memory(&self, meta: &KvMeta) -> Option { + let size = meta.get_length(); + let old = self.mem_use.fetch_add(size, Ordering::SeqCst); + + // If the memory is limited, roll backup the mem_use and return false. 
+ if old + size > self.mem_limit.load(Ordering::SeqCst) { + self.mem_use.fetch_sub(size, Ordering::SeqCst); + CACHE_EVENT.with_label_values(&["out-of-quota"]).inc(); + None + } else { + CACHE_EVENT.with_label_values(&["add"]).inc(); + Some(MemUsePermit { + amount: size, + statistic: Arc::clone(&self.mem_use), + }) + } + } + + async fn exec_download( + &self, + meta: &KvMeta, + rewrite_rule: &RewriteRule, + ext_storage: Arc, + speed_limiter: &Limiter, + ) -> Result { + let start = Instant::now(); + let permit = self + .request_memory(meta) + .ok_or_else(|| Error::ResourceNotEnough(String::from("memory is limited")))?; + + let expected_sha256 = { + let sha256 = meta.get_sha256().to_vec(); + if !sha256.is_empty() { + Some(sha256) + } else { + None + } + }; + let file_length = meta.get_length(); + let range = { + let range_length = meta.get_range_length(); + if range_length == 0 { + None + } else { + Some((meta.get_range_offset(), range_length)) + } + }; + let restore_config = external_storage_export::RestoreConfig { + range, + compression_type: Some(meta.get_compression_type()), + expected_sha256, + file_crypter: None, + }; + + let buff = self + .read_kv_files_from_external_storage( + file_length, + meta.get_name(), + ext_storage, + speed_limiter, + restore_config, + ) + .await?; + + IMPORTER_DOWNLOAD_BYTES.observe(file_length as _); + IMPORTER_APPLY_DURATION + .with_label_values(&["exec_download"]) + .observe(start.saturating_elapsed().as_secs_f64()); + + let rewrite_buff = self.rewrite_kv_file(buff, rewrite_rule)?; + Ok(LoadedFile { + content: Arc::from(rewrite_buff.into_boxed_slice()), + permit, + }) + } + + pub async fn do_read_kv_file( + &self, + meta: &KvMeta, + rewrite_rule: &RewriteRule, + ext_storage: Arc, + speed_limiter: &Limiter, + ) -> Result { + let start = Instant::now(); + let dst_name = format!("{}_{}", meta.get_name(), meta.get_range_offset()); + + let cache = { + let lock = self.file_locks.entry(dst_name); + IMPORTER_APPLY_DURATION + 
.with_label_values(&["download-get-lock"]) + .observe(start.saturating_elapsed().as_secs_f64()); + + match lock { + Entry::Occupied(mut ent) => match ent.get_mut() { + (CacheKvFile::Mem(buff), last_used) => { + *last_used = Instant::now(); + Arc::clone(buff) + } + _ => { + return Err(bug(concat!( + "using both read-to-memory and download-to-file is unacceptable for now.", + "(If you think it is possible in the future you are reading this, ", + "please change this line to `return item.get.0.clone()`)", + "(Please also check the state transform is OK too.)", + ))); + } + }, + Entry::Vacant(ent) => { + let cache = Arc::new(OnceCell::new()); + ent.insert((CacheKvFile::Mem(Arc::clone(&cache)), Instant::now())); + cache + } + } + }; + + if cache.initialized() { + CACHE_EVENT.with_label_values(&["hit"]).inc(); + } + + cache + .get_or_try_init(|| self.exec_download(meta, rewrite_rule, ext_storage, speed_limiter)) + .await?; + Ok(CacheKvFile::Mem(cache)) + } + + pub fn wrap_kms( + &self, + ext_storage: Arc, + support_kms: bool, + ) -> Arc { + // kv-files needn't are decrypted with KMS when download currently because these + // files are not encrypted when log-backup. It is different from + // sst-files because sst-files is encrypted when saved with rocksdb env + // with KMS. to do: support KMS when log-backup and restore point. 
+ match (support_kms, self.key_manager.clone()) { + (true, Some(key_manager)) => { + Arc::new(external_storage_export::EncryptedExternalStorage { + key_manager, + storage: ext_storage, + }) + } + _ => ext_storage, + } + } + + async fn read_kv_files_from_external_storage( + &self, + file_length: u64, + file_name: &str, + ext_storage: Arc, + speed_limiter: &Limiter, + restore_config: RestoreConfig, + ) -> Result> { + let RestoreConfig { + range, + compression_type, + expected_sha256, + file_crypter, + } = restore_config; + + let mut reader = { + let inner = if let Some((off, len)) = range { + ext_storage.read_part(file_name, off, len) + } else { + ext_storage.read(file_name) + }; + + let inner = compression_reader_dispatcher(compression_type, inner)?; + encrypt_wrap_reader(file_crypter, inner)? + }; + + let r = external_storage_export::read_external_storage_info_buff( + &mut reader, + speed_limiter, + file_length, + expected_sha256, + external_storage_export::MIN_READ_SPEED, + ) + .await; + let url = ext_storage.url()?.to_string(); + let buff = r.map_err(|e| Error::CannotReadExternalStorage { + url: url.to_string(), + name: file_name.to_string(), + err: e, + local_path: PathBuf::default(), + })?; + + Ok(buff) + } + + pub async fn read_from_kv_file( + &self, + meta: &KvMeta, + rewrite_rule: &RewriteRule, + ext_storage: Arc, + backend: &StorageBackend, + speed_limiter: &Limiter, + ) -> Result> { + let c = if self.import_support_download() { + self.do_download_kv_file(meta, backend, speed_limiter) + .await? + } else { + self.do_read_kv_file(meta, rewrite_rule, ext_storage, speed_limiter) + .await? + }; + match c { + // If cache memroy, it has been rewrite, return buffer directly. + CacheKvFile::Mem(buff) => Ok(Arc::clone( + &buff + .get() + .ok_or_else(|| bug("invalid cache state"))? + .content, + )), + // If cache file name, it need to read and rewrite. 
+ CacheKvFile::Fs(path) => { + let file = File::open(path.as_ref())?; + let mut reader = BufReader::new(file); + let mut buffer = Vec::new(); + reader.read_to_end(&mut buffer)?; + + let rewrite_buff = self.rewrite_kv_file(buffer, rewrite_rule)?; + Ok(Arc::from(rewrite_buff.into_boxed_slice())) + } + } + } + + pub async fn do_download_kv_file( &self, meta: &KvMeta, backend: &StorageBackend, speed_limiter: &Limiter, - ) -> Result { - let name = meta.get_name(); - let path = self.dir.get_import_path(name)?; + ) -> Result { + let offset = meta.get_range_offset(); + let src_name = meta.get_name(); + let dst_name = format!("{}_{}", src_name, offset); + let path = self.dir.get_import_path(&dst_name)?; let start = Instant::now(); let sha256 = meta.get_sha256().to_vec(); let expected_sha256 = if !sha256.is_empty() { @@ -296,147 +795,178 @@ impl SstImporter { } else { None }; - if path.save.exists() { - return Ok(path.save); - } - let lock = self.file_locks.entry(name.to_string()).or_default(); + let mut lock = self + .file_locks + .entry(dst_name) + .or_insert((CacheKvFile::Fs(Arc::new(path.save.clone())), Instant::now())); if path.save.exists() { - return Ok(path.save); + lock.1 = Instant::now(); + return Ok(lock.0.clone()); } - self.download_file_from_external_storage( - // don't check file length after download file for now. + let range_length = meta.get_range_length(); + let range = if range_length == 0 { + None + } else { + Some((offset, range_length)) + }; + let restore_config = external_storage_export::RestoreConfig { + range, + compression_type: Some(meta.compression_type), + expected_sha256, + file_crypter: None, + }; + self.async_download_file_from_external_storage( meta.get_length(), - name, + src_name, path.temp.clone(), backend, - expected_sha256, + false, // don't support encrypt for now. 
- None, speed_limiter, - )?; - info!("download file finished {}", name); + "", + restore_config, + ) + .await?; + info!( + "download file finished {}, offset {}, length {}", + src_name, + offset, + meta.get_length() + ); if let Some(p) = path.save.parent() { // we have v1 prefix in file name. - file_system::create_dir_all(p)?; + file_system::create_dir_all(p).or_else(|e| { + if e.kind() == io::ErrorKind::AlreadyExists { + Ok(()) + } else { + Err(e) + } + })?; } - file_system::rename(path.temp, path.save.clone())?; - - drop(lock); - self.file_locks.remove(name); + file_system::rename(path.temp, path.save)?; IMPORTER_APPLY_DURATION .with_label_values(&["download"]) .observe(start.saturating_elapsed().as_secs_f64()); - Ok(path.save) + lock.1 = Instant::now(); + Ok(lock.0.clone()) } - pub fn do_apply_kv_file>( + pub fn rewrite_kv_file( &self, - start_key: &[u8], - end_key: &[u8], - restore_ts: u64, - file_path: P, + file_buff: Vec, rewrite_rule: &RewriteRule, - build_fn: &mut dyn FnMut(Vec, Vec), - ) -> Result> { - // iterator file and performs rewrites and apply. - let file = File::open(&file_path)?; - let mut reader = BufReader::new(file); - let mut buffer = Vec::new(); - reader.read_to_end(&mut buffer)?; - - let mut event_iter = EventIterator::new(buffer); - + ) -> Result> { let old_prefix = rewrite_rule.get_old_key_prefix(); let new_prefix = rewrite_rule.get_new_key_prefix(); - - let perform_rewrite = old_prefix != new_prefix; + // if old_prefix equals new_prefix, do not need rewrite. + if old_prefix == new_prefix { + return Ok(file_buff); + } // perform iteration and key rewrite. 
+ let mut new_buff = Vec::with_capacity(file_buff.len()); + let mut event_iter = EventIterator::new(file_buff.as_slice()); let mut key = new_prefix.to_vec(); let new_prefix_data_key_len = key.len(); + + let start = Instant::now(); + loop { + if !event_iter.valid() { + break; + } + event_iter.next()?; + + // perform rewrite + let old_key = event_iter.key(); + if !old_key.starts_with(old_prefix) { + return Err(Error::WrongKeyPrefix { + what: "Key in file", + key: old_key.to_vec(), + prefix: old_prefix.to_vec(), + }); + } + key.truncate(new_prefix_data_key_len); + key.extend_from_slice(&old_key[old_prefix.len()..]); + let value = event_iter.value(); + + let encoded = EventEncoder::encode_event(&key, value); + for slice in encoded { + new_buff.append(&mut slice.as_ref().to_owned()); + } + } + + IMPORTER_APPLY_DURATION + .with_label_values(&["rewrite"]) + .observe(start.saturating_elapsed().as_secs_f64()); + Ok(new_buff) + } + + pub fn do_apply_kv_file( + &self, + start_key: &[u8], + end_key: &[u8], + start_ts: u64, + restore_ts: u64, + file_buff: Arc<[u8]>, + mut build_fn: impl FnMut(Vec, Vec), + ) -> Result> { + let mut event_iter = EventIterator::new(file_buff.as_ref()); let mut smallest_key = None; let mut largest_key = None; - let mut total_key = 0; let mut ts_not_expected = 0; let mut not_in_range = 0; - let start = Instant::now(); + loop { if !event_iter.valid() { break; } total_key += 1; event_iter.next()?; - INPORTER_APPLY_COUNT.with_label_values(&["key_meet"]).inc(); - let ts = Key::decode_ts_from(event_iter.key())?; - if ts > TimeStamp::new(restore_ts) { + + let key = event_iter.key().to_vec(); + let value = event_iter.value().to_vec(); + let ts = Key::decode_ts_from(&key)?; + if ts < TimeStamp::new(start_ts) || ts > TimeStamp::new(restore_ts) { // we assume the keys in file are sorted by ts. // so if we met the key not satisfy the ts. // we can easily filter the remain keys. 
ts_not_expected += 1; continue; } - if perform_rewrite { - let old_key = event_iter.key(); - - if !old_key.starts_with(old_prefix) { - return Err(Error::WrongKeyPrefix { - what: "Key in file", - key: old_key.to_vec(), - prefix: old_prefix.to_vec(), - }); - } - key.truncate(new_prefix_data_key_len); - key.extend_from_slice(&old_key[old_prefix.len()..]); - - debug!( - "perform rewrite new key: {:?}, new key prefix: {:?}, old key prefix: {:?}", - log_wrappers::Value::key(&key), - log_wrappers::Value::key(new_prefix), - log_wrappers::Value::key(old_prefix), - ); - } else { - key = event_iter.key().to_vec(); - } if check_key_in_range(&key, 0, start_key, end_key).is_err() { // key not in range, we can simply skip this key here. - // the client make sure the correct region will download and apply the same file. + // the client make sure the correct region will download and apply the same + // file. INPORTER_APPLY_COUNT .with_label_values(&["key_not_in_region"]) .inc(); not_in_range += 1; continue; } - let value = event_iter.value().to_vec(); - build_fn(key.clone(), value); - - let iter_key = key.clone(); - smallest_key = smallest_key.map_or_else( - || Some(iter_key.clone()), - |v: Vec| Some(v.min(iter_key.clone())), - ); - largest_key = largest_key.map_or_else( - || Some(iter_key.clone()), - |v: Vec| Some(v.max(iter_key.clone())), - ); + build_fn(key.clone(), value); + smallest_key = smallest_key + .map_or_else(|| Some(key.clone()), |v: Vec| Some(v.min(key.clone()))); + largest_key = largest_key + .map_or_else(|| Some(key.clone()), |v: Vec| Some(v.max(key.clone()))); } - info!("build download request file done"; "total keys" => %total_key, + if total_key != not_in_range { + info!("build download request file done"; "total keys" => %total_key, "ts filtered keys" => %ts_not_expected, - "range filtered keys" => %not_in_range, - "file" => %file_path.as_ref().display()); + "range filtered keys" => %not_in_range); + } - let label = if perform_rewrite { "rewrite" } else { 
"normal" }; IMPORTER_APPLY_DURATION - .with_label_values(&[label]) + .with_label_values(&["normal"]) .observe(start.saturating_elapsed().as_secs_f64()); match (smallest_key, largest_key) { @@ -450,7 +980,31 @@ impl SstImporter { } } - fn do_download( + // raw download, without ext, compatibility to old tests. + #[cfg(test)] + fn download( + &self, + meta: &SstMeta, + backend: &StorageBackend, + name: &str, + rewrite_rule: &RewriteRule, + crypter: Option, + speed_limiter: Limiter, + engine: E, + ) -> Result> { + self._download_rt.block_on(self.download_ext( + meta, + backend, + name, + rewrite_rule, + crypter, + speed_limiter, + engine, + DownloadExt::default(), + )) + } + + async fn do_download_ext( &self, meta: &SstMeta, backend: &StorageBackend, @@ -459,24 +1013,32 @@ impl SstImporter { crypter: Option, speed_limiter: &Limiter, engine: E, + ext: DownloadExt<'_>, ) -> Result> { let path = self.dir.join(meta)?; let file_crypter = crypter.map(|c| FileEncryptionInfo { - method: encryption_method_to_db_encryption_method(c.cipher_type), + method: to_engine_encryption_method(c.cipher_type), key: c.cipher_key, iv: meta.cipher_iv.to_owned(), }); - self.download_file_from_external_storage( + let restore_config = external_storage_export::RestoreConfig { + file_crypter, + ..Default::default() + }; + + self.async_download_file_from_external_storage( meta.length, name, path.temp.clone(), backend, - None, - file_crypter, + true, speed_limiter, - )?; + ext.cache_key.unwrap_or(""), + restore_config, + ) + .await?; // now validate the SST file. 
let env = get_env(self.key_manager.clone(), get_io_rate_limiter())?; @@ -485,16 +1047,20 @@ impl SstImporter { let sst_reader = RocksSstReader::open_with_env(dst_file_name, Some(env))?; sst_reader.verify_checksum()?; + // undo key rewrite so we could compare with the keys inside SST + let old_prefix = rewrite_rule.get_old_key_prefix(); + let new_prefix = rewrite_rule.get_new_key_prefix(); + let req_type = ext.req_type; + debug!("downloaded file and verified"; "meta" => ?meta, "name" => name, "path" => dst_file_name, + "old_prefix" => log_wrappers::Value::key(old_prefix), + "new_prefix" => log_wrappers::Value::key(new_prefix), + "req_type" => ?req_type, ); - // undo key rewrite so we could compare with the keys inside SST - let old_prefix = rewrite_rule.get_old_key_prefix(); - let new_prefix = rewrite_rule.get_new_key_prefix(); - let range_start = meta.get_range().get_start(); let range_end = meta.get_range().get_end(); let range_start_bound = key_to_bound(range_start); @@ -504,14 +1070,14 @@ impl SstImporter { key_to_bound(range_end) }; - let range_start = + let mut range_start = keys::rewrite::rewrite_prefix_of_start_bound(new_prefix, old_prefix, range_start_bound) .map_err(|_| Error::WrongKeyPrefix { what: "SST start range", key: range_start.to_vec(), prefix: new_prefix.to_vec(), })?; - let range_end = + let mut range_end = keys::rewrite::rewrite_prefix_of_end_bound(new_prefix, old_prefix, range_end_bound) .map_err(|_| Error::WrongKeyPrefix { what: "SST end range", @@ -519,10 +1085,15 @@ impl SstImporter { prefix: new_prefix.to_vec(), })?; + if req_type == DownloadRequestType::Keyspace { + range_start = keys::rewrite::encode_bound(range_start); + range_end = keys::rewrite::encode_bound(range_end); + } + let start_rename_rewrite = Instant::now(); // read the first and last keys from the SST, determine if we could // simply move the entire SST instead of iterating and generate a new one. 
- let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default())?; let direct_retval = (|| -> Result> { if rewrite_rule.old_key_prefix != rewrite_rule.new_key_prefix || rewrite_rule.new_timestamp != 0 @@ -530,10 +1101,16 @@ impl SstImporter { // must iterate if we perform key rewrite return Ok(None); } - if !iter.seek(SeekKey::Start)? { + if !iter.seek_to_first()? { + let mut range = meta.get_range().clone(); + if req_type == DownloadRequestType::Keyspace { + *range.mut_start() = encode_bytes(&range.take_start()); + *range.mut_end() = encode_bytes(&range.take_end()); + } // the SST is empty, so no need to iterate at all (should be impossible?) - return Ok(Some(meta.get_range().clone())); + return Ok(Some(range)); } + let start_key = keys::origin_key(iter.key()); if is_before_start_bound(start_key, &range_start) { // SST's start is before the range to consume, so needs to iterate to skip over @@ -542,14 +1119,15 @@ impl SstImporter { let start_key = start_key.to_vec(); // seek to end and fetch the last (inclusive) key of the SST. 
- iter.seek(SeekKey::End)?; + iter.seek_to_last()?; let last_key = keys::origin_key(iter.key()); if is_after_end_bound(last_key, &range_end) { // SST's end is after the range to consume return Ok(None); } - // range contained the entire SST, no need to iterate, just moving the file is ok + // range contained the entire SST, no need to iterate, just moving the file is + // ok let mut range = Range::default(); range.set_start(start_key); range.set_end(last_key.to_vec()); @@ -557,7 +1135,6 @@ impl SstImporter { })()?; if let Some(range) = direct_retval { - file_system::rename(&path.temp, &path.save)?; if let Some(key_manager) = &self.key_manager { let temp_str = path .temp @@ -568,7 +1145,14 @@ impl SstImporter { .to_str() .ok_or_else(|| Error::InvalidSstPath(path.save.clone()))?; key_manager.link_file(temp_str, save_str)?; - key_manager.delete_file(temp_str)?; + let r = file_system::rename(&path.temp, &path.save); + let del_file = if r.is_ok() { temp_str } else { save_str }; + if let Err(e) = key_manager.delete_file(del_file) { + warn!("fail to remove encryption metadata during 'do_download'"; "err" => ?e); + } + r?; + } else { + file_system::rename(&path.temp, &path.save)?; } IMPORTER_DOWNLOAD_DURATION .with_label_values(&["rename"]) @@ -577,13 +1161,15 @@ impl SstImporter { } // perform iteration and key rewrite. 
- let mut key = keys::data_key(new_prefix); - let new_prefix_data_key_len = key.len(); + let mut data_key = keys::DATA_PREFIX_KEY.to_vec(); + let data_key_prefix_len = keys::DATA_PREFIX_KEY.len(); + let mut user_key = new_prefix.to_vec(); + let user_key_prefix_len = new_prefix.len(); let mut first_key = None; match range_start { - Bound::Unbounded => iter.seek(SeekKey::Start)?, - Bound::Included(s) => iter.seek(SeekKey::Key(&keys::data_key(&s)))?, + Bound::Unbounded => iter.seek_to_first()?, + Bound::Included(s) => iter.seek(&keys::data_key(&s))?, Bound::Excluded(_) => unreachable!(), }; // SST writer must not be opened in gRPC threads, because it may be @@ -598,10 +1184,22 @@ impl SstImporter { .unwrap(); while iter.valid()? { - let old_key = keys::origin_key(iter.key()); - if is_after_end_bound(old_key, &range_end) { + let mut old_key = Cow::Borrowed(keys::origin_key(iter.key())); + let mut ts = None; + + if is_after_end_bound(old_key.as_ref(), &range_end) { break; } + + if req_type == DownloadRequestType::Keyspace { + ts = Some(Key::decode_ts_bytes_from(old_key.as_ref())?.to_owned()); + old_key = { + let mut key = old_key.to_vec(); + decode_bytes_in_place(&mut key, false)?; + Cow::Owned(key) + }; + } + if !old_key.starts_with(old_prefix) { return Err(Error::WrongKeyPrefix { what: "Key in SST", @@ -609,12 +1207,21 @@ impl SstImporter { prefix: old_prefix.to_vec(), }); } - key.truncate(new_prefix_data_key_len); - key.extend_from_slice(&old_key[old_prefix.len()..]); + + data_key.truncate(data_key_prefix_len); + user_key.truncate(user_key_prefix_len); + user_key.extend_from_slice(&old_key[old_prefix.len()..]); + if req_type == DownloadRequestType::Keyspace { + data_key.extend(encode_bytes(&user_key)); + data_key.extend(ts.unwrap()); + } else { + data_key.extend_from_slice(&user_key); + } + let mut value = Cow::Borrowed(iter.value()); if rewrite_rule.new_timestamp != 0 { - key = Key::from_encoded(key) + data_key = Key::from_encoded(data_key) .truncate_ts() 
.map_err(|e| { Error::BadFormat(format!( @@ -638,10 +1245,10 @@ impl SstImporter { } } - sst_writer.put(&key, &value)?; + sst_writer.put(&data_key, &value)?; iter.next()?; if first_key.is_none() { - first_key = Some(keys::origin_key(&key).to_vec()); + first_key = Some(keys::origin_key(&data_key).to_vec()); } } @@ -660,7 +1267,7 @@ impl SstImporter { let mut final_range = Range::default(); final_range.set_start(start_key); - final_range.set_end(keys::origin_key(&key).to_vec()); + final_range.set_end(keys::origin_key(&data_key).to_vec()); Ok(Some(final_range)) } else { // nothing is written: prevents finishing the SST at all. @@ -761,18 +1368,24 @@ fn is_after_end_bound>(value: &[u8], bound: &Bound) -> bool { #[cfg(test)] mod tests { - use std::io; + use std::{ + io::{self, BufWriter, Write}, + ops::Sub, + usize, + }; use engine_traits::{ collect, EncryptionMethod, Error as TraitError, ExternalSstFileInfo, Iterable, Iterator, - SeekKey, SstReader, SstWriter, CF_DEFAULT, DATA_CFS, + RefIterable, SstReader, SstWriter, CF_DEFAULT, DATA_CFS, }; + use external_storage_export::read_external_storage_info_buff; use file_system::File; + use online_config::{ConfigManager, OnlineConfig}; use openssl::hash::{Hasher, MessageDigest}; use tempfile::Builder; use test_sst_importer::*; use test_util::new_test_key_manager; - use tikv_util::stream::block_on_external_io; + use tikv_util::{codec::stream_event::EventEncoder, stream::block_on_external_io}; use txn_types::{Value, WriteType}; use uuid::Uuid; @@ -796,7 +1409,7 @@ mod tests { check_file_not_exists(&path.clone, key_manager.as_deref()); // Cannot create the same file again. 
- assert!(dir.create(&meta, key_manager.clone()).is_err()); + dir.create(&meta, key_manager.clone()).unwrap_err(); } // Test ImportDir::delete() @@ -820,7 +1433,7 @@ mod tests { // Test ImportDir::ingest() let db_path = temp_dir.path().join("db"); - let env = get_env(key_manager.clone(), None /*io_rate_limiter*/).unwrap(); + let env = get_env(key_manager.clone(), None /* io_rate_limiter */).unwrap(); let db = new_test_engine_with_env(db_path.to_str().unwrap(), &[CF_DEFAULT], env); let cases = vec![(0, 10), (5, 15), (10, 20), (0, 100)]; @@ -829,7 +1442,7 @@ mod tests { for (i, &range) in cases.iter().enumerate() { let path = temp_dir.path().join(format!("{}.sst", i)); - let (meta, data) = gen_sst_file(&path, range); + let (meta, data) = gen_sst_file(path, range); let mut f = dir.create(&meta, key_manager.clone()).unwrap(); f.append(&data).unwrap(); @@ -885,12 +1498,10 @@ mod tests { let mut f = ImportFile::create(meta.clone(), path.clone(), data_key_manager.clone()).unwrap(); // Cannot create the same file again. - assert!( - ImportFile::create(meta.clone(), path.clone(), data_key_manager.clone()).is_err() - ); + ImportFile::create(meta.clone(), path.clone(), data_key_manager.clone()).unwrap_err(); f.append(data).unwrap(); // Invalid crc32 and length. 
- assert!(f.finish().is_err()); + f.finish().unwrap_err(); check_file_exists(&path.temp, data_key_manager.as_deref()); check_file_not_exists(&path.save, data_key_manager.as_deref()); } @@ -926,11 +1537,19 @@ mod tests { } } + fn check_file_is_same(path_a: &Path, path_b: &Path) -> bool { + assert!(path_a.exists()); + assert!(path_b.exists()); + + let content_a = file_system::read(path_a).unwrap(); + let content_b = file_system::read(path_b).unwrap(); + content_a == content_b + } + fn new_key_manager_for_test() -> (tempfile::TempDir, Arc) { // test with tde let tmp_dir = tempfile::TempDir::new().unwrap(); let key_manager = new_test_key_manager(&tmp_dir, None, None, None); - assert!(key_manager.is_ok()); (tmp_dir, Arc::new(key_manager.unwrap().unwrap())) } @@ -981,6 +1600,43 @@ mod tests { }) } + fn create_sample_external_kv_file() + -> Result<(tempfile::TempDir, StorageBackend, KvMeta, Vec)> { + let ext_dir = tempfile::tempdir()?; + let file_name = "v1/t000001/abc.log"; + let file_path = ext_dir.path().join(file_name); + std::fs::create_dir_all(file_path.parent().unwrap())?; + let file = File::create(file_path).unwrap(); + let mut buff = BufWriter::new(file); + + let kvs = vec![ + (b"t1_r01".to_vec(), b"tidb".to_vec()), + (b"t1_r02".to_vec(), b"tikv".to_vec()), + (b"t1_r03".to_vec(), b"pingcap".to_vec()), + (b"t1_r04".to_vec(), b"test for PITR".to_vec()), + ]; + + let mut sha256 = Hasher::new(MessageDigest::sha256()).unwrap(); + let mut len = 0; + for kv in kvs { + let encoded = EventEncoder::encode_event(&kv.0, &kv.1); + for slice in encoded { + len += buff.write(slice.as_ref()).unwrap(); + sha256.update(slice.as_ref()).unwrap(); + } + } + + let mut kv_meta = KvMeta::default(); + kv_meta.set_name(file_name.to_string()); + kv_meta.set_cf(String::from("default")); + kv_meta.set_is_delete(false); + kv_meta.set_length(len as _); + kv_meta.set_sha256(sha256.finish().unwrap().to_vec()); + + let backend = external_storage_export::make_local_backend(ext_dir.path()); + 
Ok((ext_dir, backend, kv_meta, buff.buffer().to_vec())) + } + fn create_sample_external_rawkv_sst_file( start_key: &[u8], end_key: &[u8], @@ -1156,6 +1812,379 @@ mod tests { assert_eq!(err.kind(), io::ErrorKind::TimedOut); } + #[test] + fn test_read_external_storage_info_buff() { + let data = &b"input some data, used to test read buff"[..]; + let mut reader = data; + let len = reader.len() as _; + let sha_256 = { + let mut hasher = Hasher::new(MessageDigest::sha256()).unwrap(); + hasher.update(data).unwrap(); + hasher.finish().unwrap().to_vec() + }; + + // test successfully. + let output = block_on_external_io(read_external_storage_info_buff( + &mut reader, + &Limiter::new(f64::INFINITY), + len, + Some(sha_256.clone()), + 0, + )) + .unwrap(); + assert_eq!(&output, data); + + // test without expected_sha245. + reader = data; + let output = block_on_external_io(read_external_storage_info_buff( + &mut reader, + &Limiter::new(f64::INFINITY), + len, + None, + 0, + )) + .unwrap(); + assert_eq!(&output, data); + + // test with wrong expectd_len. + reader = data; + let err = block_on_external_io(read_external_storage_info_buff( + &mut reader, + &Limiter::new(f64::INFINITY), + len + 1, + Some(sha_256.clone()), + 0, + )) + .unwrap_err(); + assert!(err.to_string().contains("length not match")); + + // test with wrong expected_sha256. 
+ reader = data; + let err = block_on_external_io(read_external_storage_info_buff( + &mut reader, + &Limiter::new(f64::INFINITY), + len, + Some(sha_256[..sha_256.len() - 1].to_vec()), + 0, + )) + .unwrap_err(); + assert!(err.to_string().contains("sha256 not match")); + } + + #[test] + fn test_read_external_storage_info_buff_timed_out() { + use futures_util::stream::{pending, TryStreamExt}; + + let mut input = pending::>().into_async_read(); + let err = block_on_external_io(read_external_storage_info_buff( + &mut input, + &Limiter::new(f64::INFINITY), + 0, + None, + usize::MAX, + )) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::TimedOut); + } + + #[test] + fn test_update_config_memory_use_ratio() { + // create SstImpoter with default. + let cfg = Config { + memory_use_ratio: 0.3, + ..Default::default() + }; + let import_dir = tempfile::tempdir().unwrap(); + let importer = SstImporter::new(&cfg, import_dir, None, ApiVersion::V1).unwrap(); + let mem_limit_old = importer.mem_limit.load(Ordering::SeqCst); + + // create new config and get the diff config. + let cfg_new = Config { + memory_use_ratio: 0.1, + ..Default::default() + }; + let change = cfg.diff(&cfg_new); + + // create config manager and update config. 
+ let mut cfg_mgr = ImportConfigManager::new(cfg); + cfg_mgr.dispatch(change).unwrap(); + importer.update_config_memory_use_ratio(&cfg_mgr); + + let mem_limit_new = importer.mem_limit.load(Ordering::SeqCst); + assert!(mem_limit_old > mem_limit_new); + assert_eq!( + mem_limit_old / 3, + mem_limit_new, + "mem_limit_old / 3 = {} mem_limit_new = {}", + mem_limit_old / 3, + mem_limit_new + ); + } + + #[test] + fn test_update_config_with_invalid_conifg() { + let cfg = Config::default(); + let cfg_new = Config { + memory_use_ratio: -0.1, + ..Default::default() + }; + let change = cfg.diff(&cfg_new); + let mut cfg_mgr = ImportConfigManager::new(cfg); + let r = cfg_mgr.dispatch(change); + assert!(r.is_err()); + } + + #[test] + fn test_do_read_kv_file() { + // create a sample kv file. + let (_temp_dir, backend, kv_meta, buff) = create_sample_external_kv_file().unwrap(); + + // create importer object. + let import_dir = tempfile::tempdir().unwrap(); + let (_, key_manager) = new_key_manager_for_test(); + let importer = SstImporter::new( + &Config::default(), + import_dir, + Some(key_manager), + ApiVersion::V1, + ) + .unwrap(); + let ext_storage = { + let inner = importer.wrap_kms( + importer.external_storage_or_cache(&backend, "").unwrap(), + false, + ); + inner + }; + + // test do_read_kv_file() + let rewrite_rule = &new_rewrite_rule(b"", b"", 12345); + let output = block_on_external_io(importer.do_read_kv_file( + &kv_meta, + rewrite_rule, + ext_storage, + &Limiter::new(f64::INFINITY), + )) + .unwrap(); + + assert!( + matches!(output.clone(), CacheKvFile::Mem(rc) if &*rc.get().unwrap().content == buff.as_slice()), + "{:?}", + output + ); + + // Do not shrint nothing. 
+ let shrink_size = importer.shrink_by_tick(); + assert_eq!(shrink_size, 0); + assert_eq!(importer.file_locks.len(), 1); + + // drop the refcnt + drop(output); + let shrink_size = importer.shrink_by_tick(); + assert_eq!(shrink_size, 0); + assert_eq!(importer.file_locks.len(), 1); + + // set expired instance in Dashmap + for mut kv in importer.file_locks.iter_mut() { + kv.1 = Instant::now().sub(Duration::from_secs(61)); + } + let shrink_size = importer.shrink_by_tick(); + assert_eq!(shrink_size, buff.len()); + assert!(importer.file_locks.is_empty()); + } + + #[test] + fn test_read_kv_files_from_external_storage() { + // create a sample kv file. + let (_temp_dir, backend, kv_meta, buff) = create_sample_external_kv_file().unwrap(); + + // create importer object. + let import_dir = tempfile::tempdir().unwrap(); + let (_, key_manager) = new_key_manager_for_test(); + let importer = SstImporter::new( + &Config::default(), + import_dir, + Some(key_manager), + ApiVersion::V1, + ) + .unwrap(); + let ext_storage = { + let inner = importer.wrap_kms( + importer.external_storage_or_cache(&backend, "").unwrap(), + false, + ); + Arc::new(inner) + }; + + // test read all of the file. + let restore_config = external_storage_export::RestoreConfig { + expected_sha256: Some(kv_meta.get_sha256().to_vec()), + ..Default::default() + }; + + let output = block_on_external_io(importer.read_kv_files_from_external_storage( + kv_meta.get_length(), + kv_meta.get_name(), + ext_storage.clone(), + &Limiter::new(f64::INFINITY), + restore_config, + )) + .unwrap(); + assert_eq!( + buff, + output, + "we are testing addition with {} and {}", + buff.len(), + output.len() + ); + + // test read range of the file. 
+ let (offset, len) = (5, 16); + let restore_config = external_storage_export::RestoreConfig { + range: Some((offset, len)), + ..Default::default() + }; + + let output = block_on_external_io(importer.read_kv_files_from_external_storage( + len, + kv_meta.get_name(), + ext_storage, + &Limiter::new(f64::INFINITY), + restore_config, + )) + .unwrap(); + assert_eq!(&buff[offset as _..(offset + len) as _], &output[..]); + } + + #[test] + fn test_do_download_kv_file() { + // create a sample kv file. + let (_temp_dir, backend, kv_meta, buff) = create_sample_external_kv_file().unwrap(); + + // create importer object. + let import_dir = tempfile::tempdir().unwrap(); + let (_, key_manager) = new_key_manager_for_test(); + let cfg = Config { + memory_use_ratio: 0.0, + ..Default::default() + }; + let importer = + SstImporter::new(&cfg, import_dir, Some(key_manager), ApiVersion::V1).unwrap(); + let rewrite_rule = &new_rewrite_rule(b"", b"", 12345); + let ext_storage = { + importer.wrap_kms( + importer.external_storage_or_cache(&backend, "").unwrap(), + false, + ) + }; + let path = importer + .dir + .get_import_path( + format!("{}_{}", kv_meta.get_name(), kv_meta.get_range_offset()).as_str(), + ) + .unwrap(); + + // test do_download_kv_file(). + assert!(importer.import_support_download()); + let output = block_on_external_io(importer.read_from_kv_file( + &kv_meta, + rewrite_rule, + ext_storage, + &backend, + &Limiter::new(f64::INFINITY), + )) + .unwrap(); + assert_eq!(*output, buff); + check_file_exists(&path.save, None); + + // test shrink nothing. + let shrint_files_cnt = importer.shrink_by_tick(); + assert_eq!(shrint_files_cnt, 0); + + // set expired instance in Dashmap. 
+ for mut kv in importer.file_locks.iter_mut() { + kv.1 = Instant::now().sub(Duration::from_secs(601)); + } + let shrint_files_cnt = importer.shrink_by_tick(); + assert_eq!(shrint_files_cnt, 1); + check_file_not_exists(&path.save, None); + } + + #[test] + fn test_download_file_from_external_storage_for_sst() { + // creates a sample SST file. + let (_ext_sst_dir, backend, meta) = create_sample_external_sst_file().unwrap(); + + // create importer object. + let import_dir = tempfile::tempdir().unwrap(); + let (_, key_manager) = new_key_manager_for_test(); + let importer = SstImporter::new( + &Config::default(), + import_dir, + Some(key_manager.clone()), + ApiVersion::V1, + ) + .unwrap(); + + // perform download file into .temp dir. + let file_name = "sample.sst"; + let path = importer.dir.get_import_path(file_name).unwrap(); + let restore_config = external_storage_export::RestoreConfig::default(); + importer + .download_file_from_external_storage( + meta.get_length(), + file_name, + path.temp.clone(), + &backend, + true, + &Limiter::new(f64::INFINITY), + restore_config, + ) + .unwrap(); + check_file_exists(&path.temp, Some(&key_manager)); + assert!(!check_file_is_same( + &_ext_sst_dir.path().join(file_name), + &path.temp, + )); + } + + #[test] + fn test_download_file_from_external_storage_for_kv() { + let (_temp_dir, backend, kv_meta, _) = create_sample_external_kv_file().unwrap(); + let (_, key_manager) = new_key_manager_for_test(); + + let import_dir = tempfile::tempdir().unwrap(); + let importer = SstImporter::new( + &Config::default(), + import_dir, + Some(key_manager), + ApiVersion::V1, + ) + .unwrap(); + + let path = importer.dir.get_import_path(kv_meta.get_name()).unwrap(); + let restore_config = external_storage_export::RestoreConfig { + expected_sha256: Some(kv_meta.get_sha256().to_vec()), + ..Default::default() + }; + importer + .download_file_from_external_storage( + kv_meta.get_length(), + kv_meta.get_name(), + path.temp.clone(), + &backend, + false, + 
&Limiter::new(f64::INFINITY), + restore_config, + ) + .unwrap(); + + assert!(check_file_is_same( + &_temp_dir.path().join(kv_meta.get_name()), + &path.temp, + )); + } + #[test] fn test_download_sst_no_key_rewrite() { // creates a sample SST file. @@ -1192,8 +2221,8 @@ mod tests { // verifies the SST content is correct. let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1223,7 +2252,7 @@ mod tests { .unwrap(); let db_path = temp_dir.path().join("db"); - let env = get_env(Some(key_manager), None /*io_rate_limiter*/).unwrap(); + let env = get_env(Some(key_manager), None /* io_rate_limiter */).unwrap(); let db = new_test_engine_with_env(db_path.to_str().unwrap(), DATA_CFS, env.clone()); let range = importer @@ -1251,8 +2280,8 @@ mod tests { // verifies the SST content is correct. let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), Some(env)); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1299,8 +2328,8 @@ mod tests { // verifies the SST content is correct. let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1344,8 +2373,8 @@ mod tests { // verifies the SST content is correct. 
let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1388,8 +2417,8 @@ mod tests { // verifies the SST content is correct. let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1453,20 +2482,20 @@ mod tests { meta.set_length(0); // disable validation. meta.set_crc32(0); let meta_info = importer.validate(&meta).unwrap(); - let _ = importer.ingest(&[meta_info.clone()], &db).unwrap(); + importer.ingest(&[meta_info.clone()], &db).unwrap(); // key1 = "zt9102_r01", value1 = "abc", len = 13 // key2 = "zt9102_r04", value2 = "xyz", len = 13 // key3 = "zt9102_r07", value3 = "pqrst", len = 15 // key4 = "zt9102_r13", value4 = "www", len = 13 // total_bytes = (13 + 13 + 15 + 13) + 4 * 8 = 86 - // don't no why each key has extra 8 byte length in raw_key_size(), but it seems tolerable. - // https://docs.rs/rocks/0.1.0/rocks/table_properties/struct.TableProperties.html#method.raw_key_size + // don't no why each key has extra 8 byte length in raw_key_size(), but it seems + // tolerable. https://docs.rs/rocks/0.1.0/rocks/table_properties/struct.TableProperties.html#method.raw_key_size assert_eq!(meta_info.total_bytes, 86); assert_eq!(meta_info.total_kvs, 4); // verifies the DB content is correct. 
- let mut iter = db.iterator_cf(cf).unwrap(); - iter.seek(SeekKey::Start).unwrap(); + let mut iter = db.iterator(cf).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1529,8 +2558,8 @@ mod tests { // verifies the SST content is correct. let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1573,8 +2602,8 @@ mod tests { // verifies the SST content is correct. let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1606,8 +2635,8 @@ mod tests { db, ); match &result { - Err(Error::EngineTraits(TraitError::Engine(msg))) if msg.starts_with("Corruption:") => { - } + Err(Error::EngineTraits(TraitError::Engine(s))) + if s.state().starts_with("Corruption:") => {} _ => panic!("unexpected download result: {:?}", result), } } @@ -1708,8 +2737,8 @@ mod tests { // verifies the SST content is correct. let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1766,8 +2795,8 @@ mod tests { // verifies the SST content is correct. 
let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1821,8 +2850,8 @@ mod tests { // verifies the SST content is correct. let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1917,4 +2946,80 @@ mod tests { assert_eq!(sst_reader.compression_name(), expected_compression_name); } } + + #[test] + fn test_import_support_download() { + let import_dir = tempfile::tempdir().unwrap(); + let importer = + SstImporter::new(&Config::default(), import_dir, None, ApiVersion::V1).unwrap(); + assert_eq!(importer.import_support_download(), false); + + let import_dir = tempfile::tempdir().unwrap(); + let importer = SstImporter::new( + &Config { + memory_use_ratio: 0.0, + ..Default::default() + }, + import_dir, + None, + ApiVersion::V1, + ) + .unwrap(); + assert_eq!(importer.import_support_download(), true); + } + + #[test] + fn test_inc_mem_and_check() { + // create importer object. + let import_dir = tempfile::tempdir().unwrap(); + let importer = + SstImporter::new(&Config::default(), import_dir, None, ApiVersion::V1).unwrap(); + assert_eq!(importer.mem_use.load(Ordering::SeqCst), 0); + + // test inc_mem_and_check() and dec_mem() successfully. 
+ let meta = KvMeta { + length: 100, + ..Default::default() + }; + let check = importer.request_memory(&meta); + assert!(check.is_some()); + assert_eq!(importer.mem_use.load(Ordering::SeqCst), meta.get_length()); + + drop(check); + assert_eq!(importer.mem_use.load(Ordering::SeqCst), 0); + + // test inc_mem_and_check() failed. + let meta = KvMeta { + length: u64::MAX, + ..Default::default() + }; + let check = importer.request_memory(&meta); + assert!(check.is_none()); + } + + #[test] + fn test_dashmap_lock() { + let import_dir = tempfile::tempdir().unwrap(); + let importer = + SstImporter::new(&Config::default(), import_dir, None, ApiVersion::V1).unwrap(); + + let key = "file1"; + let r = Arc::new(OnceCell::new()); + let value = (CacheKvFile::Mem(r), Instant::now()); + let lock = importer.file_locks.entry(key.to_string()).or_insert(value); + + // test locked by try_entry() + let lock2 = importer.file_locks.try_entry(key.to_string()); + assert!(lock2.is_none()); + let lock2 = importer.file_locks.try_get(key); + assert!(lock2.is_locked()); + + // test unlocked by entry() + drop(lock); + let v = importer.file_locks.get(key).unwrap(); + assert_eq!(v.0.ref_count(), 1); + + let _buff = v.0.clone(); + assert_eq!(v.0.ref_count(), 2); + } } diff --git a/components/sst_importer/src/sst_writer.rs b/components/sst_importer/src/sst_writer.rs index 60fc1b9e2ab..70d30569557 100644 --- a/components/sst_importer/src/sst_writer.rs +++ b/components/sst_importer/src/sst_writer.rs @@ -61,7 +61,7 @@ impl TxnSstWriter { fn check_api_version(&self, key: &[u8]) -> Result<()> { let mode = K::parse_key_mode(key); - if self.api_version == ApiVersion::V2 && mode != KeyMode::Txn && mode != KeyMode::TiDB { + if self.api_version == ApiVersion::V2 && mode != KeyMode::Txn && mode != KeyMode::Tidb { return Err(Error::invalid_key_mode( SstWriterType::Txn, self.api_version, @@ -434,7 +434,7 @@ mod tests { let (mut w, _handle) = new_writer(SstImporter::new_raw_writer, ApiVersion::V1); let mut batch = 
RawWriteBatch::default(); batch.set_ttl(10); - assert!(w.write(batch).is_err()); + w.write(batch).unwrap_err(); } #[test] @@ -462,7 +462,7 @@ mod tests { let pairs = vec![pair]; batch.set_pairs(pairs.into()); - assert!(w.write(batch).is_err()); + w.write(batch).unwrap_err(); } #[test] @@ -478,7 +478,7 @@ mod tests { let pairs = vec![pair]; batch.set_pairs(pairs.into()); - assert!(w.write(batch.clone()).is_err()); + w.write(batch.clone()).unwrap_err(); // put a valid key let mut pair = Pair::default(); diff --git a/components/sst_importer/src/util.rs b/components/sst_importer/src/util.rs index a3a71ba8144..501061e92c0 100644 --- a/components/sst_importer/src/util.rs +++ b/components/sst_importer/src/util.rs @@ -4,13 +4,14 @@ use std::path::Path; use encryption::DataKeyManager; use engine_traits::EncryptionKeyManager; +use external_storage_export::ExternalStorage; use file_system::File; use super::Result; /// Prepares the SST file for ingestion. -/// The purpose is to make the ingestion retryable when using the `move_files` option. -/// Things we need to consider here: +/// The purpose is to make the ingestion retryable when using the `move_files` +/// option. Things we need to consider here: /// 1. We need to access the original file on retry, so we should make a clone /// before ingestion. /// 2. `RocksDB` will modified the global seqno of the ingested file, so we need @@ -32,8 +33,9 @@ pub fn prepare_sst_for_ingestion, Q: AsRef>( if Path::new(clone).exists() { file_system::remove_file(clone).map_err(|e| format!("remove {}: {:?}", clone, e))?; } - // always try to remove the file from key manager because the clean up in rocksdb is not atomic, - // thus the file may be deleted but key in key manager is not. + // always try to remove the file from key manager because the clean up in + // rocksdb is not atomic, thus the file may be deleted but key in key + // manager is not. 
if let Some(key_manager) = encryption_key_manager { key_manager.delete_file(clone)?; } @@ -63,19 +65,25 @@ pub fn prepare_sst_for_ingestion, Q: AsRef>( Ok(()) } +pub fn url_for(storage: &E) -> String { + storage + .url() + .map(|url| url.to_string()) + .unwrap_or_else(|err| format!("ErrUrl({})", err)) +} + #[cfg(test)] mod tests { use std::{path::Path, sync::Arc}; use encryption::DataKeyManager; use engine_rocks::{ - util::{new_engine, RocksCFOptions}, - RocksColumnFamilyOptions, RocksDBOptions, RocksEngine, RocksSstWriterBuilder, - RocksTitanDBOptions, + util::new_engine_opt, RocksCfOptions, RocksDbOptions, RocksEngine, RocksSstWriterBuilder, + RocksTitanDbOptions, }; use engine_traits::{ - CfName, ColumnFamilyOptions, DBOptions, EncryptionKeyManager, ImportExt, Peekable, - SstWriter, SstWriterBuilder, TitanDBOptions, + CfName, CfOptions, DbOptions, EncryptionKeyManager, ImportExt, Peekable, SstWriter, + SstWriterBuilder, TitanCfOptions, CF_DEFAULT, }; use tempfile::Builder; use test_util::encryption::new_test_key_manager; @@ -115,8 +123,8 @@ mod tests { } fn check_prepare_sst_for_ingestion( - db_opts: Option, - cf_opts: Option>>, + db_opts: Option, + cf_opts: Option>, key_manager: Option<&DataKeyManager>, was_encrypted: bool, ) { @@ -135,10 +143,11 @@ mod tests { let kvs = [("k1", "v1"), ("k2", "v2"), ("k3", "v3")]; - let cf_name = "default"; - let db = new_engine(path_str, db_opts, &[cf_name], cf_opts).unwrap(); + let db_opts = db_opts.unwrap_or_default(); + let cf_opts = cf_opts.unwrap_or_else(|| vec![(CF_DEFAULT, RocksCfOptions::default())]); + let db = new_engine_opt(path_str, db_opts, cf_opts).unwrap(); - gen_sst_with_kvs(&db, cf_name, sst_path.to_str().unwrap(), &kvs); + gen_sst_with_kvs(&db, CF_DEFAULT, sst_path.to_str().unwrap(), &kvs); if was_encrypted { // Add the file to key_manager to simulate an encrypted file. 
@@ -156,12 +165,12 @@ mod tests { prepare_sst_for_ingestion(&sst_path, &sst_clone, key_manager).unwrap(); check_hard_link(&sst_path, 2); check_hard_link(&sst_clone, 2); - db.ingest_external_file_cf(cf_name, &[sst_clone.to_str().unwrap()]) + db.ingest_external_file_cf(CF_DEFAULT, &[sst_clone.to_str().unwrap()]) .unwrap(); - check_db_with_kvs(&db, cf_name, &kvs); + check_db_with_kvs(&db, CF_DEFAULT, &kvs); assert!(!sst_clone.exists()); - // Since we are not using key_manager in db, simulate the db deleting the file from - // key_manager. + // Since we are not using key_manager in db, simulate the db deleting the file + // from key_manager. if let Some(manager) = key_manager { manager.delete_file(sst_clone.to_str().unwrap()).unwrap(); } @@ -171,34 +180,34 @@ mod tests { prepare_sst_for_ingestion(&sst_path, &sst_clone, key_manager).unwrap(); check_hard_link(&sst_path, 2); check_hard_link(&sst_clone, 1); - db.ingest_external_file_cf(cf_name, &[sst_clone.to_str().unwrap()]) + db.ingest_external_file_cf(CF_DEFAULT, &[sst_clone.to_str().unwrap()]) .unwrap(); - check_db_with_kvs(&db, cf_name, &kvs); + check_db_with_kvs(&db, CF_DEFAULT, &kvs); assert!(!sst_clone.exists()); } #[test] fn test_prepare_sst_for_ingestion() { check_prepare_sst_for_ingestion( - None, None, None, /*key_manager*/ - false, /* was encrypted*/ + None, None, None, // key_manager + false, // was encrypted ); } #[test] fn test_prepare_sst_for_ingestion_titan() { - let mut db_opts = RocksDBOptions::new(); - let mut titan_opts = RocksTitanDBOptions::new(); + let mut db_opts = RocksDbOptions::new(); + let mut titan_opts = RocksTitanDbOptions::new(); // Force all values write out to blob files. 
titan_opts.set_min_blob_size(0); db_opts.set_titandb_options(&titan_opts); - let mut cf_opts = RocksColumnFamilyOptions::new(); - cf_opts.set_titandb_options(&titan_opts); + let mut cf_opts = RocksCfOptions::new(); + cf_opts.set_titan_cf_options(&titan_opts); check_prepare_sst_for_ingestion( Some(db_opts), - Some(vec![RocksCFOptions::new("default", cf_opts)]), - None, /*key_manager*/ - false, /*was_encrypted*/ + Some(vec![(CF_DEFAULT, cf_opts)]), + None, // key_manager + false, // was_encrypted ); } @@ -207,7 +216,7 @@ mod tests { let tmp_dir = tempfile::TempDir::new().unwrap(); let key_manager = new_test_key_manager(&tmp_dir, None, None, None); let manager = Arc::new(key_manager.unwrap().unwrap()); - check_prepare_sst_for_ingestion(None, None, Some(&manager), false /*was_encrypted*/); + check_prepare_sst_for_ingestion(None, None, Some(&manager), false /* was_encrypted */); } #[test] @@ -215,6 +224,6 @@ mod tests { let tmp_dir = tempfile::TempDir::new().unwrap(); let key_manager = new_test_key_manager(&tmp_dir, None, None, None); let manager = Arc::new(key_manager.unwrap().unwrap()); - check_prepare_sst_for_ingestion(None, None, Some(&manager), true /*was_encrypted*/); + check_prepare_sst_for_ingestion(None, None, Some(&manager), true /* was_encrypted */); } } diff --git a/components/test_backup/Cargo.toml b/components/test_backup/Cargo.toml index ea85e329202..1798b50c82b 100644 --- a/components/test_backup/Cargo.toml +++ b/components/test_backup/Cargo.toml @@ -11,24 +11,24 @@ cloud-gcp = ["external_storage_export/cloud-gcp"] cloud-azure = ["external_storage_export/cloud-azure"] [dependencies] -api_version = { path = "../api_version" } -backup = { path = "../backup" } -collections = { path = "../collections" } -concurrency_manager = { path = "../concurrency_manager" } +api_version = { workspace = true } +backup = { workspace = true } +collections = { workspace = true } +concurrency_manager = { workspace = true } crc64fast = "0.1" -engine_traits = { path = 
"../engine_traits" } -external_storage_export = { path = "../external_storage/export", default-features = false } -file_system = { path = "../file_system", default-features = false } +engine_traits = { workspace = true } +external_storage_export = { workspace = true } +file_system = { workspace = true } futures = "0.3" futures-executor = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +grpcio = { workspace = true } +kvproto = { workspace = true } protobuf = "2" rand = "0.8" tempfile = "3.0" -test_raftstore = { path = "../test_raftstore" } -tidb_query_common = { path = "../tidb_query_common" } -tikv = { path = "../../", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } -txn_types = { path = "../txn_types", default-features = false } +test_raftstore = { workspace = true } +tidb_query_common = { workspace = true } +tikv = { workspace = true } +tikv_util = { workspace = true } +txn_types = { workspace = true } diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index f8f96b34921..34eb6e8aa9e 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -8,12 +8,12 @@ use std::{ time::Duration, }; -use api_version::{dispatch_api_version, KvFormat, RawValue}; +use api_version::{dispatch_api_version, keyspace::KvPair, ApiV1, KvFormat, RawValue}; use backup::Task; use collections::HashMap; use engine_traits::{CfName, IterOptions, CF_DEFAULT, CF_WRITE, DATA_KEY_PREFIX_LEN}; use external_storage_export::make_local_backend; -use futures::channel::mpsc as future_mpsc; +use futures::{channel::mpsc as future_mpsc, executor::block_on}; use grpcio::{ChannelBuilder, Environment}; use kvproto::{brpb::*, kvrpcpb::*, tikvpb::TikvClient}; use rand::Rng; @@ -24,9 +24,9 @@ use 
tidb_query_common::storage::{ }; use tikv::{ config::BackupConfig, - coprocessor::{checksum_crc64_xor, dag::TiKvStorage}, + coprocessor::{checksum_crc64_xor, dag::TikvStorage}, storage::{ - kv::{Engine, SnapContext}, + kv::{Engine, LocalTablets, SnapContext}, SnapshotStore, }, }; @@ -52,7 +52,7 @@ pub struct TestSuite { // Retry if encounter error macro_rules! retry_req { - ($call_req: expr, $check_resp: expr, $resp:ident, $retry:literal, $timeout:literal) => { + ($call_req:expr, $check_resp:expr, $resp:ident, $retry:literal, $timeout:literal) => { let start = Instant::now(); let timeout = Duration::from_millis($timeout); let mut tried_times = 0; @@ -73,7 +73,7 @@ impl TestSuite { pub fn new(count: usize, sst_max_size: u64, api_version: ApiVersion) -> TestSuite { let mut cluster = new_server_cluster_with_api_ver(1, count, api_version); // Increase the Raft tick interval to make this test case running reliably. - configure_for_lease_read(&mut cluster, Some(100), None); + configure_for_lease_read(&mut cluster.cfg, Some(100), None); cluster.run(); let mut endpoints = HashMap::default(); @@ -85,7 +85,7 @@ impl TestSuite { *id, sim.storages[id].clone(), sim.region_info_accessors[id].clone(), - engines.kv.as_inner().clone(), + LocalTablets::Singleton(engines.kv.clone()), BackupConfig { num_threads: 4, batch_size: 8, @@ -94,6 +94,7 @@ impl TestSuite { }, sim.get_concurrency_manager(*id), api_version, + None, ); let mut worker = bg_worker.lazy_build(format!("backup-{}", id)); worker.start(backup_endpoint); @@ -255,7 +256,7 @@ impl TestSuite { let mut batch = Vec::with_capacity(1024); let mut keys = Vec::with_capacity(1024); // Write 50 times to include more different ts. 
- let batch_size = cmp::min(cmp::max(key_count / 50, 1), 1024); + let batch_size = (key_count / 50).clamp(1, 1024); for _ in 0..versions { let mut j = 0; while j < key_count { @@ -338,7 +339,7 @@ impl TestSuite { let mut total_kvs = 0; let mut total_bytes = 0; let sim = self.cluster.sim.rl(); - let engine = sim.storages[&self.context.get_peer().get_store_id()].clone(); + let mut engine = sim.storages[&self.context.get_peer().get_store_id()].clone(); let snap_ctx = SnapContext { pb_ctx: &self.context, ..Default::default() @@ -353,16 +354,17 @@ impl TestSuite { Default::default(), false, ); - let mut scanner = RangesScanner::new(RangesScannerOptions { - storage: TiKvStorage::new(snap_store, false), + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { + storage: TikvStorage::new(snap_store, false), ranges: vec![Range::Interval(IntervalRange::from((start, end)))], scan_backward_in_range: false, is_key_only: false, is_scanned_range_aware: false, }); let digest = crc64fast::Digest::new(); - while let Some((k, v)) = scanner.next().unwrap() { - checksum = checksum_crc64_xor(checksum, digest.clone(), &k, &v); + while let Some(row) = block_on(scanner.next()).unwrap() { + let (k, v) = row.kv(); + checksum = checksum_crc64_xor(checksum, digest.clone(), k, v); total_kvs += 1; total_bytes += (k.len() + v.len()) as u64; } @@ -381,7 +383,7 @@ impl TestSuite { let mut total_bytes = 0; let sim = self.cluster.sim.rl(); - let engine = sim.storages[&self.context.get_peer().get_store_id()].clone(); + let mut engine = sim.storages[&self.context.get_peer().get_store_id()].clone(); let snap_ctx = SnapContext { pb_ctx: &self.context, ..Default::default() @@ -391,7 +393,7 @@ impl TestSuite { if !end.is_empty() { iter_opt.set_upper_bound(&end, DATA_KEY_PREFIX_LEN); } - let mut iter = snapshot.iter_cf(cf, iter_opt).unwrap(); + let mut iter = snapshot.iter(cf, iter_opt).unwrap(); if !iter.seek(&start).unwrap() { return (0, 0, 0); diff --git 
a/components/test_coprocessor/Cargo.toml b/components/test_coprocessor/Cargo.toml index 6a12f16138f..03047d75e87 100644 --- a/components/test_coprocessor/Cargo.toml +++ b/components/test_coprocessor/Cargo.toml @@ -20,18 +20,18 @@ test-engines-panic = [ ] [dependencies] -api_version = { path = "../api_version" } -collections = { path = "../collections" } -concurrency_manager = { path = "../concurrency_manager", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false } +api_version = { workspace = true } +collections = { workspace = true } +concurrency_manager = { workspace = true } +engine_rocks = { workspace = true } futures = "0.3" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } protobuf = "2" -resource_metering = { path = "../resource_metering" } -test_storage = { path = "../test_storage", default-features = false } -tidb_query_common = { path = "../tidb_query_common", default-features = false } -tidb_query_datatype = { path = "../tidb_query_datatype", default-features = false } -tikv = { path = "../../", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } -tipb = { git = "https://github.com/pingcap/tipb.git" } -txn_types = { path = "../txn_types", default-features = false } +resource_metering = { workspace = true } +test_storage = { workspace = true } +tidb_query_common = { workspace = true } +tidb_query_datatype = { workspace = true } +tikv = { workspace = true } +tikv_util = { workspace = true } +tipb = { workspace = true } +txn_types = { workspace = true } diff --git a/components/test_coprocessor/src/dag.rs b/components/test_coprocessor/src/dag.rs index 38476f694f5..76e91cc6ef5 100644 --- a/components/test_coprocessor/src/dag.rs +++ b/components/test_coprocessor/src/dag.rs @@ -15,7 +15,7 @@ use tipb::{ use super::*; -pub struct DAGSelect { +pub struct DagSelect { pub execs: Vec, pub cols: Vec, pub order_by: Vec, @@ -27,8 +27,8 @@ pub 
struct DAGSelect { pub paging_size: Option, } -impl DAGSelect { - pub fn from(table: &Table) -> DAGSelect { +impl DagSelect { + pub fn from(table: &Table) -> DagSelect { let mut exec = Executor::default(); exec.set_tp(ExecType::TypeTableScan); let mut tbl_scan = TableScan::default(); @@ -38,7 +38,7 @@ impl DAGSelect { tbl_scan.set_columns(columns_info); exec.set_tbl_scan(tbl_scan); - DAGSelect { + DagSelect { execs: vec![exec], cols: table.columns_info(), order_by: vec![], @@ -51,7 +51,7 @@ impl DAGSelect { } } - pub fn from_index(table: &Table, index: &Column) -> DAGSelect { + pub fn from_index(table: &Table, index: &Column) -> DagSelect { let idx = index.index; let mut exec = Executor::default(); exec.set_tp(ExecType::TypeIndexScan); @@ -65,7 +65,7 @@ impl DAGSelect { exec.set_idx_scan(scan); let range = table.get_index_range_all(idx); - DAGSelect { + DagSelect { execs: vec![exec], cols: columns_info.to_vec(), order_by: vec![], @@ -79,13 +79,13 @@ impl DAGSelect { } #[must_use] - pub fn limit(mut self, n: u64) -> DAGSelect { + pub fn limit(mut self, n: u64) -> DagSelect { self.limit = Some(n); self } #[must_use] - pub fn order_by(mut self, col: &Column, desc: bool) -> DAGSelect { + pub fn order_by(mut self, col: &Column, desc: bool) -> DagSelect { let col_offset = offset_for_column(&self.cols, col.id); let mut item = ByItem::default(); let mut expr = Expr::default(); @@ -99,12 +99,12 @@ impl DAGSelect { } #[must_use] - pub fn count(self, col: &Column) -> DAGSelect { + pub fn count(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::Count) } #[must_use] - pub fn aggr_col(mut self, col: &Column, aggr_t: ExprType) -> DAGSelect { + pub fn aggr_col(mut self, col: &Column, aggr_t: ExprType) -> DagSelect { let col_offset = offset_for_column(&self.cols, col.id); let mut col_expr = Expr::default(); col_expr.set_field_type(col.as_field_type()); @@ -112,7 +112,8 @@ impl DAGSelect { col_expr.mut_val().encode_i64(col_offset).unwrap(); let mut expr = 
Expr::default(); let mut expr_ft = col.as_field_type(); - // Avg will contains two auxiliary columns (sum, count) and the sum should be a `Decimal` + // Avg will contains two auxiliary columns (sum, count) and the sum should be a + // `Decimal` if aggr_t == ExprType::Avg || aggr_t == ExprType::Sum { expr_ft.set_tp(0xf6); // FieldTypeTp::NewDecimal } @@ -124,47 +125,47 @@ impl DAGSelect { } #[must_use] - pub fn first(self, col: &Column) -> DAGSelect { + pub fn first(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::First) } #[must_use] - pub fn sum(self, col: &Column) -> DAGSelect { + pub fn sum(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::Sum) } #[must_use] - pub fn avg(self, col: &Column) -> DAGSelect { + pub fn avg(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::Avg) } #[must_use] - pub fn max(self, col: &Column) -> DAGSelect { + pub fn max(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::Max) } #[must_use] - pub fn min(self, col: &Column) -> DAGSelect { + pub fn min(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::Min) } #[must_use] - pub fn bit_and(self, col: &Column) -> DAGSelect { + pub fn bit_and(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::AggBitAnd) } #[must_use] - pub fn bit_or(self, col: &Column) -> DAGSelect { + pub fn bit_or(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::AggBitOr) } #[must_use] - pub fn bit_xor(self, col: &Column) -> DAGSelect { + pub fn bit_xor(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::AggBitXor) } #[must_use] - pub fn group_by(mut self, cols: &[&Column]) -> DAGSelect { + pub fn group_by(mut self, cols: &[&Column]) -> DagSelect { for col in cols { let offset = offset_for_column(&self.cols, col.id); let mut expr = Expr::default(); @@ -177,13 +178,13 @@ impl DAGSelect { } #[must_use] - pub fn output_offsets(mut self, output_offsets: Option>) -> DAGSelect { + pub fn output_offsets(mut self, 
output_offsets: Option>) -> DagSelect { self.output_offsets = output_offsets; self } #[must_use] - pub fn where_expr(mut self, expr: Expr) -> DAGSelect { + pub fn where_expr(mut self, expr: Expr) -> DagSelect { let mut exec = Executor::default(); exec.set_tp(ExecType::TypeSelection); let mut selection = Selection::default(); @@ -194,20 +195,20 @@ impl DAGSelect { } #[must_use] - pub fn desc(mut self, desc: bool) -> DAGSelect { + pub fn desc(mut self, desc: bool) -> DagSelect { self.execs[0].mut_tbl_scan().set_desc(desc); self } #[must_use] - pub fn paging_size(mut self, paging_size: u64) -> DAGSelect { + pub fn paging_size(mut self, paging_size: u64) -> DagSelect { assert_ne!(paging_size, 0); self.paging_size = Some(paging_size); self } #[must_use] - pub fn key_ranges(mut self, key_ranges: Vec) -> DAGSelect { + pub fn key_ranges(mut self, key_ranges: Vec) -> DagSelect { self.key_ranges = key_ranges; self } @@ -276,15 +277,15 @@ impl DAGSelect { } } -pub struct DAGChunkSpliter { +pub struct DagChunkSpliter { chunks: Vec, datums: Vec, col_cnt: usize, } -impl DAGChunkSpliter { - pub fn new(chunks: Vec, col_cnt: usize) -> DAGChunkSpliter { - DAGChunkSpliter { +impl DagChunkSpliter { + pub fn new(chunks: Vec, col_cnt: usize) -> DagChunkSpliter { + DagChunkSpliter { chunks, col_cnt, datums: Vec::with_capacity(0), @@ -292,7 +293,7 @@ impl DAGChunkSpliter { } } -impl Iterator for DAGChunkSpliter { +impl Iterator for DagChunkSpliter { type Item = Vec; fn next(&mut self) -> Option> { diff --git a/components/test_coprocessor/src/fixture.rs b/components/test_coprocessor/src/fixture.rs index c7feacedbfe..5e94d3e47fe 100644 --- a/components/test_coprocessor/src/fixture.rs +++ b/components/test_coprocessor/src/fixture.rs @@ -5,14 +5,14 @@ use std::sync::Arc; use concurrency_manager::ConcurrencyManager; use kvproto::kvrpcpb::Context; use resource_metering::ResourceTagFactory; -use tidb_query_datatype::codec::Datum; +use tidb_query_datatype::codec::{row::v2::CODEC_VERSION, Datum}; 
use tikv::{ config::CoprReadPoolConfig, coprocessor::{readpool_impl, Endpoint}, read_pool::ReadPool, server::Config, storage::{ - kv::RocksEngine, lock_manager::DummyLockManager, Engine, TestEngineBuilder, + kv::RocksEngine, lock_manager::MockLockManager, Engine, TestEngineBuilder, TestStorageBuilderApiV1, }, }; @@ -67,10 +67,31 @@ pub fn init_data_with_engine_and_commit( tbl: &ProductTable, vals: &[(i64, Option<&str>, i64)], commit: bool, -) -> (Store, Endpoint) { +) -> (Store, Endpoint, Arc) { init_data_with_details(ctx, engine, tbl, vals, commit, &Config::default()) } +pub fn init_data_with_engine_and_commit_v2_checksum( + ctx: Context, + engine: E, + tbl: &ProductTable, + vals: &[(i64, Option<&str>, i64)], + commit: bool, + with_checksum: bool, + extra_checksum: Option, +) -> (Store, Endpoint, Arc) { + init_data_with_details_v2_checksum( + ctx, + engine, + tbl, + vals, + commit, + &Config::default(), + with_checksum, + extra_checksum, + ) +} + pub fn init_data_with_details( ctx: Context, engine: E, @@ -78,20 +99,65 @@ pub fn init_data_with_details( vals: &[(i64, Option<&str>, i64)], commit: bool, cfg: &Config, -) -> (Store, Endpoint) { - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) +) -> (Store, Endpoint, Arc) { + init_data_with_details_impl(ctx, engine, tbl, vals, commit, cfg, 0, false, None) +} + +pub fn init_data_with_details_v2_checksum( + ctx: Context, + engine: E, + tbl: &ProductTable, + vals: &[(i64, Option<&str>, i64)], + commit: bool, + cfg: &Config, + with_checksum: bool, + extra_checksum: Option, +) -> (Store, Endpoint, Arc) { + init_data_with_details_impl( + ctx, + engine, + tbl, + vals, + commit, + cfg, + CODEC_VERSION, + with_checksum, + extra_checksum, + ) +} + +fn init_data_with_details_impl( + ctx: Context, + engine: E, + tbl: &ProductTable, + vals: &[(i64, Option<&str>, i64)], + commit: bool, + cfg: &Config, + codec_ver: u8, + with_checksum: bool, + extra_checksum: Option, +) -> (Store, Endpoint, 
Arc) { + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .build() .unwrap(); let mut store = Store::from_storage(storage); store.begin(); for &(id, name, count) in vals { - store + let mut inserts = store .insert_into(tbl) .set(&tbl["id"], Datum::I64(id)) .set(&tbl["name"], name.map(str::as_bytes).into()) - .set(&tbl["count"], Datum::I64(count)) - .execute_with_ctx(ctx.clone()); + .set(&tbl["count"], Datum::I64(count)); + if codec_ver == CODEC_VERSION { + inserts = inserts + .set_v2(&tbl["id"], id.into()) + .set_v2(&tbl["name"], name.unwrap().into()) + .set_v2(&tbl["count"], count.into()); + inserts.execute_with_v2_checksum(ctx.clone(), with_checksum, extra_checksum); + } else { + inserts.execute_with_ctx(ctx.clone()); + } } if commit { store.commit_with_ctx(ctx); @@ -103,29 +169,59 @@ pub fn init_data_with_details( store.get_engine(), )); let cm = ConcurrencyManager::new(1.into()); + let limiter = Arc::new(QuotaLimiter::default()); let copr = Endpoint::new( cfg, pool.handle(), cm, ResourceTagFactory::new_for_test(), - Arc::new(QuotaLimiter::default()), + limiter.clone(), ); - (store, copr) + (store, copr, limiter) } pub fn init_data_with_commit( tbl: &ProductTable, vals: &[(i64, Option<&str>, i64)], commit: bool, -) -> (Store, Endpoint) { +) -> (Store, Endpoint, Arc) { let engine = TestEngineBuilder::new().build().unwrap(); init_data_with_engine_and_commit(Context::default(), engine, tbl, vals, commit) } -// This function will create a Product table and initialize with the specified data. +// This function will create a Product table and initialize with the specified +// data. 
pub fn init_with_data( tbl: &ProductTable, vals: &[(i64, Option<&str>, i64)], ) -> (Store, Endpoint) { + let (store, endpoint, _) = init_data_with_commit(tbl, vals, true); + (store, endpoint) +} + +// Same as init_with_data except returned values include Arc +pub fn init_with_data_ext( + tbl: &ProductTable, + vals: &[(i64, Option<&str>, i64)], +) -> (Store, Endpoint, Arc) { init_data_with_commit(tbl, vals, true) } + +pub fn init_data_with_commit_v2_checksum( + tbl: &ProductTable, + vals: &[(i64, Option<&str>, i64)], + with_checksum: bool, + extra_checksum: Option, +) -> (Store, Endpoint) { + let engine = TestEngineBuilder::new().build().unwrap(); + let (store, endpoint, _) = init_data_with_engine_and_commit_v2_checksum( + Context::default(), + engine, + tbl, + vals, + true, + with_checksum, + extra_checksum, + ); + (store, endpoint) +} diff --git a/components/test_coprocessor/src/store.rs b/components/test_coprocessor/src/store.rs index a85f75c422e..96f405d8f39 100644 --- a/components/test_coprocessor/src/store.rs +++ b/components/test_coprocessor/src/store.rs @@ -6,14 +6,19 @@ use collections::HashMap; use kvproto::kvrpcpb::{Context, IsolationLevel}; use test_storage::SyncTestStorageApiV1; use tidb_query_datatype::{ - codec::{datum, table, Datum}, + codec::{ + data_type::ScalarValue, + datum, + row::v2::encoder_for_test::{Column as ColumnV2, RowEncoder}, + table, Datum, + }, expr::EvalContext, }; use tikv::{ server::gc_worker::GcConfig, storage::{ kv::{Engine, RocksEngine}, - lock_manager::DummyLockManager, + lock_manager::MockLockManager, txn::FixtureStore, SnapshotStore, StorageApiV1, TestStorageBuilderApiV1, }, @@ -26,6 +31,7 @@ pub struct Insert<'a, E: Engine> { store: &'a mut Store, table: &'a Table, values: BTreeMap, + values_v2: BTreeMap, } impl<'a, E: Engine> Insert<'a, E> { @@ -34,6 +40,7 @@ impl<'a, E: Engine> Insert<'a, E> { store, table, values: BTreeMap::new(), + values_v2: BTreeMap::new(), } } @@ -44,10 +51,26 @@ impl<'a, E: Engine> Insert<'a, E> { 
self } + pub fn set_v2(mut self, col: &Column, value: ScalarValue) -> Self { + assert!(self.table.column_by_id(col.id).is_some()); + self.values_v2.insert(col.id, value); + self + } + pub fn execute(self) -> i64 { self.execute_with_ctx(Context::default()) } + fn prepare_index_kv(&self, handle: &Datum, buf: &mut Vec<(Vec, Vec)>) { + for (&id, idxs) in &self.table.idxs { + let mut v: Vec<_> = idxs.iter().map(|id| self.values[id].clone()).collect(); + v.push(handle.clone()); + let encoded = datum::encode_key(&mut EvalContext::default(), &v).unwrap(); + let idx_key = table::encode_index_seek_key(self.table.id, id, &encoded); + buf.push((idx_key, vec![0])); + } + } + pub fn execute_with_ctx(self, ctx: Context) -> i64 { let handle = self .values @@ -59,13 +82,44 @@ impl<'a, E: Engine> Insert<'a, E> { let values: Vec<_> = self.values.values().cloned().collect(); let value = table::encode_row(&mut EvalContext::default(), values, &ids).unwrap(); let mut kvs = vec![(key, value)]; - for (&id, idxs) in &self.table.idxs { - let mut v: Vec<_> = idxs.iter().map(|id| self.values[id].clone()).collect(); - v.push(handle.clone()); - let encoded = datum::encode_key(&mut EvalContext::default(), &v).unwrap(); - let idx_key = table::encode_index_seek_key(self.table.id, id, &encoded); - kvs.push((idx_key, vec![0])); + self.prepare_index_kv(&handle, &mut kvs); + self.store.put(ctx, kvs); + handle.i64() + } + + pub fn execute_with_v2_checksum( + self, + ctx: Context, + with_checksum: bool, + extra_checksum: Option, + ) -> i64 { + let handle = self + .values + .get(&self.table.handle_id) + .cloned() + .unwrap_or_else(|| Datum::I64(next_id())); + let key = table::encode_row_key(self.table.id, handle.i64()); + let mut columns: Vec = Vec::new(); + for (id, value) in self.values_v2.iter() { + let col_info = self.table.column_by_id(*id).unwrap(); + columns.push(ColumnV2::new_with_ft( + *id, + col_info.as_field_type(), + value.to_owned(), + )); + } + let mut val_buf = Vec::new(); + if 
with_checksum { + val_buf + .write_row_with_checksum(&mut EvalContext::default(), columns, extra_checksum) + .unwrap(); + } else { + val_buf + .write_row(&mut EvalContext::default(), columns) + .unwrap(); } + let mut kvs = vec![(key, val_buf)]; + self.prepare_index_kv(&handle, &mut kvs); self.store.put(ctx, kvs); handle.i64() } @@ -116,7 +170,7 @@ pub struct Store { impl Store { pub fn new() -> Self { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); Self::from_storage(storage) @@ -130,9 +184,9 @@ impl Default for Store { } impl Store { - pub fn from_storage(storage: StorageApiV1) -> Self { + pub fn from_storage(storage: StorageApiV1) -> Self { Self { - store: SyncTestStorageApiV1::from_storage(storage, GcConfig::default()).unwrap(), + store: SyncTestStorageApiV1::from_storage(0, storage, GcConfig::default()).unwrap(), current_ts: 1.into(), last_committed_ts: TimeStamp::zero(), handles: vec![], diff --git a/components/test_coprocessor/src/table.rs b/components/test_coprocessor/src/table.rs index 91910d4c2bf..af070f62759 100644 --- a/components/test_coprocessor/src/table.rs +++ b/components/test_coprocessor/src/table.rs @@ -88,7 +88,8 @@ impl Table { range } - /// Create a `KeyRange` which select records in the range. The end_handle_id is included. + /// Create a `KeyRange` which select records in the range. The end_handle_id + /// is included. pub fn get_record_range(&self, start_handle_id: i64, end_handle_id: i64) -> KeyRange { let mut range = KeyRange::default(); range.set_start(table::encode_row_key(self.id, start_handle_id)); @@ -103,7 +104,8 @@ impl Table { self.get_record_range(handle_id, handle_id) } - /// Create a `KeyRange` which select all index records of a specified index in current table. + /// Create a `KeyRange` which select all index records of a specified index + /// in current table. 
pub fn get_index_range_all(&self, idx: i64) -> KeyRange { let mut range = KeyRange::default(); let mut buf = Vec::with_capacity(8); diff --git a/components/test_coprocessor_plugin/example_plugin/Cargo.toml b/components/test_coprocessor_plugin/example_plugin/Cargo.toml index cda1f2fa0c7..6bbc8b25012 100644 --- a/components/test_coprocessor_plugin/example_plugin/Cargo.toml +++ b/components/test_coprocessor_plugin/example_plugin/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "example_plugin" +name = "example_coprocessor_plugin" version = "0.1.0" edition = "2018" publish = false @@ -8,4 +8,4 @@ publish = false crate-type = ["dylib"] [dependencies] -coprocessor_plugin_api = { path = "../../coprocessor_plugin_api" } +coprocessor_plugin_api = { workspace = true } diff --git a/components/test_pd/Cargo.toml b/components/test_pd/Cargo.toml index efdc1a5a23c..7747ac1bbc6 100644 --- a/components/test_pd/Cargo.toml +++ b/components/test_pd/Cargo.toml @@ -5,13 +5,16 @@ edition = "2018" publish = false [dependencies] -collections = { path = "../collections" } +collections = { workspace = true } fail = "0.5" futures = "0.3" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } -pd_client = { path = "../pd_client", default-features = false } -security = { path = "../security", default-features = false } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../tikv_util", default-features = false } +grpcio = { workspace = true } +kvproto = { workspace = true } +log_wrappers = { workspace = true } +pd_client = { workspace = true } +security = { workspace = true } +slog = { workspace = true } +slog-global = { workspace = true } +tikv_util = { workspace = true } +tokio = { version = 
"1.0", features = ["full"] } +tokio-stream = "0.1" diff --git a/components/test_pd/src/lib.rs b/components/test_pd/src/lib.rs index 187a899d7fb..bd768e58318 100644 --- a/components/test_pd/src/lib.rs +++ b/components/test_pd/src/lib.rs @@ -1,4 +1,5 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. +#![feature(slice_group_by)] #[macro_use] extern crate tikv_util; diff --git a/components/test_pd/src/mocker/etcd.rs b/components/test_pd/src/mocker/etcd.rs new file mode 100644 index 00000000000..d0fe3f43e68 --- /dev/null +++ b/components/test_pd/src/mocker/etcd.rs @@ -0,0 +1,299 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + cell::Cell, + collections::{BTreeMap, HashMap}, + ops::Bound, + sync::Arc, +}; + +use futures::lock::Mutex; +use tokio::sync::mpsc::{self, Sender}; +use tokio_stream::wrappers::ReceiverStream; + +use super::Result; + +/// An in-memory, single versioned storage. +/// Emulating some interfaces of etcd for testing. 
+#[derive(Default, Debug)] +pub struct Etcd { + items: BTreeMap, + subs: HashMap, + revision: i64, + sub_id_alloc: Cell, +} + +pub type EtcdClient = Arc>; + +impl Etcd { + fn alloc_rev(&mut self) -> i64 { + self.revision += 1; + self.revision + } + + pub fn get_revision(&self) -> i64 { + self.revision + } + + pub fn get_key(&self, keys: Keys) -> (Vec, i64) { + let (start_key, end_key) = keys.into_bound(); + let kvs = self + .items + .range(( + Bound::Included(&Key(start_key, 0)), + Bound::Excluded(&Key(end_key, self.revision)), + )) + .collect::>() + .as_slice() + .group_by(|item1, item2| item1.0.0 == item2.0.0) + .filter_map(|group| { + let (k, v) = group.last()?; + match v { + Value::Val(val) => Some(KeyValue(MetaKey(k.0.clone()), val.clone())), + Value::Del(_) => None, + } + }) + .fold(Vec::new(), |mut items, item| { + items.push(item); + items + }); + + (kvs, self.get_revision()) + } + + pub async fn set(&mut self, mut pair: KeyValue) -> Result<()> { + let rev = self.alloc_rev(); + for sub in self.subs.values() { + if pair.key() < sub.end_key.as_slice() && pair.key() >= sub.start_key.as_slice() { + sub.tx + .send(KvEvent { + kind: KvEventType::Put, + pair: pair.clone(), + }) + .await + .unwrap(); + } + } + self.items + .insert(Key(pair.take_key(), rev), Value::Val(pair.take_value())); + Ok(()) + } + + pub async fn delete(&mut self, keys: Keys) -> Result<()> { + let (start_key, end_key) = keys.into_bound(); + let rev = self.alloc_rev(); + let mut v = self + .items + .range(( + Bound::Included(Key(start_key, 0)), + Bound::Excluded(Key(end_key, self.revision)), + )) + .map(|(k, v)| (Key::clone(k), v.clone())) + .collect::>(); + v.dedup_by(|k1, k2| k1.0 == k2.0); + + for (victim, data) in v { + let k = Key(victim.0.clone(), rev); + let data = data.take_data(); + self.items.insert(k, Value::Del(data.clone())); + + for sub in self.subs.values() { + if victim.0.as_slice() < sub.end_key.as_slice() + && victim.0.as_slice() >= sub.start_key.as_slice() + { + sub.tx + 
.send(KvEvent { + kind: KvEventType::Delete, + pair: KeyValue(MetaKey(victim.0.clone()), data.clone()), + }) + .await + .unwrap(); + } + } + } + Ok(()) + } + + pub async fn watch(&mut self, keys: Keys, start_rev: i64) -> Result> { + let id = self.sub_id_alloc.get(); + self.sub_id_alloc.set(id + 1); + let (tx, rx) = mpsc::channel(1024); + let (start_key, end_key) = keys.into_bound(); + + // Sending events from [start_rev, now) to the client. + let mut pending = self + .items + .range(( + Bound::Included(Key(start_key.clone(), 0)), + Bound::Excluded(Key(end_key.clone(), self.revision)), + )) + .filter(|(k, _)| k.1 >= start_rev) + .collect::>(); + pending.sort_by_key(|(k, _)| k.1); + for (k, v) in pending { + let event = match v { + Value::Val(val) => KvEvent { + kind: KvEventType::Put, + pair: KeyValue(MetaKey(k.0.clone()), val.clone()), + }, + Value::Del(val) => KvEvent { + kind: KvEventType::Delete, + pair: KeyValue(MetaKey(k.0.clone()), val.clone()), + }, + }; + tx.send(event).await.expect("too many pending events"); + } + + self.subs.insert( + id, + Subscriber { + start_key, + end_key, + tx, + }, + ); + Ok(ReceiverStream::new(rx)) + } + + pub fn clear_subs(&mut self) { + self.subs.clear(); + self.sub_id_alloc.set(0); + } + + /// A tool for dumpling the whole storage when test failed. + /// Add this to test code temporarily for debugging. + #[allow(dead_code)] + pub fn dump(&self) { + println!(">>>>>>> /etc (revision = {}) <<<<<<<", self.revision); + for (k, v) in self.items.iter() { + println!("{:?} => {:?}", k, v); + } + } +} + +#[derive(Clone, Debug)] +pub struct MetaKey(pub Vec); + +impl MetaKey { + /// return the key that keeps the range [self, self.next()) contains only + /// `self`. + pub fn next(&self) -> Self { + let mut next = self.clone(); + next.0.push(0); + next + } + + /// return the key that keeps the range [self, self.next_prefix()) contains + /// all keys with the prefix `self`. 
+ pub fn next_prefix(&self) -> Self { + let mut next_prefix = self.clone(); + for i in (0..next_prefix.0.len()).rev() { + if next_prefix.0[i] == u8::MAX { + next_prefix.0.pop(); + } else { + next_prefix.0[i] += 1; + break; + } + } + next_prefix + } +} + +/// A simple key value pair of metadata. +#[derive(Clone, Debug)] +pub struct KeyValue(pub MetaKey, pub Vec); + +impl KeyValue { + pub fn key(&self) -> &[u8] { + self.0.0.as_slice() + } + + pub fn value(&self) -> &[u8] { + self.1.as_slice() + } + + pub fn take_key(&mut self) -> Vec { + std::mem::take(&mut self.0.0) + } + + pub fn take_value(&mut self) -> Vec { + std::mem::take(&mut self.1) + } +} + +#[derive(Debug)] +pub enum KvEventType { + Put, + Delete, +} + +#[derive(Debug)] +pub struct KvEvent { + pub kind: KvEventType, + pub pair: KeyValue, +} + +#[derive(Debug)] +struct Subscriber { + start_key: Vec, + end_key: Vec, + tx: Sender, +} + +/// A key with revision. +#[derive(Default, Eq, PartialEq, Ord, PartialOrd, Clone)] +struct Key(Vec, i64); + +impl std::fmt::Debug for Key { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("Key") + .field(&format_args!( + "{}@{}", + log_wrappers::Value::key(&self.0), + self.1 + )) + .finish() + } +} + +/// A value (maybe tombstone.) +#[derive(Debug, PartialEq, Clone)] +enum Value { + Val(Vec), + // the value is the last put val. This is used for watch changes. + Del(Vec), +} + +impl Value { + fn take_data(self) -> Vec { + match self { + Value::Val(d) => d, + Value::Del(d) => d, + } + } +} + +/// The key set for getting. +#[derive(Debug)] +pub enum Keys { + Prefix(MetaKey), + Range(MetaKey, MetaKey), + Key(MetaKey), +} + +impl Keys { + /// convert the key set for corresponding key range. 
+ pub fn into_bound(self) -> (Vec, Vec) { + match self { + Keys::Prefix(x) => { + let next = x.next_prefix().0; + ((x.0), (next)) + } + Keys::Range(start, end) => ((start.0), (end.0)), + Keys::Key(k) => { + let next = k.next().0; + ((k.0), (next)) + } + } + } +} diff --git a/components/test_pd/src/mocker/meta_storage.rs b/components/test_pd/src/mocker/meta_storage.rs new file mode 100644 index 00000000000..311c3884722 --- /dev/null +++ b/components/test_pd/src/mocker/meta_storage.rs @@ -0,0 +1,113 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::{Arc, Mutex}; + +use futures::{executor::block_on, SinkExt, StreamExt}; +use grpcio::{RpcStatus, RpcStatusCode}; +use kvproto::meta_storagepb as mpb; + +use super::etcd::{Etcd, KeyValue, Keys, KvEventType, MetaKey}; +use crate::PdMocker; + +#[derive(Default)] +pub struct MetaStorage { + store: Arc>, +} + +fn convert_kv(from: KeyValue) -> mpb::KeyValue { + let mut kv = mpb::KeyValue::default(); + kv.set_key(from.0.0); + kv.set_value(from.1); + kv +} + +fn check_header(h: &mpb::RequestHeader) -> super::Result<()> { + if h.get_source().is_empty() { + return Err(format!("Please provide header.source; req = {:?}", h)); + } + Ok(()) +} + +fn header_of_revision(r: i64) -> mpb::ResponseHeader { + let mut h = mpb::ResponseHeader::default(); + h.set_revision(r); + h +} + +impl PdMocker for MetaStorage { + fn meta_store_get(&self, req: mpb::GetRequest) -> Option> { + if let Err(err) = check_header(req.get_header()) { + return Some(Err(err)); + } + + let store = self.store.lock().unwrap(); + let key = if req.get_range_end().is_empty() { + Keys::Key(MetaKey(req.get_key().to_vec())) + } else { + Keys::Range( + MetaKey(req.get_key().to_vec()), + MetaKey(req.get_range_end().to_vec()), + ) + }; + let (items, rev) = store.get_key(key); + let mut resp = mpb::GetResponse::new(); + resp.set_kvs(items.into_iter().map(convert_kv).collect()); + resp.set_header(header_of_revision(rev)); + Some(Ok(resp)) + } + 
+ fn meta_store_put(&self, mut req: mpb::PutRequest) -> Option> { + if let Err(err) = check_header(req.get_header()) { + return Some(Err(err)); + } + + let mut store = self.store.lock().unwrap(); + block_on(store.set(KeyValue(MetaKey(req.take_key()), req.take_value()))).unwrap(); + Some(Ok(Default::default())) + } + + fn meta_store_watch( + &self, + req: mpb::WatchRequest, + mut sink: grpcio::ServerStreamingSink, + ctx: &grpcio::RpcContext<'_>, + ) -> bool { + if let Err(err) = check_header(req.get_header()) { + ctx.spawn(async move { + sink.fail(RpcStatus::with_message( + RpcStatusCode::INVALID_ARGUMENT, + err, + )) + .await + .unwrap() + }); + return true; + } + + let mut store = self.store.lock().unwrap(); + let key = if req.get_range_end().is_empty() { + Keys::Key(MetaKey(req.get_key().to_vec())) + } else { + Keys::Range( + MetaKey(req.get_key().to_vec()), + MetaKey(req.get_range_end().to_vec()), + ) + }; + let mut watcher = + block_on(store.watch(key, req.get_start_revision())).expect("should be infallible"); + ctx.spawn(async move { + while let Some(x) = watcher.next().await { + let mut event = mpb::Event::new(); + event.set_kv(convert_kv(x.pair)); + event.set_type(match x.kind { + KvEventType::Put => mpb::EventEventType::Put, + KvEventType::Delete => mpb::EventEventType::Delete, + }); + let mut resp = mpb::WatchResponse::default(); + resp.set_events(vec![event].into()); + sink.send((resp, Default::default())).await.unwrap(); + } + }); + true + } +} diff --git a/components/test_pd/src/mocker/mod.rs b/components/test_pd/src/mocker/mod.rs index d904c95d4a8..f4b6dafb6b6 100644 --- a/components/test_pd/src/mocker/mod.rs +++ b/components/test_pd/src/mocker/mod.rs @@ -2,19 +2,24 @@ use std::result; -use kvproto::pdpb::*; +use futures::executor::block_on; +use kvproto::{meta_storagepb as mpb, pdpb::*}; mod bootstrap; +pub mod etcd; mod incompatible; mod leader_change; +mod meta_storage; mod retry; mod service; mod split; +use self::etcd::{EtcdClient, KeyValue, Keys, 
MetaKey}; pub use self::{ bootstrap::AlreadyBootstrapped, incompatible::Incompatible, leader_change::LeaderChange, + meta_storage::MetaStorage, retry::{NotRetry, Retry}, service::Service, split::Split, @@ -25,31 +30,83 @@ pub const DEFAULT_CLUSTER_ID: u64 = 42; pub type Result = result::Result; pub trait PdMocker { + fn meta_store_get(&self, _req: mpb::GetRequest) -> Option> { + None + } + + fn meta_store_put(&self, _req: mpb::PutRequest) -> Option> { + None + } + + fn meta_store_watch( + &self, + _req: mpb::WatchRequest, + _sink: grpcio::ServerStreamingSink, + _ctx: &grpcio::RpcContext<'_>, + ) -> bool { + false + } + fn load_global_config( &self, - req: &LoadGlobalConfigRequest, + _req: &LoadGlobalConfigRequest, + etcd_client: EtcdClient, ) -> Option> { - let mut send = vec![]; - for r in req.get_names() { - let mut i = GlobalConfigItem::default(); - i.set_name(format!("/global/config/{}", r.clone())); - i.set_value(r.clone()); - send.push(i); - } let mut res = LoadGlobalConfigResponse::default(); - res.set_items(send.into()); + let mut items = Vec::new(); + let (resp, revision) = block_on(async move { + etcd_client.lock().await.get_key(Keys::Range( + MetaKey(b"".to_vec()), + MetaKey(b"\xff".to_vec()), + )) + }); + + let values: Vec = resp + .iter() + .map(|kv| { + let mut item = GlobalConfigItem::default(); + item.set_name(String::from_utf8(kv.key().to_vec()).unwrap()); + item.set_payload(kv.value().into()); + item + }) + .collect(); + + items.extend(values); + res.set_revision(revision); + res.set_items(items.into()); Some(Ok(res)) } fn store_global_config( &self, - _: &StoreGlobalConfigRequest, + req: &StoreGlobalConfigRequest, + etcd_client: EtcdClient, ) -> Option> { - unimplemented!() + for item in req.get_changes() { + let cli = etcd_client.clone(); + block_on(async move { + match item.get_kind() { + EventType::Put => { + let kv = + KeyValue(MetaKey(item.get_name().into()), item.get_payload().into()); + cli.lock().await.set(kv).await + } + 
EventType::Delete => { + let key = Keys::Key(MetaKey(item.get_name().into())); + cli.lock().await.delete(key).await + } + } + }) + .unwrap(); + } + Some(Ok(StoreGlobalConfigResponse::default())) } - fn watch_global_config(&self) -> Option> { - panic!("could not mock this function due to it should return a stream") + fn watch_global_config( + &self, + _req: &WatchGlobalConfigRequest, + ) -> Option> { + unimplemented!() } fn get_members(&self, _: &GetMembersRequest) -> Option> { @@ -95,6 +152,10 @@ pub trait PdMocker { None } + fn report_buckets(&self, _: &ReportBucketsRequest) -> Option> { + None + } + fn get_region(&self, _: &GetRegionRequest) -> Option> { None } diff --git a/components/test_pd/src/mocker/retry.rs b/components/test_pd/src/mocker/retry.rs index ef49aee3f66..be9c90633c0 100644 --- a/components/test_pd/src/mocker/retry.rs +++ b/components/test_pd/src/mocker/retry.rs @@ -87,11 +87,9 @@ impl Default for NotRetry { impl PdMocker for NotRetry { fn get_region_by_id(&self, _: &GetRegionByIdRequest) -> Option> { if !self.is_visited.swap(true, Ordering::Relaxed) { - info!( - "[NotRetry] get_region_by_id returns Ok(_) with header has IncompatibleVersion error" - ); + info!("[NotRetry] get_region_by_id returns Ok(_) with header has RegionNotFound error"); let mut err = Error::default(); - err.set_type(ErrorType::IncompatibleVersion); + err.set_type(ErrorType::RegionNotFound); let mut resp = GetRegionResponse::default(); resp.mut_header().set_error(err); Some(Ok(resp)) @@ -103,11 +101,9 @@ impl PdMocker for NotRetry { fn get_store(&self, _: &GetStoreRequest) -> Option> { if !self.is_visited.swap(true, Ordering::Relaxed) { - info!( - "[NotRetry] get_region_by_id returns Ok(_) with header has IncompatibleVersion error" - ); + info!("[NotRetry] get_region_by_id returns Ok(_) with header has Unknown error"); let mut err = Error::default(); - err.set_type(ErrorType::IncompatibleVersion); + err.set_type(ErrorType::Unknown); let mut resp = GetStoreResponse::default(); 
resp.mut_header().set_error(err); Some(Ok(resp)) diff --git a/components/test_pd/src/mocker/service.rs b/components/test_pd/src/mocker/service.rs index 95ffde14b7c..330a5375fb2 100644 --- a/components/test_pd/src/mocker/service.rs +++ b/components/test_pd/src/mocker/service.rs @@ -8,7 +8,7 @@ use std::sync::{ use collections::HashMap; use fail::fail_point; use kvproto::{ - metapb::{Peer, Region, Store, StoreState}, + metapb::{Buckets, Peer, Region, Store, StoreState}, pdpb::*, }; @@ -19,8 +19,9 @@ pub struct Service { id_allocator: AtomicUsize, members_resp: Mutex>, is_bootstrapped: AtomicBool, - stores: Mutex>, + stores: Mutex>, regions: Mutex>, + buckets: Mutex>, leaders: Mutex>, feature_gate: Mutex, } @@ -35,6 +36,7 @@ impl Service { regions: Mutex::new(HashMap::default()), leaders: Mutex::new(HashMap::default()), feature_gate: Mutex::new(String::default()), + buckets: Mutex::new(HashMap::default()), } } @@ -47,7 +49,10 @@ impl Service { /// Add an arbitrary store. pub fn add_store(&self, store: Store) { let store_id = store.get_id(); - self.stores.lock().unwrap().insert(store_id, store); + self.stores + .lock() + .unwrap() + .insert(store_id, (store, StoreStats::new())); } pub fn set_cluster_version(&self, version: String) { @@ -96,7 +101,7 @@ impl PdMocker for Service { if self.is_bootstrapped.load(Ordering::SeqCst) { let mut err = Error::default(); - err.set_type(ErrorType::Unknown); + err.set_type(ErrorType::AlreadyBootstrapped); err.set_message("cluster is already bootstrapped".to_owned()); header.set_error(err); resp.set_header(header); @@ -107,7 +112,7 @@ impl PdMocker for Service { self.stores .lock() .unwrap() - .insert(store.get_id(), store.clone()); + .insert(store.get_id(), (store.clone(), StoreStats::new())); self.regions .lock() .unwrap() @@ -138,9 +143,10 @@ impl PdMocker for Service { let mut resp = GetStoreResponse::default(); let stores = self.stores.lock().unwrap(); match stores.get(&req.get_store_id()) { - Some(store) => { + Some((store, 
stats)) => { resp.set_header(Service::header()); resp.set_store(store.clone()); + resp.set_stats(stats.clone()); Some(Ok(resp)) } None => { @@ -160,7 +166,7 @@ impl PdMocker for Service { resp.set_header(Service::header()); let exclude_tombstone = req.get_exclude_tombstone_stores(); let stores = self.stores.lock().unwrap(); - for store in stores.values() { + for (store, _) in stores.values() { if exclude_tombstone && store.get_state() == StoreState::Tombstone { continue; } @@ -206,6 +212,9 @@ impl PdMocker for Service { Some(region) => { resp.set_header(Service::header()); resp.set_region(region.clone()); + if let Some(bucket) = self.buckets.lock().unwrap().get(&req.get_region_id()) { + resp.set_buckets(bucket.clone()); + } if let Some(leader) = leaders.get(®ion.get_id()) { resp.set_leader(leader.clone()); } @@ -223,6 +232,16 @@ impl PdMocker for Service { } } + fn report_buckets(&self, req: &ReportBucketsRequest) -> Option> { + let buckets = req.get_buckets(); + let region_id = req.get_buckets().get_region_id(); + self.buckets + .lock() + .unwrap() + .insert(region_id, buckets.clone()); + None + } + fn region_heartbeat( &self, req: &RegionHeartbeatRequest, @@ -238,16 +257,28 @@ impl PdMocker for Service { .insert(region_id, req.get_leader().clone()); let mut resp = RegionHeartbeatResponse::default(); + resp.set_region_id(req.get_region().get_id()); let header = Service::header(); resp.set_header(header); Some(Ok(resp)) } - fn store_heartbeat(&self, _: &StoreHeartbeatRequest) -> Option> { + fn store_heartbeat( + &self, + req: &StoreHeartbeatRequest, + ) -> Option> { let mut resp = StoreHeartbeatResponse::default(); let header = Service::header(); resp.set_header(header); resp.set_cluster_version(self.feature_gate.lock().unwrap().to_owned()); + if let Some((_, stats)) = self + .stores + .lock() + .unwrap() + .get_mut(&req.get_stats().get_store_id()) + { + *stats = req.get_stats().clone(); + } Some(Ok(resp)) } diff --git a/components/test_pd/src/server.rs 
b/components/test_pd/src/server.rs index 79b095ef0d9..1662e27f00f 100644 --- a/components/test_pd/src/server.rs +++ b/components/test_pd/src/server.rs @@ -1,6 +1,7 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + str::from_utf8, sync::{ atomic::{AtomicI64, Ordering}, Arc, @@ -12,14 +13,18 @@ use std::{ use fail::fail_point; use futures::{future, SinkExt, TryFutureExt, TryStreamExt}; use grpcio::{ - DuplexSink, EnvBuilder, RequestStream, RpcContext, RpcStatus, RpcStatusCode, - Server as GrpcServer, ServerBuilder, ServerStreamingSink, UnarySink, WriteFlags, + ClientStreamingSink, DuplexSink, EnvBuilder, RequestStream, RpcContext, RpcStatus, + RpcStatusCode, Server as GrpcServer, ServerBuilder, ServerStreamingSink, UnarySink, WriteFlags, +}; +use kvproto::{ + meta_storagepb_grpc::{create_meta_storage, MetaStorage}, + pdpb::*, }; -use kvproto::pdpb::*; use pd_client::Error as PdError; use security::*; use super::mocker::*; +use crate::mocker::etcd::{EtcdClient, Keys, KvEventType, MetaKey}; pub struct Server { server: Option, @@ -57,6 +62,7 @@ impl Server { default_handler, case, tso_logical: Arc::new(AtomicI64::default()), + etcd_client: EtcdClient::default(), }; let mut server = Server { server: None, @@ -67,14 +73,17 @@ impl Server { } pub fn start(&mut self, mgr: &SecurityManager, eps: Vec<(String, u16)>) { - let service = create_pd(self.mocker.clone()); + let pd = create_pd(self.mocker.clone()); + let meta_store = create_meta_storage(self.mocker.clone()); let env = Arc::new( EnvBuilder::new() .cq_count(1) .name_prefix(thd_name!("mock-server")) .build(), ); - let mut sb = ServerBuilder::new(env).register_service(service); + let mut sb = ServerBuilder::new(env) + .register_service(pd) + .register_service(meta_store); for (host, port) in eps { sb = mgr.bind(sb, &host, port); } @@ -170,6 +179,7 @@ struct PdMock { default_handler: Arc, case: Option>, tso_logical: Arc, + etcd_client: EtcdClient, } impl Clone for PdMock { @@ -178,10 
+188,45 @@ impl Clone for PdMock { default_handler: Arc::clone(&self.default_handler), case: self.case.clone(), tso_logical: self.tso_logical.clone(), + etcd_client: self.etcd_client.clone(), } } } +impl MetaStorage for PdMock { + fn watch( + &mut self, + ctx: grpcio::RpcContext<'_>, + req: kvproto::meta_storagepb::WatchRequest, + sink: grpcio::ServerStreamingSink, + ) { + match &self.case { + Some(x) => { + x.meta_store_watch(req, sink, &ctx); + } + None => grpcio::unimplemented_call!(ctx, sink), + } + } + + fn get( + &mut self, + ctx: grpcio::RpcContext<'_>, + req: kvproto::meta_storagepb::GetRequest, + sink: grpcio::UnarySink, + ) { + hijack_unary(self, ctx, sink, |m| m.meta_store_get(req.clone())) + } + + fn put( + &mut self, + ctx: grpcio::RpcContext<'_>, + req: kvproto::meta_storagepb::PutRequest, + sink: grpcio::UnarySink, + ) { + hijack_unary(self, ctx, sink, |m| m.meta_store_put(req.clone())) + } +} + impl Pd for PdMock { fn load_global_config( &mut self, @@ -189,39 +234,71 @@ impl Pd for PdMock { req: LoadGlobalConfigRequest, sink: UnarySink, ) { - hijack_unary(self, ctx, sink, |c| c.load_global_config(&req)) + let cli = self.etcd_client.clone(); + hijack_unary(self, ctx, sink, |c| c.load_global_config(&req, cli.clone())) } fn store_global_config( &mut self, - _ctx: RpcContext<'_>, - _req: StoreGlobalConfigRequest, - _sink: UnarySink, + ctx: RpcContext<'_>, + req: StoreGlobalConfigRequest, + sink: UnarySink, ) { - unimplemented!() + let cli = self.etcd_client.clone(); + hijack_unary(self, ctx, sink, |c| { + c.store_global_config(&req, cli.clone()) + }) } fn watch_global_config( &mut self, ctx: RpcContext<'_>, - _req: WatchGlobalConfigRequest, + req: WatchGlobalConfigRequest, mut sink: ServerStreamingSink, ) { - ctx.spawn(async move { - let mut name: usize = 0; - loop { + let cli = self.etcd_client.clone(); + let future = async move { + let mut watcher = match cli + .lock() + .await + .watch( + Keys::Range(MetaKey(b"".to_vec()), MetaKey(b"\xff".to_vec())), 
+ req.revision, + ) + .await + { + Ok(w) => w, + Err(err) => { + error!("failed to watch: {:?}", err); + return; + } + }; + + while let Some(event) = watcher.as_mut().recv().await { + info!("watch event from etcd"; "event" => ?event); let mut change = GlobalConfigItem::new(); - change.set_name(format!("/global/config/{:?}", name).to_owned()); - change.set_value(format!("{:?}", name)); + change.set_kind(match event.kind { + KvEventType::Put => EventType::Put, + KvEventType::Delete => EventType::Delete, + }); + change.set_name(from_utf8(event.pair.key()).unwrap().to_string()); + change.set_payload(event.pair.value().into()); let mut wc = WatchGlobalConfigResponse::default(); wc.set_changes(vec![change].into()); - // simulate network delay - std::thread::sleep(Duration::from_millis(10)); - name += 1; let _ = sink.send((wc, WriteFlags::default())).await; let _ = sink.flush().await; + #[cfg(feature = "failpoints")] + { + use futures::executor::block_on; + let cli_clone = cli.clone(); + fail_point!("watch_global_config_return", |_| { + block_on(async move { cli_clone.lock().await.clear_subs() }); + watcher.close(); + }); + } } - }) + }; + ctx.spawn(future); } fn get_members( @@ -242,18 +319,19 @@ impl Pd for PdMock { let header = Service::header(); let tso_logical = self.tso_logical.clone(); let fut = async move { - resp.send_all(&mut req.map_ok(move |r| { - let logical = - tso_logical.fetch_add(r.count as i64, Ordering::SeqCst) + r.count as i64; - let mut res = TsoResponse::default(); - res.set_header(header.clone()); - res.mut_timestamp().physical = 42; - res.mut_timestamp().logical = logical; - res.count = r.count; - (res, WriteFlags::default()) - })) - .await - .unwrap(); + // Tolerate errors like RpcFinished(None). 
+ let _ = resp + .send_all(&mut req.map_ok(move |r| { + let logical = + tso_logical.fetch_add(r.count as i64, Ordering::SeqCst) + r.count as i64; + let mut res = TsoResponse::default(); + res.set_header(header.clone()); + res.mut_timestamp().physical = 42; + res.mut_timestamp().logical = logical; + res.count = r.count; + (res, WriteFlags::default()) + })) + .await; let _ = resp.close().await; }; ctx.spawn(fut); @@ -322,6 +400,29 @@ impl Pd for PdMock { hijack_unary(self, ctx, sink, |c| c.store_heartbeat(&req)) } + fn report_buckets( + &mut self, + ctx: grpcio::RpcContext<'_>, + stream: RequestStream, + sink: ClientStreamingSink, + ) { + let mock = self.clone(); + ctx.spawn(async move { + let mut stream = stream.map_err(PdError::from); + while let Ok(Some(req)) = stream.try_next().await { + let resp = mock + .case + .as_ref() + .and_then(|case| case.report_buckets(&req)) + .or_else(|| mock.default_handler.report_buckets(&req)); + if let Some(Ok(resp)) = resp { + sink.success(resp); + break; + } + } + }); + } + fn region_heartbeat( &mut self, ctx: RpcContext<'_>, diff --git a/components/test_pd/src/util.rs b/components/test_pd/src/util.rs index 1b05196c346..b1a22b93c47 100644 --- a/components/test_pd/src/util.rs +++ b/components/test_pd/src/util.rs @@ -2,7 +2,7 @@ use std::sync::Arc; -use pd_client::{Config, RpcClient}; +use pd_client::{Config, RpcClient, RpcClientV2}; use security::{SecurityConfig, SecurityManager}; use tikv_util::config::ReadableDuration; @@ -23,6 +23,13 @@ pub fn new_client(eps: Vec<(String, u16)>, mgr: Option>) -> RpcClient::new(&cfg, None, mgr).unwrap() } +pub fn new_client_v2(eps: Vec<(String, u16)>, mgr: Option>) -> RpcClientV2 { + let cfg = new_config(eps); + let mgr = + mgr.unwrap_or_else(|| Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap())); + RpcClientV2::new(&cfg, None, mgr).unwrap() +} + pub fn new_client_with_update_interval( eps: Vec<(String, u16)>, mgr: Option>, @@ -34,3 +41,15 @@ pub fn 
new_client_with_update_interval( mgr.unwrap_or_else(|| Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap())); RpcClient::new(&cfg, None, mgr).unwrap() } + +pub fn new_client_v2_with_update_interval( + eps: Vec<(String, u16)>, + mgr: Option>, + interval: ReadableDuration, +) -> RpcClientV2 { + let mut cfg = new_config(eps); + cfg.update_interval = interval; + let mgr = + mgr.unwrap_or_else(|| Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap())); + RpcClientV2::new(&cfg, None, mgr).unwrap() +} diff --git a/components/test_pd_client/Cargo.toml b/components/test_pd_client/Cargo.toml new file mode 100644 index 00000000000..3b002970236 --- /dev/null +++ b/components/test_pd_client/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "test_pd_client" +version = "0.0.1" +edition = "2018" +publish = false + +[dependencies] +collections = { workspace = true } +fail = "0.5" +futures = "0.3" +grpcio = { workspace = true } +keys = { workspace = true } +kvproto = { workspace = true } +log_wrappers = { workspace = true } +pd_client = { workspace = true } +raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } +slog = { workspace = true } +slog-global = { workspace = true } +tikv_util = { workspace = true } +tokio = { version = "1.5", features = ["rt-multi-thread"] } +tokio-timer = { workspace = true } +txn_types = { workspace = true } diff --git a/components/test_pd_client/src/lib.rs b/components/test_pd_client/src/lib.rs new file mode 100644 index 00000000000..9ea837e335e --- /dev/null +++ b/components/test_pd_client/src/lib.rs @@ -0,0 +1,8 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +#[macro_use] +extern crate tikv_util; + +mod pd; + +pub use crate::pd::*; diff --git a/components/test_raftstore/src/pd.rs b/components/test_pd_client/src/pd.rs similarity index 88% rename from components/test_raftstore/src/pd.rs rename to components/test_pd_client/src/pd.rs index 66823a29708..1c2cc573eb9 100644 --- a/components/test_raftstore/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -26,7 +26,10 @@ use futures::{ use keys::{self, data_key, enc_end_key, enc_start_key}; use kvproto::{ metapb::{self, PeerRole}, - pdpb, + pdpb::{ + self, BatchSwitchWitness, ChangePeer, ChangePeerV2, CheckPolicy, Merge, + RegionHeartbeatResponse, SplitRegion, SwitchWitness, TransferLeader, + }, replication_modepb::{ DrAutoSyncState, RegionReplicationStatus, ReplicationMode, ReplicationStatus, StoreDrAutoSyncStatus, @@ -36,20 +39,20 @@ use pd_client::{ BucketStat, Error, FeatureGate, Key, PdClient, PdFuture, RegionInfo, RegionStat, Result, }; use raft::eraftpb::ConfChangeType; -use raftstore::store::{ - util::{check_key_in_region, find_peer, is_learner}, - QueryStats, INIT_EPOCH_CONF_VER, INIT_EPOCH_VER, -}; use tikv_util::{ + store::{check_key_in_region, find_peer, find_peer_by_id, is_learner, new_peer, QueryStats}, time::{Instant, UnixSecs}, timer::GLOBAL_TIMER_HANDLE, Either, HandyRwLock, }; use tokio_timer::timer::Handle; -use txn_types::TimeStamp; +use txn_types::{TimeStamp, TSO_PHYSICAL_SHIFT_BITS}; use super::*; +pub const INIT_EPOCH_CONF_VER: u64 = 1; +pub const INIT_EPOCH_VER: u64 = 1; + struct Store { store: metapb::Store, region_ids: HashSet, @@ -132,6 +135,15 @@ enum Operator { remove_peers: Vec, policy: SchedulePolicy, }, + BatchSwitchWitness { + peer_ids: Vec, + is_witnesses: Vec, + policy: SchedulePolicy, + }, +} + +pub fn sleep_ms(ms: u64) { + std::thread::sleep(Duration::from_millis(ms)); } fn change_peer(change_type: ConfChangeType, peer: metapb::Peer) -> pdpb::ChangePeer { @@ -141,6 +153,75 @@ fn change_peer(change_type: ConfChangeType, peer: 
metapb::Peer) -> pdpb::ChangeP cp } +pub fn new_pd_change_peer( + change_type: ConfChangeType, + peer: metapb::Peer, +) -> RegionHeartbeatResponse { + let mut change_peer = ChangePeer::default(); + change_peer.set_change_type(change_type); + change_peer.set_peer(peer); + + let mut resp = RegionHeartbeatResponse::default(); + resp.set_change_peer(change_peer); + resp +} + +pub fn new_pd_change_peer_v2(changes: Vec) -> RegionHeartbeatResponse { + let mut change_peer = ChangePeerV2::default(); + change_peer.set_changes(changes.into()); + + let mut resp = RegionHeartbeatResponse::default(); + resp.set_change_peer_v2(change_peer); + resp +} + +pub fn new_split_region(policy: CheckPolicy, keys: Vec>) -> RegionHeartbeatResponse { + let mut split_region = SplitRegion::default(); + split_region.set_policy(policy); + split_region.set_keys(keys.into()); + let mut resp = RegionHeartbeatResponse::default(); + resp.set_split_region(split_region); + resp +} + +pub fn new_pd_transfer_leader( + peer: metapb::Peer, + peers: Vec, +) -> RegionHeartbeatResponse { + let mut transfer_leader = TransferLeader::default(); + transfer_leader.set_peer(peer); + transfer_leader.set_peers(peers.into()); + + let mut resp = RegionHeartbeatResponse::default(); + resp.set_transfer_leader(transfer_leader); + resp +} + +pub fn new_pd_merge_region(target_region: metapb::Region) -> RegionHeartbeatResponse { + let mut merge = Merge::default(); + merge.set_target(target_region); + + let mut resp = RegionHeartbeatResponse::default(); + resp.set_merge(merge); + resp +} + +fn switch_witness(peer_id: u64, is_witness: bool) -> SwitchWitness { + let mut sw = SwitchWitness::default(); + sw.set_peer_id(peer_id); + sw.set_is_witness(is_witness); + sw +} + +pub fn new_pd_batch_switch_witnesses(switches: Vec) -> RegionHeartbeatResponse { + let mut switch_witnesses = BatchSwitchWitness::default(); + switch_witnesses.set_switch_witnesses(switches.into()); + + let mut resp = RegionHeartbeatResponse::default(); + 
resp.set_switch_witnesses(switch_witnesses); + resp +} + impl Operator { fn make_region_heartbeat_response( &self, @@ -155,13 +236,13 @@ impl Operator { } else { ConfChangeType::AddNode }; - new_pd_change_peer(conf_change_type, peer.clone()) + new_pd_change_peer_v2(vec![change_peer(conf_change_type, peer.clone())]) } else { pdpb::RegionHeartbeatResponse::default() } } Operator::RemovePeer { ref peer, .. } => { - new_pd_change_peer(ConfChangeType::RemoveNode, peer.clone()) + new_pd_change_peer_v2(vec![change_peer(ConfChangeType::RemoveNode, peer.clone())]) } Operator::TransferLeader { ref peer, @@ -216,6 +297,17 @@ impl Operator { } new_pd_change_peer_v2(cps) } + Operator::BatchSwitchWitness { + ref peer_ids, + ref is_witnesses, + .. + } => { + let mut switches = Vec::with_capacity(peer_ids.len()); + for (peer_id, is_witness) in peer_ids.iter().zip(is_witnesses.iter()) { + switches.push(switch_witness(*peer_id, *is_witness)); + } + new_pd_batch_switch_witnesses(switches) + } } } @@ -300,6 +392,26 @@ impl Operator { add && remove || !policy.schedule() } + Operator::BatchSwitchWitness { + ref peer_ids, + ref is_witnesses, + ref mut policy, + } => { + if !policy.schedule() { + return true; + } + for (peer_id, is_witness) in peer_ids.iter().zip(is_witnesses.iter()) { + if region + .get_peers() + .iter() + .any(|p| (p.get_id() == *peer_id) && (p.get_is_witness() != *is_witness)) + || cluster.pending_peers.contains_key(peer_id) + { + return false; + } + } + true + } } } } @@ -327,6 +439,7 @@ struct PdCluster { // region id -> leader leaders: HashMap, down_peers: HashMap, + // peer id -> peer pending_peers: HashMap, is_bootstraped: bool, @@ -410,9 +523,9 @@ impl PdCluster { fn put_store(&mut self, store: metapb::Store) -> Result<()> { let store_id = store.get_id(); - // There is a race between put_store and handle_region_heartbeat_response. If store id is - // 0, it means it's a placeholder created by latter, we just need to update the meta. 
- // Otherwise we should overwrite it. + // There is a race between put_store and handle_region_heartbeat_response. If + // store id is 0, it means it's a placeholder created by latter, we just need to + // update the meta. Otherwise we should overwrite it. if self .stores .get(&store_id) @@ -538,8 +651,8 @@ impl PdCluster { && incoming_epoch.get_conf_ver() == 0; let overlaps = self.get_overlap(start_key, end_key); if created_by_unsafe_recovery { - // Allow recreated region by unsafe recover to overwrite other regions with a "older" - // epoch. + // Allow recreated region by unsafe recover to overwrite other regions with a + // "older" epoch. return Ok(overlaps); } for r in overlaps.iter() { @@ -811,7 +924,7 @@ pub struct TestPdClient { pub gc_safepoints: RwLock>, } -#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Debug, PartialEq, Clone)] pub struct GcSafePoint { pub serivce: String, pub ttl: Duration, @@ -983,6 +1096,48 @@ impl TestPdClient { panic!("region {:?} failed to leave joint", region); } + pub fn must_finish_switch_witnesses( + &self, + region_id: u64, + peer_ids: Vec, + is_witnesses: Vec, + ) { + for _ in 1..500 { + sleep_ms(10); + let region = match block_on(self.get_region_by_id(region_id)).unwrap() { + Some(region) => region, + None => continue, + }; + + for p in region.get_peers().iter() { + error!("in must_finish_switch_witnesses, p: {:?}", p); + } + + let mut need_retry = false; + for (peer_id, is_witness) in peer_ids.iter().zip(is_witnesses.iter()) { + match find_peer_by_id(®ion, *peer_id) { + Some(p) => { + if p.get_is_witness() != *is_witness + || self.cluster.rl().pending_peers.contains_key(&p.get_id()) + { + need_retry = true; + break; + } + } + None => { + need_retry = true; + break; + } + } + } + if !need_retry { + return; + } + } + let region = block_on(self.get_region_by_id(region_id)).unwrap(); + panic!("region {:?} failed to finish switch witnesses", region); + } + pub fn add_region(&self, region: &metapb::Region) { 
self.cluster.wl().add_region(region) } @@ -1012,6 +1167,15 @@ impl TestPdClient { self.schedule_operator(region_id, op); } + pub fn switch_witnesses(&self, region_id: u64, peer_ids: Vec, is_witnesses: Vec) { + let op = Operator::BatchSwitchWitness { + peer_ids, + is_witnesses, + policy: SchedulePolicy::TillSuccess, + }; + self.schedule_operator(region_id, op); + } + pub fn joint_confchange( &self, region_id: u64, @@ -1129,6 +1293,16 @@ impl TestPdClient { self.must_none_peer(region_id, peer); } + pub fn must_switch_witnesses( + &self, + region_id: u64, + peer_ids: Vec, + is_witnesses: Vec, + ) { + self.switch_witnesses(region_id, peer_ids.clone(), is_witnesses.clone()); + self.must_finish_switch_witnesses(region_id, peer_ids, is_witnesses); + } + pub fn must_joint_confchange( &self, region_id: u64, @@ -1318,7 +1492,8 @@ impl TestPdClient { self.cluster.wl().check_merge_target_integrity = false; } - /// The next generated TSO will be `ts + 1`. See `get_tso()` and `batch_get_tso()`. + /// The next generated TSO will be `ts + 1`. See `get_tso()` and + /// `batch_get_tso()`. pub fn set_tso(&self, ts: TimeStamp) { let old = self.tso.swap(ts.into_inner(), Ordering::SeqCst); if old > ts.into_inner() { @@ -1407,7 +1582,7 @@ impl PdClient for TestPdClient { for _ in 1..500 { sleep_ms(10); if let Some(region) = self.cluster.rl().get_region(data_key(key)) { - if check_key_in_region(key, ®ion).is_ok() { + if check_key_in_region(key, ®ion) { return Ok(region); } } @@ -1697,8 +1872,38 @@ impl PdClient for TestPdClient { )), ))); } - let tso = self.tso.fetch_add(count as u64, Ordering::SeqCst); - Box::pin(ok(TimeStamp::new(tso + count as u64))) + + assert!(count > 0); + assert!(count < (1 << TSO_PHYSICAL_SHIFT_BITS)); + + let mut old_tso = self.tso.load(Ordering::SeqCst); + loop { + let ts: TimeStamp = old_tso.into(); + + // Add to logical part first. 
+ let (mut physical, mut logical) = (ts.physical(), ts.logical() + count as u64); + + // When logical part is overflow, add to physical part. + // Moreover, logical part must not less than `count-1`, as the + // generated batch of TSO is treated as of the same physical time. + // Refer to real PD's implementation: + // https://github.com/tikv/pd/blob/v6.2.0/server/tso/tso.go#L361 + if logical >= (1 << TSO_PHYSICAL_SHIFT_BITS) { + physical += 1; + logical = (count - 1) as u64; + } + + let new_tso = TimeStamp::compose(physical, logical); + match self.tso.compare_exchange_weak( + old_tso, + new_tso.into_inner(), + Ordering::SeqCst, + Ordering::SeqCst, + ) { + Ok(_) => return Box::pin(ok(new_tso)), + Err(x) => old_tso = x, + } + } } fn update_service_safe_point( @@ -1742,13 +1947,7 @@ impl PdClient for TestPdClient { if current.meta < buckets.meta { std::mem::swap(current, &mut buckets); } - - pd_client::merge_bucket_stats( - ¤t.meta.keys, - &mut current.stats, - &buckets.meta.keys, - &buckets.stats, - ); + current.merge(&buckets); }) .or_insert(buckets); ready(Ok(())).boxed() diff --git a/components/test_raftstore-v2/Cargo.toml b/components/test_raftstore-v2/Cargo.toml new file mode 100644 index 00000000000..5c6297c124d --- /dev/null +++ b/components/test_raftstore-v2/Cargo.toml @@ -0,0 +1,68 @@ +[package] +name = "test_raftstore-v2" +version = "0.0.1" +edition = "2018" +publish = false + +[features] +default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] +cloud-aws = ["encryption_export/cloud-aws"] +cloud-gcp = ["encryption_export/cloud-gcp"] +cloud-azure = ["encryption_export/cloud-azure"] +test-engine-kv-rocksdb = [ + "raftstore/test-engine-kv-rocksdb" +] +test-engine-raft-raft-engine = [ + "raftstore/test-engine-raft-raft-engine" +] +test-engines-rocksdb = [ + "raftstore/test-engines-rocksdb", +] +test-engines-panic = [ + "raftstore/test-engines-panic", +] + +[dependencies] +api_version = { workspace = true } 
+backtrace = "0.3" +causal_ts = { workspace = true, features = ["testexport"] } +collections = { workspace = true } +concurrency_manager = { workspace = true } +crossbeam = "0.8" +encryption_export = { workspace = true } +engine_rocks = { workspace = true } +engine_rocks_helper = { workspace = true } +engine_test = { workspace = true } +engine_traits = { workspace = true } +fail = "0.5" +file_system = { workspace = true } +futures = "0.3" +grpcio = { workspace = true } +grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } +keys = { workspace = true } +kvproto = { workspace = true } +lazy_static = "1.3" +log_wrappers = { workspace = true } +pd_client = { workspace = true } +protobuf = { version = "2.8", features = ["bytes"] } +raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } +raftstore = { workspace = true, features = ["testexport"] } +raftstore-v2 = { workspace = true, features = ["testexport"] } +rand = "0.8" +resolved_ts = { workspace = true } +resource_control = { workspace = true } +resource_metering = { workspace = true } +security = { workspace = true } +server = { workspace = true } +slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +# better to not use slog-global, but pass in the logger +slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +tempfile = "3.0" +test_pd_client = { workspace = true } +test_raftstore = { workspace = true } +test_util = { workspace = true } +tikv = { workspace = true } +tikv_util = { workspace = true } +tokio = { version = "1.5", features = ["rt-multi-thread"] } +tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +txn_types = { workspace = true } diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs new file mode 100644 index 00000000000..eafa7a45403 
--- /dev/null +++ b/components/test_raftstore-v2/src/cluster.rs @@ -0,0 +1,1819 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + collections::hash_map::Entry as MapEntry, + result, + sync::{Arc, Mutex, RwLock}, + thread, + time::Duration, +}; + +use collections::{HashMap, HashSet}; +use encryption_export::DataKeyManager; +use engine_rocks::{RocksSnapshot, RocksStatistics}; +use engine_test::raft::RaftTestEngine; +use engine_traits::{ + KvEngine, Peekable, RaftEngine, RaftEngineReadOnly, RaftLogBatch, ReadOptions, SyncMutable, + TabletRegistry, CF_DEFAULT, +}; +use file_system::IoRateLimiter; +use futures::{compat::Future01CompatExt, executor::block_on, select, Future, FutureExt}; +use keys::{data_key, validate_data_key, DATA_PREFIX_KEY}; +use kvproto::{ + errorpb::Error as PbError, + kvrpcpb::ApiVersion, + metapb::{self, Buckets, PeerRole, RegionEpoch}, + raft_cmdpb::{ + AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, RegionDetailResponse, Request, + Response, StatusCmdType, + }, + raft_serverpb::{ + PeerState, RaftApplyState, RaftLocalState, RaftMessage, RaftTruncatedState, + RegionLocalState, StoreIdent, + }, +}; +use pd_client::PdClient; +use raftstore::{ + store::{ + cmd_resp, initial_region, util::check_key_in_region, Bucket, BucketRange, Callback, + RegionSnapshot, TabletSnapManager, WriteResponse, INIT_EPOCH_CONF_VER, INIT_EPOCH_VER, + }, + Error, Result, +}; +use raftstore_v2::{ + router::{PeerMsg, QueryResult}, + write_initial_states, SimpleWriteEncoder, StoreMeta, StoreRouter, +}; +use resource_control::ResourceGroupManager; +use tempfile::TempDir; +use test_pd_client::TestPdClient; +use test_raftstore::{ + is_error_response, new_admin_request, new_delete_cmd, new_delete_range_cmd, new_get_cf_cmd, + new_peer, new_prepare_merge, new_put_cf_cmd, new_region_detail_cmd, new_region_leader_cmd, + new_request, new_snap_cmd, new_status_request, new_store, new_tikv_config_with_api_ver, + new_transfer_leader_cmd, 
sleep_ms, Config, Filter, FilterFactory, PartitionFilterFactory, + RawEngine, +}; +use tikv::server::Result as ServerResult; +use tikv_util::{ + box_err, box_try, debug, error, safe_panic, + thread_group::GroupProperties, + time::{Instant, ThreadReadId}, + timer::GLOBAL_TIMER_HANDLE, + warn, + worker::LazyWorker, + HandyRwLock, +}; + +// We simulate 3 or 5 nodes, each has a store. +// Sometimes, we use fixed id to test, which means the id +// isn't allocated by pd, and node id, store id are same. +// E,g, for node 1, the node id and store id are both 1. +pub trait Simulator { + // Pass 0 to let pd allocate a node id if db is empty. + // If node id > 0, the node must be created in db already, + // and the node id must be the same as given argument. + // Return the node id. + // TODO: we will rename node name here because now we use store only. + fn run_node( + &mut self, + node_id: u64, + cfg: Config, + store_meta: Arc>>, + key_mgr: Option>, + raft_engine: RaftTestEngine, + tablet_registry: TabletRegistry, + resource_manager: &Option>, + ) -> ServerResult; + + fn stop_node(&mut self, node_id: u64); + fn get_node_ids(&self) -> HashSet; + + fn add_send_filter(&mut self, node_id: u64, filter: Box); + fn clear_send_filters(&mut self, node_id: u64); + + fn add_recv_filter(&mut self, node_id: u64, filter: Box); + fn clear_recv_filters(&mut self, node_id: u64); + + fn get_router(&self, node_id: u64) -> Option>; + fn get_snap_dir(&self, node_id: u64) -> String; + fn get_snap_mgr(&self, node_id: u64) -> &TabletSnapManager; + fn send_raft_msg(&mut self, msg: RaftMessage) -> Result<()>; + + fn read(&mut self, request: RaftCmdRequest, timeout: Duration) -> Result { + let timeout_f = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + timeout) + .compat(); + futures::executor::block_on(async move { + futures::select! 
{ + res = self.async_read(request).fuse() => res, + e = timeout_f.fuse() => { + Err(Error::Timeout(format!("request timeout for {:?}: {:?}", timeout,e))) + }, + } + }) + } + + fn async_read( + &mut self, + request: RaftCmdRequest, + ) -> impl Future> + Send { + let mut req_clone = request.clone(); + req_clone.clear_requests(); + req_clone.mut_requests().push(new_snap_cmd()); + let snap = self.async_snapshot(req_clone); + async move { + match snap.await { + Ok(snap) => { + let requests = request.get_requests(); + let mut response = RaftCmdResponse::default(); + let mut responses = Vec::with_capacity(requests.len()); + for req in requests { + let cmd_type = req.get_cmd_type(); + match cmd_type { + CmdType::Get => { + let mut resp = Response::default(); + let key = req.get_get().get_key(); + let cf = req.get_get().get_cf(); + let region = snap.get_region(); + + if let Err(e) = check_key_in_region(key, region) { + return Ok(cmd_resp::new_error(e)); + } + + let res = if cf.is_empty() { + snap.get_value(key).unwrap_or_else(|e| { + panic!( + "[region {}] failed to get {} with cf {}: {:?}", + snap.get_region().get_id(), + log_wrappers::Value::key(key), + cf, + e + ) + }) + } else { + snap.get_value_cf(cf, key).unwrap_or_else(|e| { + panic!( + "[region {}] failed to get {}: {:?}", + snap.get_region().get_id(), + log_wrappers::Value::key(key), + e + ) + }) + }; + if let Some(res) = res { + resp.mut_get().set_value(res.to_vec()); + } + resp.set_cmd_type(cmd_type); + responses.push(resp); + } + _ => unimplemented!(), + } + } + response.set_responses(responses.into()); + + Ok(response) + } + Err(e) => Ok(e), + } + } + } + + fn async_snapshot( + &mut self, + request: RaftCmdRequest, + ) -> impl Future, RaftCmdResponse>> + Send; + + fn async_peer_msg_on_node(&self, node_id: u64, region_id: u64, msg: PeerMsg) -> Result<()>; + + fn call_query(&self, request: RaftCmdRequest, timeout: Duration) -> Result { + let node_id = request.get_header().get_peer().get_store_id(); + 
self.call_query_on_node(node_id, request, timeout) + } + + fn call_query_on_node( + &self, + node_id: u64, + request: RaftCmdRequest, + timeout: Duration, + ) -> Result { + let region_id = request.get_header().get_region_id(); + let (msg, sub) = PeerMsg::raft_query(request); + match self.async_peer_msg_on_node(node_id, region_id, msg) { + Ok(()) => {} + Err(e) => { + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(e.into()); + return Ok(resp); + } + } + + let timeout_f = GLOBAL_TIMER_HANDLE.delay(std::time::Instant::now() + timeout); + // todo: unwrap? + match block_on(async move { + select! { + res = sub.result().fuse() => Ok(res.unwrap()), + _ = timeout_f.compat().fuse() => Err(Error::Timeout(format!("request timeout for {:?}", timeout))), + + } + }).unwrap() { + QueryResult::Read(_) => unreachable!(), + QueryResult::Response(resp) => Ok(resp), + } + } + + fn call_command(&self, request: RaftCmdRequest, timeout: Duration) -> Result { + let node_id = request.get_header().get_peer().get_store_id(); + self.call_command_on_node(node_id, request, timeout) + } + + fn call_command_on_node( + &self, + node_id: u64, + mut request: RaftCmdRequest, + timeout: Duration, + ) -> Result { + let region_id = request.get_header().get_region_id(); + + let (msg, sub) = if request.has_admin_request() { + PeerMsg::admin_command(request) + } else { + let requests = request.get_requests(); + let mut write_encoder = SimpleWriteEncoder::with_capacity(64); + for req in requests { + match req.get_cmd_type() { + CmdType::Put => { + let put = req.get_put(); + write_encoder.put(put.get_cf(), put.get_key(), put.get_value()); + } + CmdType::Delete => { + let delete = req.get_delete(); + write_encoder.delete(delete.get_cf(), delete.get_key()); + } + CmdType::DeleteRange => { + unimplemented!() + } + _ => unreachable!(), + } + } + PeerMsg::simple_write(Box::new(request.take_header()), write_encoder.encode()) + }; + + match self.async_peer_msg_on_node(node_id, region_id, 
msg) { + Ok(()) => {} + Err(e) => { + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(e.into()); + return Ok(resp); + } + } + + let timeout_f = GLOBAL_TIMER_HANDLE.delay(std::time::Instant::now() + timeout); + block_on(async move { + select! { + // todo: unwrap? + res = sub.result().fuse() => Ok(res.unwrap()), + _ = timeout_f.compat().fuse() => Err(Error::Timeout(format!("request timeout for {:?}", timeout))), + } + }) + } + + fn async_command_on_node(&self, node_id: u64, mut request: RaftCmdRequest) { + let region_id = request.get_header().get_region_id(); + + let (msg, _sub) = if request.has_admin_request() { + PeerMsg::admin_command(request) + } else { + let requests = request.get_requests(); + let mut write_encoder = SimpleWriteEncoder::with_capacity(64); + for req in requests { + match req.get_cmd_type() { + CmdType::Put => { + let put = req.get_put(); + write_encoder.put(put.get_cf(), put.get_key(), put.get_value()); + } + CmdType::Delete => { + let delete = req.get_delete(); + write_encoder.delete(delete.get_cf(), delete.get_key()); + } + CmdType::DeleteRange => { + unimplemented!() + } + _ => unreachable!(), + } + } + PeerMsg::simple_write(Box::new(request.take_header()), write_encoder.encode()) + }; + + self.async_peer_msg_on_node(node_id, region_id, msg) + .unwrap(); + } +} + +pub struct Cluster, EK: KvEngine> { + pub cfg: Config, + leaders: HashMap, + pub count: usize, + + pub paths: Vec, + pub engines: Vec<(TabletRegistry, RaftTestEngine)>, + pub tablet_registries: HashMap>, + pub raft_engines: HashMap, + pub store_metas: HashMap>>>, + key_managers: Vec>>, + pub io_rate_limiter: Option>, + key_managers_map: HashMap>>, + group_props: HashMap, + pub sst_workers: Vec>, + pub sst_workers_map: HashMap, + pub kv_statistics: Vec>, + pub raft_statistics: Vec>>, + pub sim: Arc>, + pub pd_client: Arc, + resource_manager: Option>, + pub engine_creator: Box< + dyn Fn( + Option<(u64, u64)>, + Option>, + &Config, + ) -> ( + TabletRegistry, + 
RaftTestEngine, + Option>, + TempDir, + LazyWorker, + Arc, + Option>, + ), + >, +} + +impl, EK: KvEngine> Cluster { + pub fn new( + id: u64, + count: usize, + sim: Arc>, + pd_client: Arc, + api_version: ApiVersion, + engine_creator: Box< + dyn Fn( + Option<(u64, u64)>, + Option>, + &Config, + ) -> ( + TabletRegistry, + RaftTestEngine, + Option>, + TempDir, + LazyWorker, + Arc, + Option>, + ), + >, + ) -> Cluster { + Cluster { + cfg: Config { + tikv: new_tikv_config_with_api_ver(id, api_version), + prefer_mem: true, + }, + count, + tablet_registries: HashMap::default(), + key_managers_map: HashMap::default(), + group_props: HashMap::default(), + raft_engines: HashMap::default(), + store_metas: HashMap::default(), + leaders: HashMap::default(), + kv_statistics: vec![], + raft_statistics: vec![], + sst_workers: vec![], + sst_workers_map: HashMap::default(), + paths: vec![], + engines: vec![], + key_managers: vec![], + io_rate_limiter: None, + resource_manager: Some(Arc::new(ResourceGroupManager::default())), + sim, + pd_client, + engine_creator, + } + } + + pub fn id(&self) -> u64 { + self.cfg.server.cluster_id + } + + pub fn flush_data(&self) { + for reg in self.tablet_registries.values() { + reg.for_each_opened_tablet(|_, cached| -> bool { + if let Some(tablet) = cached.latest() { + tablet.flush_cf(CF_DEFAULT, true /* sync */).unwrap(); + } + true + }); + } + } + + // Bootstrap the store with fixed ID (like 1, 2, .. 5) and + // initialize first region in all stores, then start the cluster. + pub fn run(&mut self) { + self.create_engines(); + self.bootstrap_region().unwrap(); + self.start().unwrap(); + } + + // Bootstrap the store with fixed ID (like 1, 2, .. 5) and + // initialize first region in store 1, then start the cluster. 
+ pub fn run_conf_change(&mut self) -> u64 { + self.create_engines(); + let region_id = self.bootstrap_conf_change(); + self.start().unwrap(); + region_id + } + + pub fn create_engines(&mut self) { + self.io_rate_limiter = Some(Arc::new( + self.cfg + .storage + .io_rate_limit + .build(true /* enable_statistics */), + )); + for id in 1..self.count + 1 { + self.create_engine(Some((self.id(), id as u64))); + } + } + + // id indicates cluster id store_id + fn create_engine(&mut self, id: Option<(u64, u64)>) { + let (reg, raft_engine, key_manager, dir, sst_worker, kv_statistics, raft_statistics) = + (self.engine_creator)(id, self.io_rate_limiter.clone(), &self.cfg); + self.engines.push((reg, raft_engine)); + self.key_managers.push(key_manager); + self.paths.push(dir); + self.sst_workers.push(sst_worker); + self.kv_statistics.push(kv_statistics); + self.raft_statistics.push(raft_statistics); + } + + pub fn start(&mut self) -> ServerResult<()> { + if self.cfg.raft_store.store_io_pool_size == 0 { + // v2 always use async write. + self.cfg.raft_store.store_io_pool_size = 1; + } + + let node_ids: Vec = self.tablet_registries.iter().map(|(&id, _)| id).collect(); + for node_id in node_ids { + self.run_node(node_id)?; + } + + // Try start new nodes. 
+ for id in self.raft_engines.len()..self.count { + let id = id as u64 + 1; + self.create_engine(Some((self.id(), id))); + let (tablet_registry, raft_engine) = self.engines.last().unwrap().clone(); + + let key_mgr = self.key_managers.last().unwrap().clone(); + let store_meta = Arc::new(Mutex::new(StoreMeta::new(id))); + + let props = GroupProperties::default(); + tikv_util::thread_group::set_properties(Some(props.clone())); + + // todo: GroupProperties + let mut sim = self.sim.wl(); + let node_id = sim.run_node( + id, + self.cfg.clone(), + store_meta.clone(), + key_mgr.clone(), + raft_engine.clone(), + tablet_registry.clone(), + &self.resource_manager, + )?; + assert_eq!(id, node_id); + self.group_props.insert(node_id, props); + self.raft_engines.insert(node_id, raft_engine.clone()); + self.tablet_registries + .insert(node_id, tablet_registry.clone()); + self.store_metas.insert(node_id, store_meta); + self.key_managers_map.insert(node_id, key_mgr); + } + + Ok(()) + } + + pub fn run_node(&mut self, node_id: u64) -> ServerResult<()> { + debug!("starting node {}", node_id); + let tablet_registry = self.tablet_registries[&node_id].clone(); + let raft_engine = self.raft_engines[&node_id].clone(); + let cfg = self.cfg.clone(); + + // if let Some(labels) = self.labels.get(&node_id) { + // cfg.server.labels = labels.to_owned(); + // } + let store_meta = match self.store_metas.entry(node_id) { + MapEntry::Occupied(o) => { + let mut meta = o.get().lock().unwrap(); + *meta = StoreMeta::new(node_id); + o.get().clone() + } + MapEntry::Vacant(v) => v + .insert(Arc::new(Mutex::new(StoreMeta::new(node_id)))) + .clone(), + }; + + let props = GroupProperties::default(); + self.group_props.insert(node_id, props.clone()); + tikv_util::thread_group::set_properties(Some(props)); + + debug!("calling run node"; "node_id" => node_id); + let key_mgr = self.key_managers_map.get(&node_id).unwrap().clone(); + self.sim.wl().run_node( + node_id, + cfg, + store_meta, + key_mgr, + raft_engine, + 
tablet_registry, + &self.resource_manager, + )?; + debug!("node {} started", node_id); + Ok(()) + } + + pub fn stop_node(&mut self, node_id: u64) { + debug!("stopping node {}", node_id); + self.group_props[&node_id].mark_shutdown(); + + // Simulate shutdown behavior of server shutdown. It's not enough to just set + // the map above as current thread may also query properties during shutdown. + let previous_prop = tikv_util::thread_group::current_properties(); + tikv_util::thread_group::set_properties(Some(self.group_props[&node_id].clone())); + match self.sim.write() { + Ok(mut sim) => sim.stop_node(node_id), + Err(_) => safe_panic!("failed to acquire write lock."), + } + self.pd_client.shutdown_store(node_id); + + let mut regions = vec![]; + let reg = &self.tablet_registries[&node_id]; + reg.for_each_opened_tablet(|region_id, _| { + regions.push(region_id); + true + }); + for region_id in regions { + if let Some(mut tablet) = reg.get(region_id) { + if let Some(tablet) = tablet.latest() { + let mut tried = 0; + while tried < 10 { + if tablet.inner_refcount() <= 3 { + break; + } + thread::sleep(Duration::from_millis(10)); + tried += 1; + } + } + } + reg.remove(region_id); + } + + debug!("node {} stopped", node_id); + tikv_util::thread_group::set_properties(previous_prop); + } + + /// Multiple nodes with fixed node id, like node 1, 2, .. 5, + /// First region 1 is in all stores with peer 1, 2, .. 5. + /// Peer 1 is in node 1, store 1, etc. + /// + /// Must be called after `create_engines`. 
+ pub fn bootstrap_region(&mut self) -> Result<()> { + for (i, (tablet_registry, raft_engine)) in self.engines.iter().enumerate() { + let id = i as u64 + 1; + self.tablet_registries.insert(id, tablet_registry.clone()); + self.raft_engines.insert(id, raft_engine.clone()); + let store_meta = Arc::new(Mutex::new(StoreMeta::new(id))); + self.store_metas.insert(id, store_meta); + self.key_managers_map + .insert(id, self.key_managers[i].clone()); + self.sst_workers_map.insert(id, i); + } + + let mut region = metapb::Region::default(); + region.set_id(1); + region.set_start_key(keys::EMPTY_KEY.to_vec()); + region.set_end_key(keys::EMPTY_KEY.to_vec()); + region.mut_region_epoch().set_version(INIT_EPOCH_VER); + region.mut_region_epoch().set_conf_ver(INIT_EPOCH_CONF_VER); + + for &id in self.raft_engines.keys() { + let peer = new_peer(id, id); + region.mut_peers().push(peer.clone()); + } + + for raft_engine in self.raft_engines.values() { + let mut wb = raft_engine.log_batch(10); + wb.put_prepare_bootstrap_region(®ion)?; + write_initial_states(&mut wb, region.clone())?; + box_try!(raft_engine.consume(&mut wb, true)); + } + + self.bootstrap_cluster(region); + + Ok(()) + } + + pub fn bootstrap_conf_change(&mut self) -> u64 { + for (i, (tablet_registry, raft_engine)) in self.engines.iter().enumerate() { + let id = i as u64 + 1; + self.tablet_registries.insert(id, tablet_registry.clone()); + self.raft_engines.insert(id, raft_engine.clone()); + let store_meta = Arc::new(Mutex::new(StoreMeta::new(id))); + self.store_metas.insert(id, store_meta); + self.key_managers_map + .insert(id, self.key_managers[i].clone()); + self.sst_workers_map.insert(id, i); + } + + let node_id = 1; + let region_id = 1; + let peer_id = 1; + + let region = initial_region(node_id, region_id, peer_id); + let raft_engine = self.raft_engines[&node_id].clone(); + let mut wb = raft_engine.log_batch(10); + wb.put_prepare_bootstrap_region(®ion).unwrap(); + write_initial_states(&mut wb, region.clone()).unwrap(); + 
raft_engine.consume(&mut wb, true).unwrap(); + + self.bootstrap_cluster(region); + + region_id + } + + // This is only for fixed id test + fn bootstrap_cluster(&mut self, region: metapb::Region) { + self.pd_client + .bootstrap_cluster(new_store(1, "".to_owned()), region) + .unwrap(); + for id in self.raft_engines.keys() { + let store = new_store(*id, "".to_owned()); + // todo: labels + self.pd_client.put_store(store).unwrap(); + } + } + + pub fn get_engine(&self, node_id: u64) -> WrapFactory { + WrapFactory::new( + self.pd_client.clone(), + self.raft_engines[&node_id].clone(), + self.tablet_registries[&node_id].clone(), + ) + } + + pub fn read( + &self, + // v2 does not need this + _batch_id: Option, + request: RaftCmdRequest, + timeout: Duration, + ) -> Result { + match self.sim.wl().read(request.clone(), timeout) { + Err(e) => { + warn!("failed to read {:?}: {:?}", request, e); + Err(e) + } + a => a, + } + } + + // mixed read and write requests are not supportted + pub fn call_command( + &mut self, + request: RaftCmdRequest, + timeout: Duration, + ) -> Result { + let mut is_read = false; + let mut not_read = false; + for req in request.get_requests() { + match req.get_cmd_type() { + CmdType::Get | CmdType::Snap | CmdType::ReadIndex => { + is_read = true; + } + _ => { + not_read = true; + } + } + } + let ret = if is_read { + assert!(!not_read); + self.sim.wl().read(request.clone(), timeout) + } else if request.has_status_request() { + self.sim.wl().call_query(request.clone(), timeout) + } else { + self.sim.wl().call_command(request.clone(), timeout) + }; + match ret { + Err(e) => { + warn!("failed to call command {:?}: {:?}", request, e); + Err(e) + } + a => a, + } + } + + pub fn call_command_on_leader( + &mut self, + mut request: RaftCmdRequest, + timeout: Duration, + ) -> Result { + let timer = Instant::now(); + let region_id = request.get_header().get_region_id(); + loop { + let leader = match self.leader_of_region(region_id) { + None => return 
Err(Error::NotLeader(region_id, None)), + Some(l) => l, + }; + request.mut_header().set_peer(leader); + let resp = match self.call_command(request.clone(), timeout) { + e @ Err(_) => return e, + Ok(resp) => resp, + }; + if self.refresh_leader_if_needed(&resp, region_id) + && timer.saturating_elapsed() < timeout + { + warn!( + "{:?} is no longer leader, let's retry", + request.get_header().get_peer() + ); + continue; + } + return Ok(resp); + } + } + + pub fn send_raft_msg(&mut self, msg: RaftMessage) -> Result<()> { + self.sim.wl().send_raft_msg(msg) + } + + pub fn call_command_on_node( + &self, + node_id: u64, + request: RaftCmdRequest, + timeout: Duration, + ) -> Result { + match self + .sim + .rl() + .call_command_on_node(node_id, request.clone(), timeout) + { + Err(e) => { + warn!("failed to call command {:?}: {:?}", request, e); + Err(e) + } + a => a, + } + } + + pub fn leader_of_region(&mut self, region_id: u64) -> Option { + let timer = Instant::now_coarse(); + let timeout = Duration::from_secs(5); + let mut store_ids = None; + while timer.saturating_elapsed() < timeout { + match self.voter_store_ids_of_region(region_id) { + None => thread::sleep(Duration::from_millis(10)), + Some(ids) => { + store_ids = Some(ids); + break; + } + } + } + let store_ids = store_ids?; + if let Some(l) = self.leaders.get(®ion_id) { + // leader may be stopped in some tests. + if self.valid_leader_id(region_id, l.get_store_id()) { + return Some(l.clone()); + } + } + self.reset_leader_of_region(region_id); + let mut leader = None; + let mut leaders = HashMap::default(); + + let node_ids = self.sim.rl().get_node_ids(); + // For some tests, we stop the node but pd still has this information, + // and we must skip this. 
+ let alive_store_ids: Vec<_> = store_ids + .iter() + .filter(|id| node_ids.contains(id)) + .cloned() + .collect(); + while timer.saturating_elapsed() < timeout { + for store_id in &alive_store_ids { + let l = match self.query_leader(*store_id, region_id, Duration::from_secs(1)) { + None => continue, + Some(l) => l, + }; + leaders + .entry(l.get_id()) + .or_insert((l, vec![])) + .1 + .push(*store_id); + } + if let Some((_, (l, c))) = leaders.iter().max_by_key(|(_, (_, c))| c.len()) { + if c.contains(&l.get_store_id()) { + leader = Some(l.clone()); + // Technically, correct calculation should use two quorum when in joint + // state. Here just for simplicity. + if c.len() > store_ids.len() / 2 { + break; + } + } + } + debug!("failed to detect leaders"; "leaders" => ?leaders, "store_ids" => ?store_ids); + sleep_ms(10); + leaders.clear(); + } + + if let Some(l) = leader { + self.leaders.insert(region_id, l); + } + + self.leaders.get(®ion_id).cloned() + } + + pub fn query_leader( + &mut self, + store_id: u64, + region_id: u64, + timeout: Duration, + ) -> Option { + // To get region leader, we don't care real peer id, so use 0 instead. + let peer = new_peer(store_id, 0); + let find_leader = new_status_request(region_id, peer, new_region_leader_cmd()); + let mut resp = match self.call_command(find_leader, timeout) { + Ok(resp) => resp, + Err(err) => { + error!( + "fail to get leader of region {} on store {}, error: {:?}", + region_id, store_id, err + ); + return None; + } + }; + let mut region_leader = resp.take_status_response().take_region_leader(); + // NOTE: node id can't be 0. 
+ if self.valid_leader_id(region_id, region_leader.get_leader().get_store_id()) { + Some(region_leader.take_leader()) + } else { + None + } + } + + fn valid_leader_id(&self, region_id: u64, leader_store_id: u64) -> bool { + let store_ids = match self.voter_store_ids_of_region(region_id) { + None => return false, + Some(ids) => ids, + }; + let node_ids = self.sim.rl().get_node_ids(); + store_ids.contains(&leader_store_id) && node_ids.contains(&leader_store_id) + } + + fn voter_store_ids_of_region(&self, region_id: u64) -> Option> { + block_on(self.pd_client.get_region_by_id(region_id)) + .unwrap() + .map(|region| { + region + .get_peers() + .iter() + .flat_map(|p| { + if p.get_role() != PeerRole::Learner { + Some(p.get_store_id()) + } else { + None + } + }) + .collect() + }) + } + + pub fn reset_leader_of_region(&mut self, region_id: u64) { + self.leaders.remove(®ion_id); + } + + // If the resp is "not leader error", get the real leader. + // Otherwise reset or refresh leader if needed. + // Returns if the request should retry. + fn refresh_leader_if_needed(&mut self, resp: &RaftCmdResponse, region_id: u64) -> bool { + if !is_error_response(resp) { + return false; + } + + let err = resp.get_header().get_error(); + if err + .get_message() + .contains("peer has not applied to current term") + { + // leader peer has not applied to current term + return true; + } + + // If command is stale, leadership may have changed. + // EpochNotMatch is not checked as leadership is checked first in raftstore. 
+ if err.has_stale_command() { + self.reset_leader_of_region(region_id); + return true; + } + + if !err.has_not_leader() { + return false; + } + let err = err.get_not_leader(); + if !err.has_leader() { + self.reset_leader_of_region(region_id); + return true; + } + self.leaders.insert(region_id, err.get_leader().clone()); + true + } + + pub fn request( + &mut self, + key: &[u8], + reqs: Vec, + read_quorum: bool, + timeout: Duration, + ) -> RaftCmdResponse { + let timer = Instant::now(); + let mut tried_times = 0; + while tried_times < 2 || timer.saturating_elapsed() < timeout { + tried_times += 1; + let mut region = self.get_region(key); + let region_id = region.get_id(); + let req = new_request( + region_id, + region.take_region_epoch(), + reqs.clone(), + read_quorum, + ); + let result = self.call_command_on_leader(req, timeout); + + let resp = match result { + e @ Err(Error::Timeout(_)) + | e @ Err(Error::NotLeader(..)) + | e @ Err(Error::StaleCommand) => { + warn!("call command failed, retry it"; "err" => ?e); + sleep_ms(100); + continue; + } + Err(e) => panic!("call command failed {:?}", e), + Ok(resp) => resp, + }; + + if resp.get_header().get_error().has_epoch_not_match() { + warn!("seems split, let's retry"); + sleep_ms(100); + continue; + } + if resp + .get_header() + .get_error() + .get_message() + .contains("merging mode") + { + warn!("seems waiting for merge, let's retry"); + sleep_ms(100); + continue; + } + return resp; + } + panic!("request timeout"); + } + + pub fn get_region(&self, key: &[u8]) -> metapb::Region { + self.get_region_with(key, |_| true) + } + + pub fn get_region_id(&self, key: &[u8]) -> u64 { + self.get_region(key).get_id() + } + + // Get region ids of all opened tablets in a store + pub fn region_ids(&self, store_id: u64) -> Vec { + let mut ids = vec![]; + let registry = self.tablet_registries.get(&store_id).unwrap(); + registry.for_each_opened_tablet(|id, _| -> bool { + ids.push(id); + true + }); + ids + } + + pub fn scan( + &self, + 
store_id: u64, + cf: &str, + start_key: &[u8], + end_key: &[u8], + fill_cache: bool, + mut f: F, + ) -> engine_traits::Result<()> + where + F: FnMut(&[u8], &[u8]) -> engine_traits::Result, + { + let region_ids = self.region_ids(store_id); + for id in region_ids { + self.scan_region(store_id, id, cf, start_key, end_key, fill_cache, &mut f)?; + } + Ok(()) + } + + // start_key and end_key should be `data key` + fn scan_region( + &self, + store_id: u64, + region_id: u64, + cf: &str, + start_key: &[u8], + end_key: &[u8], + fill_cache: bool, + f: F, + ) -> engine_traits::Result<()> + where + F: FnMut(&[u8], &[u8]) -> engine_traits::Result, + { + let tablet_registry = self.tablet_registries.get(&store_id).unwrap(); + let tablet = tablet_registry + .get(region_id) + .unwrap() + .latest() + .unwrap() + .clone(); + + let region = block_on(self.pd_client.get_region_by_id(region_id)) + .unwrap() + .unwrap(); + let region_start_key: &[u8] = &data_key(region.get_start_key()); + let region_end_key: &[u8] = &data_key(region.get_end_key()); + + let amended_start_key = if start_key > region_start_key { + start_key + } else { + region_start_key + }; + let amended_end_key = if end_key < region_end_key || region_end_key.is_empty() { + end_key + } else { + region_end_key + }; + + if amended_start_key > amended_end_key { + return Ok(()); + } + + tablet.scan(cf, amended_start_key, amended_end_key, fill_cache, f) + } + + pub fn get_raft_engine(&self, node_id: u64) -> RaftTestEngine { + self.raft_engines[&node_id].clone() + } + + pub fn get_region_epoch(&self, region_id: u64) -> RegionEpoch { + block_on(self.pd_client.get_region_by_id(region_id)) + .unwrap() + .unwrap() + .take_region_epoch() + } + + pub fn region_detail(&mut self, region_id: u64, store_id: u64) -> RegionDetailResponse { + let status_cmd = new_region_detail_cmd(); + let peer = new_peer(store_id, 0); + let req = new_status_request(region_id, peer, status_cmd); + let resp = self.call_command(req, Duration::from_secs(5)); + 
assert!(resp.is_ok(), "{:?}", resp); + + let mut resp = resp.unwrap(); + assert!(resp.has_status_response()); + let mut status_resp = resp.take_status_response(); + assert_eq!(status_resp.get_cmd_type(), StatusCmdType::RegionDetail); + assert!(status_resp.has_region_detail()); + status_resp.take_region_detail() + } + + pub fn truncated_state(&self, region_id: u64, store_id: u64) -> RaftTruncatedState { + self.apply_state(region_id, store_id).take_truncated_state() + } + + pub fn wait_log_truncated(&self, region_id: u64, store_id: u64, index: u64) { + let timer = Instant::now(); + loop { + let truncated_state = self.truncated_state(region_id, store_id); + if truncated_state.get_index() >= index { + return; + } + if timer.saturating_elapsed() >= Duration::from_secs(5) { + panic!( + "[region {}] log is still not truncated to {}: {:?} on store {}", + region_id, index, truncated_state, store_id, + ); + } + thread::sleep(Duration::from_millis(10)); + } + } + + pub fn get(&mut self, key: &[u8]) -> Option> { + self.get_impl(CF_DEFAULT, key, false) + } + + pub fn get_cf(&mut self, cf: &str, key: &[u8]) -> Option> { + self.get_impl(cf, key, false) + } + + pub fn must_get(&mut self, key: &[u8]) -> Option> { + self.get_impl(CF_DEFAULT, key, true) + } + + fn get_impl(&mut self, cf: &str, key: &[u8], read_quorum: bool) -> Option> { + let mut resp = self.request( + key, + vec![new_get_cf_cmd(cf, key)], + read_quorum, + Duration::from_secs(5), + ); + if resp.get_header().has_error() { + panic!("response {:?} has error", resp); + } + assert_eq!(resp.get_responses().len(), 1); + assert_eq!(resp.get_responses()[0].get_cmd_type(), CmdType::Get); + if resp.get_responses()[0].has_get() { + Some(resp.mut_responses()[0].mut_get().take_value()) + } else { + None + } + } + + // Flush the cf of all opened tablets + pub fn must_flush_cf(&mut self, cf: &str, sync: bool) { + for registry in self.tablet_registries.values() { + registry.for_each_opened_tablet(|_id, cached_tablet| -> bool { + if 
let Some(db) = cached_tablet.latest() { + db.flush_cf(cf, sync).unwrap(); + } + true + }); + } + } + + // Get region when the `filter` returns true. + pub fn get_region_with(&self, key: &[u8], filter: F) -> metapb::Region + where + F: Fn(&metapb::Region) -> bool, + { + for _ in 0..100 { + if let Ok(region) = self.pd_client.get_region(key) { + if filter(®ion) { + return region; + } + } + // We may meet range gap after split, so here we will + // retry to get the region again. + sleep_ms(20); + } + + panic!("find no region for {}", log_wrappers::hex_encode_upper(key)); + } + + pub fn must_put(&mut self, key: &[u8], value: &[u8]) { + self.must_put_cf(CF_DEFAULT, key, value); + } + + pub fn must_put_cf(&mut self, cf: &str, key: &[u8], value: &[u8]) { + if let Err(e) = self.batch_put(key, vec![new_put_cf_cmd(cf, key, value)]) { + panic!("has error: {:?}", e); + } + } + + pub fn put(&mut self, key: &[u8], value: &[u8]) -> result::Result<(), PbError> { + self.batch_put(key, vec![new_put_cf_cmd(CF_DEFAULT, key, value)]) + .map(|_| ()) + } + + pub fn batch_put( + &mut self, + region_key: &[u8], + reqs: Vec, + ) -> result::Result { + let resp = self.request(region_key, reqs, false, Duration::from_secs(5)); + if resp.get_header().has_error() { + Err(resp.get_header().get_error().clone()) + } else { + Ok(resp) + } + } + + pub fn must_delete(&mut self, key: &[u8]) { + self.must_delete_cf(CF_DEFAULT, key) + } + + pub fn must_delete_cf(&mut self, cf: &str, key: &[u8]) { + let resp = self.request( + key, + vec![new_delete_cmd(cf, key)], + false, + Duration::from_secs(5), + ); + if resp.get_header().has_error() { + panic!("response {:?} has error", resp); + } + } + + pub fn must_delete_range_cf(&mut self, cf: &str, start: &[u8], end: &[u8]) { + let resp = self.request( + start, + vec![new_delete_range_cmd(cf, start, end)], + false, + Duration::from_secs(5), + ); + if resp.get_header().has_error() { + panic!("response {:?} has error", resp); + } + } + + pub fn 
must_notify_delete_range_cf(&mut self, cf: &str, start: &[u8], end: &[u8]) { + let mut req = new_delete_range_cmd(cf, start, end); + req.mut_delete_range().set_notify_only(true); + let resp = self.request(start, vec![req], false, Duration::from_secs(5)); + if resp.get_header().has_error() { + panic!("response {:?} has error", resp); + } + } + + pub fn apply_state(&self, region_id: u64, store_id: u64) -> RaftApplyState { + self.get_engine(store_id) + .raft_apply_state(region_id) + .unwrap() + .unwrap() + } + + pub fn add_send_filter_on_node(&mut self, node_id: u64, filter: Box) { + self.sim.wl().add_send_filter(node_id, filter); + } + + pub fn clear_send_filter_on_node(&mut self, node_id: u64) { + self.sim.wl().clear_send_filters(node_id); + } + + pub fn add_recv_filter_on_node(&mut self, node_id: u64, filter: Box) { + self.sim.wl().add_recv_filter(node_id, filter); + } + + pub fn clear_recv_filter_on_node(&mut self, node_id: u64) { + self.sim.wl().clear_recv_filters(node_id); + } + + pub fn add_send_filter(&self, factory: F) { + let mut sim = self.sim.wl(); + for node_id in sim.get_node_ids() { + for filter in factory.generate(node_id) { + sim.add_send_filter(node_id, filter); + } + } + } + + pub fn clear_send_filters(&self) { + let mut sim = self.sim.wl(); + for node_id in sim.get_node_ids() { + sim.clear_send_filters(node_id); + } + } + + // it's so common that we provide an API for it + pub fn partition(&mut self, s1: Vec, s2: Vec) { + self.add_send_filter(PartitionFilterFactory::new(s1, s2)); + } + + pub fn transfer_leader(&mut self, region_id: u64, leader: metapb::Peer) { + let epoch = self.get_region_epoch(region_id); + let transfer_leader = new_admin_request(region_id, &epoch, new_transfer_leader_cmd(leader)); + // todo(SpadeA): modify + let resp = self + .call_command_on_leader(transfer_leader, Duration::from_secs(500)) + .unwrap(); + assert_eq!( + resp.get_admin_response().get_cmd_type(), + AdminCmdType::TransferLeader, + "{:?}", + resp + ); + } + + pub fn 
must_transfer_leader(&mut self, region_id: u64, leader: metapb::Peer) { + let timer = Instant::now(); + loop { + self.reset_leader_of_region(region_id); + let cur_leader = self.leader_of_region(region_id); + if let Some(ref cur_leader) = cur_leader { + if cur_leader.get_id() == leader.get_id() + && cur_leader.get_store_id() == leader.get_store_id() + { + return; + } + } + if timer.saturating_elapsed() > Duration::from_secs(5) { + panic!( + "failed to transfer leader to [{}] {:?}, current leader: {:?}", + region_id, leader, cur_leader + ); + } + self.transfer_leader(region_id, leader.clone()); + } + } + + pub fn try_transfer_leader(&mut self, region_id: u64, leader: metapb::Peer) -> RaftCmdResponse { + let epoch = self.get_region_epoch(region_id); + let transfer_leader = new_admin_request(region_id, &epoch, new_transfer_leader_cmd(leader)); + self.call_command_on_leader(transfer_leader, Duration::from_secs(5)) + .unwrap() + } + + // It's similar to `ask_split`, the difference is the msg, it sends, is + // `Msg::SplitRegion`, and `region` will not be embedded to that msg. + // Caller must ensure that the `split_key` is in the `region`. 
+ pub fn split_region( + &mut self, + region: &metapb::Region, + split_key: &[u8], + mut cb: Callback, + ) { + let leader = self.leader_of_region(region.get_id()).unwrap(); + let router = self.sim.rl().get_router(leader.get_store_id()).unwrap(); + let split_key = split_key.to_vec(); + let (split_region_req, mut sub) = PeerMsg::request_split( + region.get_region_epoch().clone(), + vec![split_key], + "test".into(), + ); + + router + .check_send(region.get_id(), split_region_req) + .unwrap(); + + block_on(async { + sub.wait_proposed().await; + cb.invoke_proposed(); + sub.wait_committed().await; + cb.invoke_committed(); + let res = sub.result().await.unwrap(); + cb.invoke_with_response(res) + }); + } + + pub fn must_split(&mut self, region: &metapb::Region, split_key: &[u8]) { + let mut try_cnt = 0; + let split_count = self.pd_client.get_split_count(); + loop { + debug!("asking split"; "region" => ?region, "key" => ?split_key); + // In case ask split message is ignored, we should retry. + if try_cnt % 50 == 0 { + self.reset_leader_of_region(region.get_id()); + let key = split_key.to_vec(); + let check = Box::new(move |write_resp: WriteResponse| { + let mut resp = write_resp.response; + if resp.get_header().has_error() { + let error = resp.get_header().get_error(); + if error.has_epoch_not_match() + || error.has_not_leader() + || error.has_stale_command() + || error + .get_message() + .contains("peer has not applied to current term") + { + warn!("fail to split: {:?}, ignore.", error); + return; + } + panic!("failed to split: {:?}", resp); + } + let admin_resp = resp.mut_admin_response(); + let split_resp = admin_resp.mut_splits(); + let regions = split_resp.get_regions(); + assert_eq!(regions.len(), 2); + assert_eq!(regions[0].get_end_key(), key.as_slice()); + assert_eq!(regions[0].get_end_key(), regions[1].get_start_key()); + }); + if self.leader_of_region(region.get_id()).is_some() { + self.split_region(region, split_key, Callback::write(check)); + } + } + + if 
self.pd_client.check_split(region, split_key) + && self.pd_client.get_split_count() > split_count + { + return; + } + + if try_cnt > 250 { + panic!( + "region {:?} has not been split by {}", + region, + log_wrappers::hex_encode_upper(split_key) + ); + } + try_cnt += 1; + sleep_ms(20); + } + } + + pub fn wait_region_split(&mut self, region: &metapb::Region) { + self.wait_region_split_max_cnt(region, 20, 250, true); + } + + pub fn wait_region_split_max_cnt( + &mut self, + region: &metapb::Region, + itvl_ms: u64, + max_try_cnt: u64, + is_panic: bool, + ) { + let mut try_cnt = 0; + let split_count = self.pd_client.get_split_count(); + loop { + if self.pd_client.get_split_count() > split_count { + match self.pd_client.get_region(region.get_start_key()) { + Err(_) => {} + Ok(left) => { + if left.get_end_key() != region.get_end_key() { + return; + } + } + } + } + + if try_cnt > max_try_cnt { + if is_panic { + panic!( + "region {:?} has not been split after {}ms", + region, + max_try_cnt * itvl_ms + ); + } else { + return; + } + } + try_cnt += 1; + sleep_ms(itvl_ms); + } + } + + fn new_prepare_merge(&self, source: u64, target: u64) -> RaftCmdRequest { + let region = block_on(self.pd_client.get_region_by_id(target)) + .unwrap() + .unwrap(); + let prepare_merge = new_prepare_merge(region); + let source_region = block_on(self.pd_client.get_region_by_id(source)) + .unwrap() + .unwrap(); + new_admin_request( + source_region.get_id(), + source_region.get_region_epoch(), + prepare_merge, + ) + } + + pub fn merge_region(&mut self, source: u64, target: u64, _cb: Callback) { + // FIXME: callback is ignored. 
+ let mut req = self.new_prepare_merge(source, target); + let leader = self.leader_of_region(source).unwrap(); + req.mut_header().set_peer(leader.clone()); + self.sim + .rl() + .async_command_on_node(leader.get_store_id(), req); + } + + pub fn try_merge(&mut self, source: u64, target: u64) -> RaftCmdResponse { + self.call_command_on_leader( + self.new_prepare_merge(source, target), + Duration::from_secs(5), + ) + .unwrap() + } + + pub fn must_try_merge(&mut self, source: u64, target: u64) { + let resp = self.try_merge(source, target); + if is_error_response(&resp) { + panic!( + "{} failed to try merge to {}, resp {:?}", + source, target, resp + ); + } + } + + /// Make sure region not exists on that store. + pub fn must_region_not_exist(&mut self, region_id: u64, store_id: u64) { + let mut try_cnt = 0; + loop { + let status_cmd = new_region_detail_cmd(); + let peer = new_peer(store_id, 0); + let req = new_status_request(region_id, peer, status_cmd); + let resp = self.call_command(req, Duration::from_secs(5)).unwrap(); + if resp.get_header().has_error() && resp.get_header().get_error().has_region_not_found() + { + return; + } + + if try_cnt > 250 { + panic!( + "region {} still exists on store {} after {} tries: {:?}", + region_id, store_id, try_cnt, resp + ); + } + try_cnt += 1; + sleep_ms(20); + } + } + + pub fn get_snap_dir(&self, node_id: u64) -> String { + self.sim.rl().get_snap_dir(node_id) + } + + pub fn get_snap_mgr(&self, node_id: u64) -> TabletSnapManager { + self.sim.rl().get_snap_mgr(node_id).clone() + } + + pub fn get_router(&self, node_id: u64) -> Option> { + self.sim.rl().get_router(node_id) + } + + pub fn refresh_region_bucket_keys( + &mut self, + _region: &metapb::Region, + _buckets: Vec, + _bucket_ranges: Option>, + _expect_buckets: Option, + ) -> u64 { + unimplemented!() + } + + pub fn send_half_split_region_message( + &mut self, + _region: &metapb::Region, + _expected_bucket_ranges: Option>, + ) { + unimplemented!() + } + + pub fn 
wait_tombstone(&self, region_id: u64, peer: metapb::Peer, check_exist: bool) { + let timer = Instant::now(); + let mut state; + loop { + state = self.region_local_state(region_id, peer.get_store_id()); + if state.get_state() == PeerState::Tombstone + && (!check_exist || state.get_region().get_peers().contains(&peer)) + { + return; + } + if timer.saturating_elapsed() > Duration::from_secs(5) { + break; + } + thread::sleep(Duration::from_millis(10)); + } + panic!( + "{:?} is still not gc in region {} {:?}", + peer, region_id, state + ); + } + + pub fn wait_destroy_and_clean(&self, region_id: u64, peer: metapb::Peer) { + let timer = Instant::now(); + self.wait_tombstone(region_id, peer.clone(), false); + let mut state; + loop { + state = self.get_raft_local_state(region_id, peer.get_store_id()); + if state.is_none() { + return; + } + if timer.saturating_elapsed() > Duration::from_secs(5) { + break; + } + thread::sleep(Duration::from_millis(10)); + } + panic!( + "{:?} is still not cleaned in region {} {:?}", + peer, region_id, state + ); + } + + pub fn region_local_state(&self, region_id: u64, store_id: u64) -> RegionLocalState { + self.get_engine(store_id) + .region_local_state(region_id) + .unwrap() + .unwrap() + } + + pub fn get_raft_local_state(&self, region_id: u64, store_id: u64) -> Option { + self.get_engine(store_id) + .raft_local_state(region_id) + .unwrap() + } + + pub fn raft_local_state(&self, region_id: u64, store_id: u64) -> RaftLocalState { + self.get_raft_local_state(region_id, store_id).unwrap() + } + + pub fn shutdown(&mut self) { + debug!("about to shutdown cluster"); + let keys = match self.sim.read() { + Ok(s) => s.get_node_ids(), + Err(_) => { + safe_panic!("failed to acquire read lock"); + // Leave the resource to avoid double panic. 
+ return; + } + }; + for id in keys { + self.stop_node(id); + } + self.leaders.clear(); + for store_meta in self.store_metas.values() { + while Arc::strong_count(store_meta) != 1 { + std::thread::sleep(Duration::from_millis(10)); + } + } + self.store_metas.clear(); + for sst_worker in self.sst_workers.drain(..) { + sst_worker.stop_worker(); + } + + debug!("all nodes are shut down."); + } +} + +pub fn bootstrap_store( + raft_engine: &ER, + cluster_id: u64, + store_id: u64, +) -> Result<()> { + let mut ident = StoreIdent::default(); + + if !raft_engine.is_empty()? { + return Err(box_err!("store is not empty and has already had data")); + } + + ident.set_cluster_id(cluster_id); + ident.set_store_id(store_id); + + let mut lb = raft_engine.log_batch(1); + lb.put_store_ident(&ident)?; + raft_engine.consume(&mut lb, true)?; + + Ok(()) +} + +impl, EK: KvEngine> Drop for Cluster { + fn drop(&mut self) { + test_util::clear_failpoints(); + self.shutdown(); + } +} + +pub struct WrapFactory { + pd_client: Arc, + raft_engine: RaftTestEngine, + tablet_registry: TabletRegistry, +} + +impl WrapFactory { + pub fn new( + pd_client: Arc, + raft_engine: RaftTestEngine, + tablet_registry: TabletRegistry, + ) -> Self { + Self { + raft_engine, + tablet_registry, + pd_client, + } + } + + fn region_id_of_key(&self, mut key: &[u8]) -> u64 { + assert!(validate_data_key(key)); + key = &key[DATA_PREFIX_KEY.len()..]; + self.pd_client.get_region(key).unwrap().get_id() + } + + fn get_tablet(&self, key: &[u8]) -> Option { + // todo: unwrap + let region_id = self.region_id_of_key(key); + self.tablet_registry.get(region_id)?.latest().cloned() + } + + pub fn get_tablet_by_id(&self, id: u64) -> Option { + self.tablet_registry.get(id)?.latest().cloned() + } +} + +impl Peekable for WrapFactory { + type DbVector = EK::DbVector; + + fn get_value_opt( + &self, + opts: &ReadOptions, + key: &[u8], + ) -> engine_traits::Result> { + let region_id = self.region_id_of_key(key); + + if let Ok(Some(state)) = 
self.region_local_state(region_id) { + if state.state == PeerState::Tombstone { + return Ok(None); + } + } + + match self.get_tablet(key) { + Some(tablet) => tablet.get_value_opt(opts, key), + _ => Ok(None), + } + } + + fn get_value_cf_opt( + &self, + opts: &ReadOptions, + cf: &str, + key: &[u8], + ) -> engine_traits::Result> { + let region_id = self.region_id_of_key(key); + + if let Ok(Some(state)) = self.region_local_state(region_id) { + if state.state == PeerState::Tombstone { + return Ok(None); + } + } + + match self.get_tablet(key) { + Some(tablet) => tablet.get_value_cf_opt(opts, cf, key), + _ => Ok(None), + } + } + + fn get_msg_cf( + &self, + _cf: &str, + _key: &[u8], + ) -> engine_traits::Result> { + unimplemented!() + } +} + +impl SyncMutable for WrapFactory { + fn put(&self, key: &[u8], value: &[u8]) -> engine_traits::Result<()> { + match self.get_tablet(key) { + Some(tablet) => tablet.put(key, value), + _ => unimplemented!(), + } + } + + fn put_cf(&self, cf: &str, key: &[u8], value: &[u8]) -> engine_traits::Result<()> { + match self.get_tablet(key) { + Some(tablet) => tablet.put_cf(cf, key, value), + _ => unimplemented!(), + } + } + + fn delete(&self, key: &[u8]) -> engine_traits::Result<()> { + match self.get_tablet(key) { + Some(tablet) => tablet.delete(key), + _ => unimplemented!(), + } + } + + fn delete_cf(&self, cf: &str, key: &[u8]) -> engine_traits::Result<()> { + match self.get_tablet(key) { + Some(tablet) => tablet.delete_cf(cf, key), + _ => unimplemented!(), + } + } + + fn delete_range(&self, _begin_key: &[u8], _end_key: &[u8]) -> engine_traits::Result<()> { + unimplemented!() + } + + fn delete_range_cf( + &self, + _cf: &str, + _begin_key: &[u8], + _end_key: &[u8], + ) -> engine_traits::Result<()> { + unimplemented!() + } +} + +impl RawEngine for WrapFactory { + fn region_local_state( + &self, + region_id: u64, + ) -> engine_traits::Result> { + self.raft_engine.get_region_state(region_id, u64::MAX) + } + + fn raft_apply_state(&self, region_id: 
u64) -> engine_traits::Result> { + self.raft_engine.get_apply_state(region_id, u64::MAX) + } + + fn raft_local_state(&self, region_id: u64) -> engine_traits::Result> { + self.raft_engine.get_raft_state(region_id) + } +} diff --git a/components/test_raftstore-v2/src/lib.rs b/components/test_raftstore-v2/src/lib.rs new file mode 100644 index 00000000000..685affe45d0 --- /dev/null +++ b/components/test_raftstore-v2/src/lib.rs @@ -0,0 +1,13 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +#![allow(incomplete_features)] +#![feature(type_alias_impl_trait)] +#![feature(return_position_impl_trait_in_trait)] +#![feature(let_chains)] + +mod cluster; +mod node; +mod server; +mod transport_simulate; +pub mod util; + +pub use crate::{cluster::*, node::*, server::*, transport_simulate::*, util::*}; diff --git a/components/test_raftstore-v2/src/node.rs b/components/test_raftstore-v2/src/node.rs new file mode 100644 index 00000000000..ffa38b51796 --- /dev/null +++ b/components/test_raftstore-v2/src/node.rs @@ -0,0 +1,485 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + path::Path, + sync::{Arc, Mutex, RwLock}, +}; + +use collections::{HashMap, HashSet}; +use concurrency_manager::ConcurrencyManager; +use encryption_export::DataKeyManager; +use engine_rocks::RocksEngine; +use engine_test::raft::RaftTestEngine; +use engine_traits::{KvEngine, RaftEngine, RaftEngineReadOnly, TabletRegistry}; +use futures::Future; +use kvproto::{ + kvrpcpb::ApiVersion, + raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, + raft_serverpb::RaftMessage, +}; +use raft::{prelude::MessageType, SnapshotStatus}; +use raftstore::{ + coprocessor::CoprocessorHost, + errors::Error as RaftError, + store::{ + AutoSplitController, GlobalReplicationState, RegionSnapshot, SplitConfigManager, + TabletSnapKey, TabletSnapManager, Transport, + }, + Result, +}; +use raftstore_v2::{ + router::{PeerMsg, RaftRouter}, + StateStorage, StoreMeta, StoreRouter, +}; +use resource_control::ResourceGroupManager; +use resource_metering::CollectorRegHandle; +use tempfile::TempDir; +use test_pd_client::TestPdClient; +use test_raftstore::{Config, Filter}; +use tikv::{ + config::{ConfigController, Module}, + import::SstImporter, + server::{ + raftkv::ReplicaReadLockChecker, tablet_snap::copy_tablet_snapshot, NodeV2, + Result as ServerResult, + }, +}; +use tikv_util::{ + box_err, + config::VersionTrack, + worker::{Builder as WorkerBuilder, LazyWorker}, +}; + +use crate::{Cluster, RaftStoreRouter, SimulateTransport, Simulator, SnapshotRouter}; + +#[derive(Clone)] +pub struct ChannelTransport { + core: Arc>>, +} + +impl ChannelTransport { + pub fn new() -> Self { + ChannelTransport { + core: Arc::new(Mutex::new(ChannelTransportCore { + snap_paths: HashMap::default(), + routers: HashMap::default(), + })), + } + } + + pub fn core(&self) -> &Arc>> { + &self.core + } +} + +impl Transport for ChannelTransport { + fn send(&mut self, msg: RaftMessage) -> raftstore::Result<()> { + let from_store = msg.get_from_peer().get_store_id(); + let to_store = msg.get_to_peer().get_store_id(); + let 
to_peer_id = msg.get_to_peer().get_id(); + let region_id = msg.get_region_id(); + let is_snapshot = msg.get_message().get_msg_type() == MessageType::MsgSnapshot; + + if is_snapshot { + let snap = msg.get_message().get_snapshot(); + let key = TabletSnapKey::from_region_snap( + msg.get_region_id(), + msg.get_to_peer().get_id(), + snap, + ); + let sender_snap_mgr = match self.core.lock().unwrap().snap_paths.get(&from_store) { + Some(snap_mgr) => snap_mgr.0.clone(), + None => return Err(box_err!("missing snap manager for store {}", from_store)), + }; + let recver_snap_mgr = match self.core.lock().unwrap().snap_paths.get(&to_store) { + Some(snap_mgr) => snap_mgr.0.clone(), + None => return Err(box_err!("missing snap manager for store {}", to_store)), + }; + + if let Err(e) = + copy_tablet_snapshot(key, msg.clone(), &sender_snap_mgr, &recver_snap_mgr) + { + return Err(box_err!("copy tablet snapshot failed: {:?}", e)); + } + } + + let core = self.core.lock().unwrap(); + match core.routers.get(&to_store) { + Some(h) => { + h.send_raft_msg(msg)?; + if is_snapshot { + let _ = core.routers[&from_store].report_snapshot_status( + region_id, + to_peer_id, + SnapshotStatus::Finish, + ); + } + Ok(()) + } + _ => Err(box_err!("missing sender for store {}", to_store)), + } + } + + fn set_store_allowlist(&mut self, _allowlist: Vec) { + unimplemented!(); + } + + fn need_flush(&self) -> bool { + false + } + + fn flush(&mut self) {} +} + +pub struct ChannelTransportCore { + pub snap_paths: HashMap, + pub routers: HashMap>>, +} + +impl Default for ChannelTransport { + fn default() -> Self { + Self::new() + } +} + +type SimulateChannelTransport = SimulateTransport>; + +pub struct NodeCluster { + trans: ChannelTransport, + pd_client: Arc, + nodes: HashMap>, + simulate_trans: HashMap>, + concurrency_managers: HashMap, + snap_mgrs: HashMap, +} + +impl NodeCluster { + pub fn new(pd_client: Arc) -> Self { + NodeCluster { + trans: ChannelTransport::new(), + pd_client, + nodes: 
HashMap::default(), + simulate_trans: HashMap::default(), + concurrency_managers: HashMap::default(), + snap_mgrs: HashMap::default(), + } + } +} + +impl Simulator for NodeCluster { + fn get_node_ids(&self) -> HashSet { + self.nodes.keys().cloned().collect() + } + + fn add_send_filter(&mut self, node_id: u64, filter: Box) { + self.simulate_trans + .get_mut(&node_id) + .unwrap() + .add_filter(filter); + } + + fn clear_send_filters(&mut self, node_id: u64) { + self.simulate_trans + .get_mut(&node_id) + .unwrap() + .clear_filters(); + } + + fn run_node( + &mut self, + node_id: u64, + cfg: Config, + store_meta: Arc>>, + key_manager: Option>, + raft_engine: RaftTestEngine, + tablet_registry: TabletRegistry, + _resource_manager: &Option>, + ) -> ServerResult { + assert!(!self.nodes.contains_key(&node_id)); + let pd_worker = LazyWorker::new("test-pd-worker"); + + let simulate_trans = SimulateTransport::new(self.trans.clone()); + let mut raft_store = cfg.raft_store.clone(); + raft_store + .validate( + cfg.coprocessor.region_split_size(), + cfg.coprocessor.enable_region_bucket(), + cfg.coprocessor.region_bucket_size, + ) + .unwrap(); + + let mut node = NodeV2::new(&cfg.server, self.pd_client.clone(), None); + node.try_bootstrap_store(&raft_store, &raft_engine).unwrap(); + assert_eq!(node.id(), node_id); + + tablet_registry + .tablet_factory() + .set_state_storage(Arc::new(StateStorage::new( + raft_engine.clone(), + node.router().clone(), + ))); + + // todo: node id 0 + let (snap_mgr, snap_mgs_path) = if node_id == 0 + || !self + .trans + .core + .lock() + .unwrap() + .snap_paths + .contains_key(&node_id) + { + let tmp = test_util::temp_dir("test_cluster", cfg.prefer_mem); + let snap_path = tmp.path().to_str().unwrap().to_owned(); + ( + TabletSnapManager::new(snap_path, key_manager.clone())?, + Some(tmp), + ) + } else { + let trans = self.trans.core.lock().unwrap(); + let &(ref snap_mgr, _) = &trans.snap_paths[&node_id]; + (snap_mgr.clone(), None) + }; + 
self.snap_mgrs.insert(node_id, snap_mgr.clone()); + + let raft_router = RaftRouter::new_with_store_meta(node.router().clone(), store_meta); + // Create coprocessor. + let mut coprocessor_host = + CoprocessorHost::new(raft_router.store_router().clone(), cfg.coprocessor.clone()); + + // if let Some(f) = self.post_create_coprocessor_host.as_ref() { + // f(node_id, &mut coprocessor_host); + // } + + let cm = ConcurrencyManager::new(1.into()); + self.concurrency_managers.insert(node_id, cm.clone()); + + ReplicaReadLockChecker::new(cm.clone()).register(&mut coprocessor_host); + + let cfg_controller = ConfigController::new(cfg.tikv.clone()); + // cfg_controller.register( + // Module::Coprocessor, + // Box::new(SplitCheckConfigManager(split_scheduler.clone())), + // ); + + let split_config_manager = + SplitConfigManager::new(Arc::new(VersionTrack::new(cfg.tikv.split.clone()))); + cfg_controller.register(Module::Split, Box::new(split_config_manager.clone())); + + let auto_split_controller = AutoSplitController::new( + split_config_manager, + cfg.tikv.server.grpc_concurrency, + cfg.tikv.readpool.unified.max_thread_count, + // todo: Is None sufficient for test? 
+ None, + ); + let importer = { + let dir = Path::new(raft_engine.get_engine_path()).join("../import-sst"); + Arc::new( + SstImporter::new( + &cfg.import, + dir, + key_manager.clone(), + cfg.storage.api_version(), + ) + .unwrap(), + ) + }; + + let bg_worker = WorkerBuilder::new("background").thread_count(2).create(); + let state: Arc> = Arc::default(); + node.start( + raft_engine.clone(), + tablet_registry, + &raft_router, + simulate_trans.clone(), + snap_mgr.clone(), + cm, + None, + coprocessor_host, + auto_split_controller, + CollectorRegHandle::new_for_test(), + bg_worker, + pd_worker, + Arc::new(VersionTrack::new(raft_store)), + &state, + importer, + key_manager, + )?; + assert!( + raft_engine + .get_prepare_bootstrap_region() + .unwrap() + .is_none() + ); + assert!(node_id == 0 || node_id == node.id()); + let node_id = node.id(); + + let region_split_size = cfg.coprocessor.region_split_size(); + let enable_region_bucket = cfg.coprocessor.enable_region_bucket(); + let region_bucket_size = cfg.coprocessor.region_bucket_size; + let mut raftstore_cfg = cfg.tikv.raft_store; + raftstore_cfg + .validate(region_split_size, enable_region_bucket, region_bucket_size) + .unwrap(); + + // let raft_store = Arc::new(VersionTrack::new(raftstore_cfg)); + // cfg_controller.register( + // Module::Raftstore, + // Box::new(RaftstoreConfigManager::new( + // node.refresh_config_scheduler(), + // raft_store, + // )), + // ); + + if let Some(tmp) = snap_mgs_path { + self.trans + .core + .lock() + .unwrap() + .snap_paths + .insert(node_id, (snap_mgr, tmp)); + } + + self.trans + .core + .lock() + .unwrap() + .routers + .insert(node_id, SimulateTransport::new(raft_router)); + + self.nodes.insert(node_id, node); + self.simulate_trans.insert(node_id, simulate_trans); + Ok(node_id) + } + + fn async_snapshot( + &mut self, + request: RaftCmdRequest, + ) -> impl Future, RaftCmdResponse>> + Send + { + let node_id = request.get_header().get_peer().get_store_id(); + if !self + .trans + .core + 
.lock() + .unwrap() + .routers + .contains_key(&node_id) + { + let mut resp = RaftCmdResponse::default(); + let e: RaftError = box_err!("missing sender for store {}", node_id); + resp.mut_header().set_error(e.into()); + // return async move {Err(resp)}; + } + + let mut router = { + let mut guard = self.trans.core.lock().unwrap(); + guard.routers.get_mut(&node_id).unwrap().clone() + }; + + router.snapshot(request) + } + + fn async_peer_msg_on_node(&self, node_id: u64, region_id: u64, msg: PeerMsg) -> Result<()> { + if !self + .trans + .core + .lock() + .unwrap() + .routers + .contains_key(&node_id) + { + return Err(box_err!("missing sender for store {}", node_id)); + } + + let router = self + .trans + .core + .lock() + .unwrap() + .routers + .get(&node_id) + .cloned() + .unwrap(); + + router.send_peer_msg(region_id, msg) + } + + fn stop_node(&mut self, node_id: u64) { + if let Some(mut node) = self.nodes.remove(&node_id) { + node.stop(); + } + self.trans + .core + .lock() + .unwrap() + .routers + .remove(&node_id) + .unwrap(); + } + + fn get_router(&self, node_id: u64) -> Option> { + self.nodes.get(&node_id).map(|node| node.router().clone()) + } + + fn get_snap_dir(&self, node_id: u64) -> String { + self.trans.core.lock().unwrap().snap_paths[&node_id] + .0 + .root_path() + .to_str() + .unwrap() + .to_owned() + } + + fn get_snap_mgr(&self, node_id: u64) -> &TabletSnapManager { + self.snap_mgrs.get(&node_id).unwrap() + } + + fn add_recv_filter(&mut self, node_id: u64, filter: Box) { + let mut trans = self.trans.core.lock().unwrap(); + trans.routers.get_mut(&node_id).unwrap().add_filter(filter); + } + + fn clear_recv_filters(&mut self, node_id: u64) { + let mut trans = self.trans.core.lock().unwrap(); + trans.routers.get_mut(&node_id).unwrap().clear_filters(); + } + + fn send_raft_msg(&mut self, msg: RaftMessage) -> Result<()> { + self.trans.send(msg) + } +} + +// Compare to server cluster, node cluster does not have server layer and +// storage layer. 
+pub fn new_node_cluster(id: u64, count: usize) -> Cluster, RocksEngine> { + let pd_client = Arc::new(TestPdClient::new(id, false)); + let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); + Cluster::new( + id, + count, + sim, + pd_client, + ApiVersion::V1, + Box::new(&crate::create_test_engine), + ) +} + +// This cluster does not support batch split, we expect it to transfer the +// `BatchSplit` request to `split` request +pub fn new_incompatible_node_cluster( + id: u64, + count: usize, +) -> Cluster, RocksEngine> { + let pd_client = Arc::new(TestPdClient::new(id, true)); + let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); + Cluster::new( + id, + count, + sim, + pd_client, + ApiVersion::V1, + Box::new(&crate::create_test_engine), + ) +} diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs new file mode 100644 index 00000000000..35671c227f4 --- /dev/null +++ b/components/test_raftstore-v2/src/server.rs @@ -0,0 +1,1039 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + path::Path, + sync::{Arc, Mutex, RwLock}, + thread, + time::Duration, +}; + +use api_version::{dispatch_api_version, KvFormat}; +use causal_ts::CausalTsProviderImpl; +use collections::{HashMap, HashSet}; +use concurrency_manager::ConcurrencyManager; +use encryption_export::DataKeyManager; +use engine_rocks::RocksEngine; +use engine_test::raft::RaftTestEngine; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; +use futures::{executor::block_on, Future}; +use grpcio::{ChannelBuilder, EnvBuilder, Environment, Error as GrpcError, Service}; +use grpcio_health::HealthService; +use kvproto::{ + deadlock_grpc::create_deadlock, + debugpb_grpc::DebugClient, + diagnosticspb_grpc::create_diagnostics, + import_sstpb_grpc::create_import_sst, + kvrpcpb::{ApiVersion, Context}, + metapb, + raft_cmdpb::RaftCmdResponse, + raft_serverpb::RaftMessage, + tikvpb_grpc::TikvClient, +}; +use pd_client::PdClient; +use raftstore::{ + coprocessor::CoprocessorHost, + errors::Error as RaftError, + store::{ + region_meta, AutoSplitController, CheckLeaderRunner, FlowStatsReporter, ReadStats, + RegionSnapshot, TabletSnapManager, WriteStats, + }, + RegionInfoAccessor, +}; +use raftstore_v2::{router::RaftRouter, StateStorage, StoreMeta, StoreRouter}; +use resource_control::ResourceGroupManager; +use resource_metering::{CollectorRegHandle, ResourceTagFactory}; +use security::SecurityManager; +use slog_global::debug; +use tempfile::TempDir; +use test_pd_client::TestPdClient; +use test_raftstore::{filter_send, AddressMap, Config, Filter}; +use tikv::{ + coprocessor, coprocessor_v2, + import::{ImportSstService, SstImporter}, + read_pool::ReadPool, + server::{ + gc_worker::GcWorker, load_statistics::ThreadLoadPool, lock_manager::LockManager, + raftkv::ReplicaReadLockChecker, resolve, service::DiagnosticsService, ConnectionBuilder, + Error, Extension, NodeV2, PdStoreAddrResolver, RaftClient, RaftKv2, Result as ServerResult, + Server, ServerTransport, + }, + storage::{ + self, + 
kv::{FakeExtension, LocalTablets, RaftExtension, SnapContext}, + txn::flow_controller::{EngineFlowController, FlowController}, + Engine, Storage, + }, +}; +use tikv_util::{ + box_err, + config::VersionTrack, + quota_limiter::QuotaLimiter, + sys::thread::ThreadBuildWrapper, + thd_name, + worker::{Builder as WorkerBuilder, LazyWorker}, + Either, HandyRwLock, +}; +use tokio::runtime::Builder as TokioBuilder; +use txn_types::TxnExtraScheduler; + +use crate::{Cluster, RaftStoreRouter, SimulateTransport, Simulator, SnapshotRouter}; + +#[derive(Clone)] +struct DummyReporter; + +impl FlowStatsReporter for DummyReporter { + fn report_read_stats(&self, _read_stats: ReadStats) {} + fn report_write_stats(&self, _write_stats: WriteStats) {} +} + +type SimulateRaftExtension = as Engine>::RaftExtension; +type SimulateStoreTransport = SimulateTransport>; +type SimulateServerTransport = + SimulateTransport, PdStoreAddrResolver>>; + +pub type SimulateEngine = RaftKv2; + +// TestRaftKvv2 behaves the same way with RaftKv2, except that it has filters +// that can mock various network conditions. 
+#[derive(Clone)] +pub struct TestRaftKv2 { + raftkv: SimulateEngine, + filters: Arc>>>, +} + +impl TestRaftKv2 { + pub fn new( + raftkv: SimulateEngine, + filters: Arc>>>, + ) -> TestRaftKv2 { + TestRaftKv2 { raftkv, filters } + } + + pub fn set_txn_extra_scheduler(&mut self, txn_extra_scheduler: Arc) { + self.raftkv.set_txn_extra_scheduler(txn_extra_scheduler); + } +} + +impl Engine for TestRaftKv2 { + type Snap = RegionSnapshot; + type Local = EK; + + fn kv_engine(&self) -> Option { + self.raftkv.kv_engine() + } + + type RaftExtension = TestExtension; + fn raft_extension(&self) -> Self::RaftExtension { + TestExtension::new(self.raftkv.raft_extension(), self.filters.clone()) + } + + fn modify_on_kv_engine( + &self, + region_modifies: HashMap>, + ) -> storage::kv::Result<()> { + self.raftkv.modify_on_kv_engine(region_modifies) + } + + type SnapshotRes = as Engine>::SnapshotRes; + fn async_snapshot(&mut self, ctx: SnapContext<'_>) -> Self::SnapshotRes { + self.raftkv.async_snapshot(ctx) + } + + type WriteRes = as Engine>::WriteRes; + fn async_write( + &self, + ctx: &Context, + batch: storage::kv::WriteData, + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes { + self.raftkv.async_write(ctx, batch, subscribed, on_applied) + } + + #[inline] + fn precheck_write_with_ctx(&self, ctx: &Context) -> storage::kv::Result<()> { + self.raftkv.precheck_write_with_ctx(ctx) + } + + #[inline] + fn schedule_txn_extra(&self, txn_extra: txn_types::TxnExtra) { + self.raftkv.schedule_txn_extra(txn_extra) + } +} + +#[derive(Clone)] +pub struct TestExtension { + extension: Extension, + filters: Arc>>>, +} + +impl TestExtension { + pub fn new( + extension: Extension, + filters: Arc>>>, + ) -> Self { + TestExtension { extension, filters } + } +} + +impl RaftExtension for TestExtension { + fn feed(&self, msg: RaftMessage, key_message: bool) { + let send = |msg| -> raftstore::Result<()> { + self.extension.feed(msg, key_message); + Ok(()) + }; + + let _ = filter_send(&self.filters, 
msg, send); + } + + #[inline] + fn report_reject_message(&self, region_id: u64, from_peer_id: u64) { + self.extension + .report_reject_message(region_id, from_peer_id) + } + + #[inline] + fn report_peer_unreachable(&self, region_id: u64, to_peer_id: u64) { + self.extension + .report_peer_unreachable(region_id, to_peer_id) + } + + #[inline] + fn report_store_unreachable(&self, store_id: u64) { + self.extension.report_store_unreachable(store_id) + } + + #[inline] + fn report_snapshot_status( + &self, + region_id: u64, + to_peer_id: u64, + status: raft::SnapshotStatus, + ) { + self.extension + .report_snapshot_status(region_id, to_peer_id, status) + } + + #[inline] + fn report_resolved(&self, store_id: u64, group_id: u64) { + self.extension.report_resolved(store_id, group_id) + } + + #[inline] + fn split( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + split_keys: Vec>, + source: String, + ) -> futures::future::BoxFuture<'static, storage::kv::Result>> { + self.extension + .split(region_id, region_epoch, split_keys, source) + } + + fn query_region( + &self, + region_id: u64, + ) -> futures::future::BoxFuture<'static, storage::kv::Result> { + self.extension.query_region(region_id) + } +} + +pub struct ServerMeta { + node: NodeV2, + server: Server>, + sim_router: SimulateStoreTransport, + sim_trans: SimulateServerTransport, + raw_router: StoreRouter, + gc_worker: GcWorker>, + rts_worker: Option>, + rsmeter_cleanup: Box, +} + +type PendingServices = Vec Service>>; + +pub struct ServerCluster { + metas: HashMap>, + addrs: AddressMap, + pub storages: HashMap>, + pub region_info_accessors: HashMap, + snap_paths: HashMap, + snap_mgrs: HashMap, + pd_client: Arc, + raft_clients: HashMap>, + conn_builder: ConnectionBuilder, + concurrency_managers: HashMap, + env: Arc, + pub pending_services: HashMap, + pub health_services: HashMap, + pub security_mgr: Arc, + pub txn_extra_schedulers: HashMap>, + pub causal_ts_providers: HashMap>, +} + +impl ServerCluster { + 
pub fn new(pd_client: Arc) -> Self { + let env = Arc::new( + EnvBuilder::new() + .cq_count(2) + .name_prefix(thd_name!("server-cluster")) + .build(), + ); + let security_mgr = Arc::new(SecurityManager::new(&Default::default()).unwrap()); + let map = AddressMap::default(); + // We don't actually need to handle snapshot message, just create a dead worker + // to make it compile. + let worker = LazyWorker::new("snap-worker"); + let conn_builder = ConnectionBuilder::new( + env.clone(), + Arc::default(), + security_mgr.clone(), + map.clone(), + FakeExtension {}, + worker.scheduler(), + Arc::new(ThreadLoadPool::with_threshold(usize::MAX)), + ); + ServerCluster { + metas: HashMap::default(), + addrs: map, + pd_client, + security_mgr, + storages: HashMap::default(), + region_info_accessors: HashMap::default(), + snap_mgrs: HashMap::default(), + snap_paths: HashMap::default(), + pending_services: HashMap::default(), + health_services: HashMap::default(), + raft_clients: HashMap::default(), + conn_builder, + concurrency_managers: HashMap::default(), + env, + txn_extra_schedulers: HashMap::default(), + causal_ts_providers: HashMap::default(), + } + } + + pub fn get_addr(&self, node_id: u64) -> String { + self.addrs.get(node_id).unwrap() + } + + pub fn run_node_impl( + &mut self, + node_id: u64, + mut cfg: Config, + store_meta: Arc>>, + key_manager: Option>, + raft_engine: RaftTestEngine, + tablet_registry: TabletRegistry, + resource_manager: &Option>, + ) -> ServerResult { + let (snap_mgr, snap_mgs_path) = if !self.snap_mgrs.contains_key(&node_id) { + let tmp = test_util::temp_dir("test_cluster", cfg.prefer_mem); + let snap_path = tmp.path().to_str().unwrap().to_owned(); + ( + TabletSnapManager::new(snap_path, key_manager.clone())?, + Some(tmp), + ) + } else { + (self.snap_mgrs[&node_id].clone(), None) + }; + + let bg_worker = WorkerBuilder::new("background").thread_count(2).create(); + + if cfg.server.addr == "127.0.0.1:0" { + // Now we cache the store address, so here we 
should re-use last + // listening address for the same store. + if let Some(addr) = self.addrs.get(node_id) { + cfg.server.addr = addr; + } else { + cfg.server.addr = format!("127.0.0.1:{}", test_util::alloc_port()); + } + } + + // Create node. + let mut raft_store = cfg.raft_store.clone(); + raft_store + .validate( + cfg.coprocessor.region_split_size(), + cfg.coprocessor.enable_region_bucket(), + cfg.coprocessor.region_bucket_size, + ) + .unwrap(); + + let mut node = NodeV2::new(&cfg.server, self.pd_client.clone(), None); + node.try_bootstrap_store(&raft_store, &raft_engine).unwrap(); + assert_eq!(node.id(), node_id); + + tablet_registry + .tablet_factory() + .set_state_storage(Arc::new(StateStorage::new( + raft_engine.clone(), + node.router().clone(), + ))); + + let server_cfg = Arc::new(VersionTrack::new(cfg.server.clone())); + + let raft_router = + RaftRouter::new_with_store_meta(node.router().clone(), store_meta.clone()); + + // Create coprocessor. + let mut coprocessor_host = + CoprocessorHost::new(raft_router.store_router().clone(), cfg.coprocessor.clone()); + + let region_info_accessor = RegionInfoAccessor::new(&mut coprocessor_host); + + let sim_router = SimulateTransport::new(raft_router.clone()); + let mut raft_kv_v2 = TestRaftKv2::new( + RaftKv2::new(raft_router.clone(), region_info_accessor.region_leaders()), + sim_router.filters().clone(), + ); + + // Create storage. 
+ let pd_worker = LazyWorker::new("test-pd-worker"); + let pd_sender = raftstore_v2::PdReporter::new( + pd_worker.scheduler(), + slog_global::borrow_global().new(slog::o!()), + ); + let storage_read_pool = ReadPool::from(storage::build_read_pool( + &tikv::config::StorageReadPoolConfig::default_for_test(), + pd_sender, + raft_kv_v2.clone(), + )); + + if let Some(scheduler) = self.txn_extra_schedulers.remove(&node_id) { + raft_kv_v2.set_txn_extra_scheduler(scheduler); + } + + let latest_ts = + block_on(self.pd_client.get_tso()).expect("failed to get timestamp from PD"); + let concurrency_manager = ConcurrencyManager::new(latest_ts); + + let (tx, _rx) = std::sync::mpsc::channel(); + let mut gc_worker = GcWorker::new( + raft_kv_v2.clone(), + tx, + cfg.gc.clone(), + Default::default(), + Arc::new(region_info_accessor.clone()), + ); + gc_worker.start(node_id).unwrap(); + + let rts_worker = if cfg.resolved_ts.enable { + // Resolved ts worker + let mut rts_worker = LazyWorker::new("resolved-ts"); + let rts_ob = resolved_ts::Observer::new(rts_worker.scheduler()); + rts_ob.register_to(&mut coprocessor_host); + // resolved ts endpoint needs store id. 
+ store_meta.lock().unwrap().store_id = node_id; + // Resolved ts endpoint + let rts_endpoint = resolved_ts::Endpoint::new( + &cfg.resolved_ts, + rts_worker.scheduler(), + raft_router.clone(), + store_meta.clone(), + self.pd_client.clone(), + concurrency_manager.clone(), + self.env.clone(), + self.security_mgr.clone(), + ); + // Start the worker + rts_worker.start(rts_endpoint); + Some(rts_worker) + } else { + None + }; + + if ApiVersion::V2 == F::TAG { + let casual_ts_provider: Arc = Arc::new( + block_on(causal_ts::BatchTsoProvider::new_opt( + self.pd_client.clone(), + cfg.causal_ts.renew_interval.0, + cfg.causal_ts.alloc_ahead_buffer.0, + cfg.causal_ts.renew_batch_min_size, + cfg.causal_ts.renew_batch_max_size, + )) + .unwrap() + .into(), + ); + self.causal_ts_providers.insert(node_id, casual_ts_provider); + } + + // Start resource metering. + let (res_tag_factory, collector_reg_handle, rsmeter_cleanup) = + self.init_resource_metering(&cfg.resource_metering); + + let check_leader_runner = CheckLeaderRunner::new(store_meta, coprocessor_host.clone()); + let check_leader_scheduler = bg_worker.start("check-leader", check_leader_runner); + + let mut lock_mgr = LockManager::new(&cfg.pessimistic_txn); + let quota_limiter = Arc::new(QuotaLimiter::new( + cfg.quota.foreground_cpu_time, + cfg.quota.foreground_write_bandwidth, + cfg.quota.foreground_read_bandwidth, + cfg.quota.background_cpu_time, + cfg.quota.background_write_bandwidth, + cfg.quota.background_read_bandwidth, + cfg.quota.max_delay_duration, + cfg.quota.enable_auto_tune, + )); + + let casual_ts_provider = self.get_causal_ts_provider(node_id); + let store = Storage::<_, _, F>::from_engine( + raft_kv_v2.clone(), + &cfg.storage, + storage_read_pool.handle(), + lock_mgr.clone(), + concurrency_manager.clone(), + lock_mgr.get_storage_dynamic_configs(), + Arc::new(FlowController::Singleton(EngineFlowController::empty())), + DummyReporter, + res_tag_factory.clone(), + quota_limiter.clone(), + 
self.pd_client.feature_gate().clone(), + casual_ts_provider.clone(), + resource_manager + .as_ref() + .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), + )?; + self.storages.insert(node_id, raft_kv_v2.clone()); + + ReplicaReadLockChecker::new(concurrency_manager.clone()).register(&mut coprocessor_host); + + // Create import service. + let importer = { + let dir = Path::new(raft_engine.get_engine_path()).join("../import-sst"); + Arc::new( + SstImporter::new( + &cfg.import, + dir, + key_manager.clone(), + cfg.storage.api_version(), + ) + .unwrap(), + ) + }; + let import_service = ImportSstService::new( + cfg.import.clone(), + cfg.raft_store.raft_entry_max_size, + raft_kv_v2, + LocalTablets::Registry(tablet_registry.clone()), + Arc::clone(&importer), + ); + + // Create deadlock service. + let deadlock_service = lock_mgr.deadlock_service(); + + // Create pd client, snapshot manager, server. + let (resolver, state) = resolve::new_resolver( + Arc::clone(&self.pd_client), + &bg_worker, + store.get_engine().raft_extension(), + ); + let security_mgr = Arc::new(SecurityManager::new(&cfg.security).unwrap()); + let cop_read_pool = ReadPool::from(coprocessor::readpool_impl::build_read_pool_for_test( + &tikv::config::CoprReadPoolConfig::default_for_test(), + store.get_engine(), + )); + let copr = coprocessor::Endpoint::new( + &server_cfg.value().clone(), + cop_read_pool.handle(), + concurrency_manager.clone(), + res_tag_factory, + quota_limiter, + ); + let copr_v2 = coprocessor_v2::Endpoint::new(&cfg.coprocessor_v2); + let mut server = None; + + // Create Debug service. 
+ let debug_thread_pool = Arc::new( + TokioBuilder::new_multi_thread() + .thread_name(thd_name!("debugger")) + .worker_threads(1) + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) + .build() + .unwrap(), + ); + let debug_thread_handle = debug_thread_pool.handle().clone(); + let diag_service = DiagnosticsService::new( + debug_thread_handle, + cfg.log.file.filename.clone(), + cfg.slow_log_file.clone(), + ); + + let health_service = HealthService::default(); + + for _ in 0..100 { + let mut svr = Server::new( + node_id, + &server_cfg, + &security_mgr, + store.clone(), + copr.clone(), + copr_v2.clone(), + resolver.clone(), + Either::Right(snap_mgr.clone()), + gc_worker.clone(), + check_leader_scheduler.clone(), + self.env.clone(), + None, + debug_thread_pool.clone(), + health_service.clone(), + resource_manager.clone(), + ) + .unwrap(); + svr.register_service(create_diagnostics(diag_service.clone())); + svr.register_service(create_deadlock(deadlock_service.clone())); + svr.register_service(create_import_sst(import_service.clone())); + if let Some(svcs) = self.pending_services.get(&node_id) { + for fact in svcs { + svr.register_service(fact()); + } + } + match svr.build_and_bind() { + Ok(_) => { + server = Some(svr); + break; + } + Err(Error::Grpc(GrpcError::BindFail(ref addr, ref port))) => { + // Servers may meet the error, when we restart them. + debug!("fail to create a server: bind fail {:?}", (addr, port)); + thread::sleep(Duration::from_millis(100)); + continue; + } + Err(ref e) => panic!("fail to create a server: {:?}", e), + } + } + let mut server = server.unwrap(); + let addr = server.listening_addr(); + assert_eq!(addr.clone().to_string(), node.store().address); + cfg.server.addr = format!("{}", addr); + let trans = server.transport(); + let simulate_trans = SimulateTransport::new(trans); + let server_cfg = Arc::new(VersionTrack::new(cfg.server.clone())); + + // Register the role change observer of the lock manager. 
+ lock_mgr.register_detector_role_change_observer(&mut coprocessor_host); + + let pessimistic_txn_cfg = cfg.tikv.pessimistic_txn; + node.start( + raft_engine, + tablet_registry.clone(), + &raft_router, + simulate_trans.clone(), + snap_mgr.clone(), + concurrency_manager.clone(), + casual_ts_provider, + coprocessor_host, + AutoSplitController::default(), + collector_reg_handle, + bg_worker, + pd_worker, + Arc::new(VersionTrack::new(raft_store)), + &state, + importer, + key_manager, + )?; + assert!(node_id == 0 || node_id == node.id()); + let node_id = node.id(); + self.snap_mgrs.insert(node_id, snap_mgr); + if let Some(tmp) = snap_mgs_path { + self.snap_paths.insert(node_id, tmp); + } + self.region_info_accessors + .insert(node_id, region_info_accessor); + // todo: importer + self.health_services.insert(node_id, health_service); + + lock_mgr + .start( + node.id(), + Arc::clone(&self.pd_client), + resolver, + Arc::clone(&security_mgr), + &pessimistic_txn_cfg, + ) + .unwrap(); + + server + .start(server_cfg, security_mgr, tablet_registry) + .unwrap(); + + self.metas.insert( + node_id, + ServerMeta { + raw_router: raft_router.store_router().clone(), + node, + server, + sim_router, + gc_worker, + sim_trans: simulate_trans, + rts_worker, + rsmeter_cleanup, + }, + ); + self.addrs.insert(node_id, format!("{}", addr)); + self.concurrency_managers + .insert(node_id, concurrency_manager); + + let client = RaftClient::new(node_id, self.conn_builder.clone()); + self.raft_clients.insert(node_id, client); + Ok(node_id) + } + + pub fn get_gc_worker(&self, node_id: u64) -> &GcWorker> { + &self.metas.get(&node_id).unwrap().gc_worker + } + + pub fn get_causal_ts_provider(&self, node_id: u64) -> Option> { + self.causal_ts_providers.get(&node_id).cloned() + } + + fn init_resource_metering( + &self, + cfg: &resource_metering::Config, + ) -> (ResourceTagFactory, CollectorRegHandle, Box) { + let (_, collector_reg_handle, resource_tag_factory, recorder_worker) = + 
resource_metering::init_recorder(cfg.precision.as_millis()); + let (_, data_sink_reg_handle, reporter_worker) = + resource_metering::init_reporter(cfg.clone(), collector_reg_handle.clone()); + let (_, single_target_worker) = resource_metering::init_single_target( + cfg.receiver_address.clone(), + Arc::new(Environment::new(2)), + data_sink_reg_handle, + ); + + ( + resource_tag_factory, + collector_reg_handle, + Box::new(move || { + single_target_worker.stop_worker(); + reporter_worker.stop_worker(); + recorder_worker.stop_worker(); + }), + ) + } + + pub fn get_concurrency_manager(&self, node_id: u64) -> ConcurrencyManager { + self.concurrency_managers.get(&node_id).unwrap().clone() + } +} + +impl Simulator for ServerCluster { + fn get_node_ids(&self) -> HashSet { + self.metas.keys().cloned().collect() + } + + fn add_send_filter(&mut self, node_id: u64, filter: Box) { + self.metas + .get_mut(&node_id) + .unwrap() + .sim_trans + .add_filter(filter); + } + + fn clear_send_filters(&mut self, node_id: u64) { + self.metas + .get_mut(&node_id) + .unwrap() + .sim_trans + .clear_filters(); + } + + fn add_recv_filter(&mut self, node_id: u64, filter: Box) { + self.metas + .get_mut(&node_id) + .unwrap() + .sim_router + .add_filter(filter); + } + + fn clear_recv_filters(&mut self, node_id: u64) { + self.metas + .get_mut(&node_id) + .unwrap() + .sim_router + .clear_filters(); + } + + fn run_node( + &mut self, + node_id: u64, + cfg: Config, + store_meta: Arc>>, + key_manager: Option>, + raft_engine: RaftTestEngine, + tablet_registry: TabletRegistry, + resource_manager: &Option>, + ) -> ServerResult { + dispatch_api_version!( + cfg.storage.api_version(), + self.run_node_impl::( + node_id, + cfg, + store_meta, + key_manager, + raft_engine, + tablet_registry, + resource_manager + ) + ) + } + + fn stop_node(&mut self, node_id: u64) { + if let Some(mut meta) = self.metas.remove(&node_id) { + meta.server.stop().unwrap(); + meta.node.stop(); + // resolved ts worker started, let's stop it 
+ if let Some(worker) = meta.rts_worker { + worker.stop_worker(); + } + (meta.rsmeter_cleanup)(); + } + self.storages.remove(&node_id); + let _ = self.raft_clients.remove(&node_id); + } + + fn async_snapshot( + &mut self, + request: kvproto::raft_cmdpb::RaftCmdRequest, + ) -> impl Future, RaftCmdResponse>> + Send + { + let node_id = request.get_header().get_peer().get_store_id(); + let mut router = match self.metas.get(&node_id) { + None => { + let mut resp = RaftCmdResponse::default(); + let e: RaftError = box_err!("missing sender for store {}", node_id); + resp.mut_header().set_error(e.into()); + // return async move {Err(resp)}; + unreachable!() + } + Some(meta) => meta.sim_router.clone(), + }; + + router.snapshot(request) + } + + fn async_peer_msg_on_node( + &self, + node_id: u64, + region_id: u64, + msg: raftstore_v2::router::PeerMsg, + ) -> raftstore::Result<()> { + let router = match self.metas.get(&node_id) { + None => return Err(box_err!("missing sender for store {}", node_id)), + Some(meta) => meta.sim_router.clone(), + }; + + router.send_peer_msg(region_id, msg) + } + + fn send_raft_msg(&mut self, msg: RaftMessage) -> raftstore::Result<()> { + let from_store = msg.get_from_peer().store_id; + assert_ne!(from_store, 0); + if let Some(client) = self.raft_clients.get_mut(&from_store) { + client.send(msg).unwrap(); + client.flush(); + } + Ok(()) + } + + fn get_router(&self, node_id: u64) -> Option> { + self.metas.get(&node_id).map(|m| m.raw_router.clone()) + } + + fn get_snap_dir(&self, node_id: u64) -> String { + self.snap_mgrs[&node_id] + .root_path() + .to_str() + .unwrap() + .to_owned() + } + + fn get_snap_mgr(&self, node_id: u64) -> &TabletSnapManager { + self.snap_mgrs.get(&node_id).unwrap() + } +} + +impl Cluster, EK> { + pub fn must_get_snapshot_of_region(&mut self, region_id: u64) -> RegionSnapshot { + let mut try_snapshot = || -> Option> { + let leader = self.leader_of_region(region_id)?; + let store_id = leader.store_id; + let epoch = 
self.get_region_epoch(region_id); + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + ctx.set_peer(leader); + ctx.set_region_epoch(epoch); + + let mut storage = self.sim.rl().storages.get(&store_id).unwrap().clone(); + let snap_ctx = SnapContext { + pb_ctx: &ctx, + ..Default::default() + }; + storage.snapshot(snap_ctx).ok() + }; + for _ in 0..10 { + if let Some(snapshot) = try_snapshot() { + return snapshot; + } + thread::sleep(Duration::from_millis(200)); + } + panic!("failed to get snapshot of region {}", region_id); + } + + pub fn get_addr(&self, node_id: u64) -> String { + self.sim.rl().get_addr(node_id) + } + + pub fn get_security_mgr(&self) -> Arc { + self.sim.rl().security_mgr.clone() + } +} + +pub fn new_server_cluster( + id: u64, + count: usize, +) -> Cluster, RocksEngine> { + let pd_client = Arc::new(TestPdClient::new(id, false)); + let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client)))); + Cluster::new( + id, + count, + sim, + pd_client, + ApiVersion::V1, + Box::new(crate::create_test_engine), + ) +} + +pub fn new_incompatible_server_cluster( + id: u64, + count: usize, +) -> Cluster, RocksEngine> { + let pd_client = Arc::new(TestPdClient::new(id, true)); + let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client)))); + Cluster::new( + id, + count, + sim, + pd_client, + ApiVersion::V1, + Box::new(crate::create_test_engine), + ) +} + +pub fn new_server_cluster_with_api_ver( + id: u64, + count: usize, + api_ver: ApiVersion, +) -> Cluster, RocksEngine> { + let pd_client = Arc::new(TestPdClient::new(id, false)); + let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client)))); + Cluster::new( + id, + count, + sim, + pd_client, + api_ver, + Box::new(crate::create_test_engine), + ) +} + +pub fn must_new_cluster_and_kv_client() -> ( + Cluster, RocksEngine>, + TikvClient, + Context, +) { + must_new_cluster_and_kv_client_mul(1) +} + +pub fn must_new_cluster_and_kv_client_mul( + count: usize, +) -> ( 
+ Cluster, RocksEngine>, + TikvClient, + Context, +) { + let (cluster, leader, ctx) = must_new_cluster_mul(count); + + let env = Arc::new(Environment::new(1)); + let channel = + ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader.get_store_id())); + let client = TikvClient::new(channel); + + (cluster, client, ctx) +} +pub fn must_new_cluster_mul( + count: usize, +) -> ( + Cluster, RocksEngine>, + metapb::Peer, + Context, +) { + must_new_and_configure_cluster_mul(count, |_| ()) +} + +fn must_new_and_configure_cluster_mul( + count: usize, + mut configure: impl FnMut(&mut Cluster, RocksEngine>), +) -> ( + Cluster, RocksEngine>, + metapb::Peer, + Context, +) { + let mut cluster = new_server_cluster(0, count); + configure(&mut cluster); + cluster.run(); + let region_id = 1; + let leader = cluster.leader_of_region(region_id).unwrap(); + let epoch = cluster.get_region_epoch(region_id); + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + ctx.set_peer(leader.clone()); + ctx.set_region_epoch(epoch); + + (cluster, leader, ctx) +} + +pub fn must_new_and_configure_cluster_and_kv_client( + configure: impl FnMut(&mut Cluster, RocksEngine>), +) -> ( + Cluster, RocksEngine>, + TikvClient, + Context, +) { + let (cluster, leader, ctx) = must_new_and_configure_cluster(configure); + + let env = Arc::new(Environment::new(1)); + let channel = + ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader.get_store_id())); + let client = TikvClient::new(channel); + + (cluster, client, ctx) +} + +pub fn must_new_and_configure_cluster( + configure: impl FnMut(&mut Cluster, RocksEngine>), +) -> ( + Cluster, RocksEngine>, + metapb::Peer, + Context, +) { + must_new_and_configure_cluster_mul(1, configure) +} + +pub fn must_new_cluster_and_debug_client() -> ( + Cluster, RocksEngine>, + DebugClient, + u64, +) { + let (cluster, leader, _) = must_new_cluster_mul(1); + + let env = Arc::new(Environment::new(1)); + let channel = + 
ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader.get_store_id())); + let client = DebugClient::new(channel); + + (cluster, client, leader.get_store_id()) +} diff --git a/components/test_raftstore-v2/src/transport_simulate.rs b/components/test_raftstore-v2/src/transport_simulate.rs new file mode 100644 index 00000000000..7b9333aae83 --- /dev/null +++ b/components/test_raftstore-v2/src/transport_simulate.rs @@ -0,0 +1,148 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::{Arc, RwLock}; + +use engine_traits::{KvEngine, RaftEngine}; +use futures::Future; +use kvproto::{ + raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, + raft_serverpb::RaftMessage, +}; +use raft::SnapshotStatus; +use raftstore::{ + router::handle_send_error, + store::{RegionSnapshot, Transport}, + Result, Result as RaftStoreResult, +}; +use raftstore_v2::router::{PeerMsg, RaftRouter}; +use test_raftstore::{filter_send, Filter}; +use tikv_util::HandyRwLock; + +#[derive(Clone)] +pub struct SimulateTransport { + filters: Arc>>>, + ch: C, +} + +impl SimulateTransport { + pub fn new(ch: C) -> SimulateTransport { + Self { + filters: Arc::new(RwLock::new(vec![])), + ch, + } + } + + pub fn clear_filters(&mut self) { + self.filters.wl().clear(); + } + + pub fn add_filter(&mut self, filter: Box) { + self.filters.wl().push(filter); + } + + pub fn filters(&self) -> &Arc>>> { + &self.filters + } +} + +impl Transport for SimulateTransport { + fn send(&mut self, m: RaftMessage) -> Result<()> { + let ch = &mut self.ch; + filter_send(&self.filters, m, |m| ch.send(m)) + } + + fn set_store_allowlist(&mut self, allowlist: Vec) { + self.ch.set_store_allowlist(allowlist); + } + + fn need_flush(&self) -> bool { + self.ch.need_flush() + } + + fn flush(&mut self) { + self.ch.flush(); + } +} + +pub trait SnapshotRouter { + fn snapshot( + &mut self, + req: RaftCmdRequest, + ) -> impl Future, RaftCmdResponse>> + Send; +} + +impl SnapshotRouter for RaftRouter { + fn snapshot( 
+ &mut self, + req: RaftCmdRequest, + ) -> impl Future, RaftCmdResponse>> + Send + { + self.snapshot(req) + } +} + +impl> SnapshotRouter for SimulateTransport { + fn snapshot( + &mut self, + req: RaftCmdRequest, + ) -> impl Future, RaftCmdResponse>> + Send + { + self.ch.snapshot(req) + } +} + +pub trait RaftStoreRouter { + fn send_peer_msg(&self, region_id: u64, msg: PeerMsg) -> Result<()>; + + fn send_raft_msg(&self, msg: RaftMessage) -> RaftStoreResult<()>; + + /// Reports the sending snapshot status to the peer of the Region. + fn report_snapshot_status( + &self, + region_id: u64, + to_peer_id: u64, + status: SnapshotStatus, + ) -> RaftStoreResult<()>; +} + +impl RaftStoreRouter for RaftRouter { + fn send_peer_msg(&self, region_id: u64, msg: PeerMsg) -> RaftStoreResult<()> { + self.send(region_id, msg) + .map_err(|e| handle_send_error(region_id, e)) + } + + fn send_raft_msg(&self, msg: RaftMessage) -> RaftStoreResult<()> { + let region_id = msg.get_region_id(); + self.send_raft_message(Box::new(msg)) + .map_err(|e| handle_send_error(region_id, e)) + } + + fn report_snapshot_status( + &self, + region_id: u64, + to_peer_id: u64, + status: SnapshotStatus, + ) -> RaftStoreResult<()> { + self.send_peer_msg(region_id, PeerMsg::SnapshotSent { to_peer_id, status }) + } +} + +impl RaftStoreRouter for SimulateTransport { + fn send_peer_msg(&self, region_id: u64, msg: PeerMsg) -> RaftStoreResult<()> { + self.ch.send_peer_msg(region_id, msg) + } + + fn send_raft_msg(&self, msg: RaftMessage) -> RaftStoreResult<()> { + filter_send(&self.filters, msg, |m| self.ch.send_raft_msg(m)) + } + + fn report_snapshot_status( + &self, + region_id: u64, + to_peer_id: u64, + status: SnapshotStatus, + ) -> RaftStoreResult<()> { + self.ch + .report_snapshot_status(region_id, to_peer_id, status) + } +} diff --git a/components/test_raftstore-v2/src/util.rs b/components/test_raftstore-v2/src/util.rs new file mode 100644 index 00000000000..b9e6464c5d8 --- /dev/null +++ 
b/components/test_raftstore-v2/src/util.rs @@ -0,0 +1,235 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{fmt::Write, sync::Arc, thread, time::Duration}; + +use encryption_export::{data_key_manager_from_config, DataKeyManager}; +use engine_rocks::{RocksEngine, RocksStatistics}; +use engine_test::raft::RaftTestEngine; +use engine_traits::{KvEngine, TabletRegistry, CF_DEFAULT}; +use file_system::IoRateLimiter; +use futures::Future; +use kvproto::{kvrpcpb::Context, metapb, raft_cmdpb::RaftCmdResponse}; +use raftstore::Result; +use rand::RngCore; +use server::common::ConfiguredRaftEngine; +use tempfile::TempDir; +use test_raftstore::{new_get_cmd, new_put_cf_cmd, new_request, Config}; +use tikv::{ + server::KvEngineFactoryBuilder, + storage::{ + config::EngineType, + kv::{SnapContext, SnapshotExt}, + Engine, Snapshot, + }, +}; +use tikv_util::{config::ReadableDuration, worker::LazyWorker, HandyRwLock}; + +use crate::{bootstrap_store, cluster::Cluster, ServerCluster, Simulator}; + +pub fn create_test_engine( + // TODO: pass it in for all cases. 
+ id: Option<(u64, u64)>, + limiter: Option>, + cfg: &Config, +) -> ( + TabletRegistry, + RaftTestEngine, + Option>, + TempDir, + LazyWorker, + Arc, + Option>, +) { + let dir = test_util::temp_dir("test_cluster", cfg.prefer_mem); + let mut cfg = cfg.clone(); + cfg.storage.data_dir = dir.path().to_str().unwrap().to_string(); + cfg.raft_store.raftdb_path = cfg.infer_raft_db_path(None).unwrap(); + cfg.raft_engine.mut_config().dir = cfg.infer_raft_engine_path(None).unwrap(); + let key_manager = + data_key_manager_from_config(&cfg.security.encryption, dir.path().to_str().unwrap()) + .unwrap() + .map(Arc::new); + let cache = cfg + .storage + .block_cache + .build_shared_cache(EngineType::RaftKv2); + let env = cfg + .build_shared_rocks_env(key_manager.clone(), limiter) + .unwrap(); + + let sst_worker = LazyWorker::new("sst-recovery"); + let scheduler = sst_worker.scheduler(); + + let (raft_engine, raft_statistics) = RaftTestEngine::build(&cfg, &env, &key_manager, &cache); + + if let Some((cluster_id, store_id)) = id { + assert_ne!(store_id, 0); + bootstrap_store(&raft_engine, cluster_id, store_id).unwrap(); + } + + let builder = + KvEngineFactoryBuilder::new(env, &cfg.tikv, cache).sst_recovery_sender(Some(scheduler)); + + let factory = Box::new(builder.build()); + let rocks_statistics = factory.rocks_statistics(); + let reg = TabletRegistry::new(factory, dir.path().join("tablet")).unwrap(); + + ( + reg, + raft_engine, + key_manager, + dir, + sst_worker, + rocks_statistics, + raft_statistics, + ) +} + +/// Keep putting random kvs until specified size limit is reached. 
+pub fn put_till_size, EK: KvEngine>( + cluster: &mut Cluster, + limit: u64, + range: &mut dyn Iterator, +) -> Vec { + put_cf_till_size(cluster, CF_DEFAULT, limit, range) +} + +pub fn put_cf_till_size, EK: KvEngine>( + cluster: &mut Cluster, + cf: &'static str, + limit: u64, + range: &mut dyn Iterator, +) -> Vec { + assert!(limit > 0); + let mut len = 0; + let mut rng = rand::thread_rng(); + let mut key = String::new(); + let mut value = vec![0; 64]; + while len < limit { + let batch_size = std::cmp::min(1024, limit - len); + let mut reqs = vec![]; + for _ in 0..batch_size / 74 + 1 { + key.clear(); + let key_id = range.next().unwrap(); + write!(key, "{:09}", key_id).unwrap(); + rng.fill_bytes(&mut value); + // plus 1 for the extra encoding prefix + len += key.len() as u64 + 1; + len += value.len() as u64; + reqs.push(new_put_cf_cmd(cf, key.as_bytes(), &value)); + } + cluster.batch_put(key.as_bytes(), reqs).unwrap(); + // Approximate size of memtable is inaccurate for small data, + // we flush it to SST so we can use the size properties instead. + cluster.must_flush_cf(cf, true); + } + key.into_bytes() +} + +pub fn configure_for_snapshot(config: &mut Config) { + // Truncate the log quickly so that we can force sending snapshot. 
+ config.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(20); + config.raft_store.raft_log_gc_count_limit = Some(2); + config.raft_store.merge_max_log_gap = 1; + config.raft_store.snap_mgr_gc_tick_interval = ReadableDuration::millis(50); +} + +pub fn configure_for_lease_read_v2, EK: KvEngine>( + cluster: &mut Cluster, + base_tick_ms: Option, + election_ticks: Option, +) -> Duration { + if let Some(base_tick_ms) = base_tick_ms { + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(base_tick_ms); + } + let base_tick_interval = cluster.cfg.raft_store.raft_base_tick_interval.0; + if let Some(election_ticks) = election_ticks { + cluster.cfg.raft_store.raft_election_timeout_ticks = election_ticks; + } + let election_ticks = cluster.cfg.raft_store.raft_election_timeout_ticks as u32; + let election_timeout = base_tick_interval * election_ticks; + // Adjust max leader lease. + cluster.cfg.raft_store.raft_store_max_leader_lease = + ReadableDuration(election_timeout - base_tick_interval); + // Use large peer check interval, abnormal and max leader missing duration to + // make a valid config, that is election timeout x 2 < peer stale state + // check < abnormal < max leader missing duration. 
+ cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration(election_timeout * 3); + cluster.cfg.raft_store.abnormal_leader_missing_duration = + ReadableDuration(election_timeout * 4); + cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration(election_timeout * 5); + + election_timeout +} + +pub fn wait_for_synced( + cluster: &mut Cluster, RocksEngine>, + node_id: u64, + region_id: u64, +) { + let mut storage = cluster + .sim + .read() + .unwrap() + .storages + .get(&node_id) + .unwrap() + .clone(); + let leader = cluster.leader_of_region(region_id).unwrap(); + let epoch = cluster.get_region_epoch(region_id); + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + ctx.set_peer(leader); + ctx.set_region_epoch(epoch); + let snap_ctx = SnapContext { + pb_ctx: &ctx, + ..Default::default() + }; + let snapshot = storage.snapshot(snap_ctx).unwrap(); + let txn_ext = snapshot.txn_ext.clone().unwrap(); + for retry in 0..10 { + if txn_ext.is_max_ts_synced() { + break; + } + thread::sleep(Duration::from_millis(1 << retry)); + } + assert!(snapshot.ext().is_max_ts_synced()); +} + +// Issue a read request on the specified peer. 
+pub fn read_on_peer, EK: KvEngine>( + cluster: &mut Cluster, + peer: metapb::Peer, + region: metapb::Region, + key: &[u8], + read_quorum: bool, + timeout: Duration, +) -> Result { + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_get_cmd(key)], + read_quorum, + ); + request.mut_header().set_peer(peer); + cluster.read(None, request, timeout) +} + +pub fn async_read_on_peer, EK: KvEngine>( + cluster: &mut Cluster, + peer: metapb::Peer, + region: metapb::Region, + key: &[u8], + read_quorum: bool, + replica_read: bool, +) -> impl Future> { + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_get_cmd(key)], + read_quorum, + ); + request.mut_header().set_peer(peer); + request.mut_header().set_replica_read(replica_read); + cluster.sim.wl().async_read(request) +} diff --git a/components/test_raftstore/Cargo.toml b/components/test_raftstore/Cargo.toml index cd9df2e3c05..1b87aeac11b 100644 --- a/components/test_raftstore/Cargo.toml +++ b/components/test_raftstore/Cargo.toml @@ -23,42 +23,44 @@ test-engines-panic = [ ] [dependencies] -api_version = { path = "../api_version" } +api_version = { workspace = true } backtrace = "0.3" -causal_ts = { path = "../causal_ts" } -collections = { path = "../collections" } -concurrency_manager = { path = "../concurrency_manager", default-features = false } +causal_ts = { workspace = true, features = ["testexport"] } +collections = { workspace = true } +concurrency_manager = { workspace = true } crossbeam = "0.8" -encryption_export = { path = "../encryption/export", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_rocks_helper = { path = "../engine_rocks_helper" } -engine_test = { path = "../engine_test", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } +encryption_export = { workspace = true } +engine_rocks = { workspace = true } 
+engine_rocks_helper = { workspace = true } +engine_test = { workspace = true } +engine_traits = { workspace = true } fail = "0.5" -file_system = { path = "../file_system" } +file_system = { workspace = true } futures = "0.3" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } -keys = { path = "../keys", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +keys = { workspace = true } +kvproto = { workspace = true } lazy_static = "1.3" -log_wrappers = { path = "../log_wrappers" } -pd_client = { path = "../pd_client", default-features = false } +log_wrappers = { workspace = true } +pd_client = { workspace = true } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raftstore = { path = "../raftstore", default-features = false, features = ["testexport"] } +raftstore = { workspace = true, features = ["testexport"] } rand = "0.8" -resolved_ts = { path = "../resolved_ts" } -resource_metering = { path = "../resource_metering" } -security = { path = "../security", default-features = false } -server = { path = "../server" } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +resolved_ts = { workspace = true } +resource_control = { workspace = true } +resource_metering = { workspace = true } +security = { workspace = true } +server = { workspace = true } +slog = { workspace = true } # better to not use slog-global, but pass in the logger -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog-global = { workspace = true } tempfile = "3.0" -test_util = { path = "../test_util", default-features = false } -tikv = { path = "../../", default-features = 
false } -tikv_util = { path = "../tikv_util", default-features = false } +test_pd_client = { workspace = true } +test_util = { workspace = true } +tikv = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } -tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } -txn_types = { path = "../txn_types", default-features = false } +tokio-timer = { workspace = true } +txn_types = { workspace = true } diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 63c7e3023c3..988625d3750 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -12,17 +12,17 @@ use std::{ use collections::{HashMap, HashSet}; use crossbeam::channel::TrySendError; use encryption_export::DataKeyManager; -use engine_rocks::{raw::DB, Compat, RocksEngine, RocksSnapshot}; +use engine_rocks::{RocksEngine, RocksSnapshot, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ - CompactExt, Engines, Iterable, MiscExt, Mutable, Peekable, RaftEngineReadOnly, WriteBatch, - WriteBatchExt, CF_DEFAULT, CF_RAFT, + CompactExt, Engines, Iterable, MiscExt, Mutable, Peekable, RaftEngineReadOnly, SyncMutable, + WriteBatch, WriteBatchExt, CF_DEFAULT, CF_RAFT, }; -use file_system::IORateLimiter; -use futures::executor::block_on; +use file_system::IoRateLimiter; +use futures::{self, channel::oneshot, executor::block_on}; use kvproto::{ errorpb::Error as PbError, - kvrpcpb::{ApiVersion, Context}, + kvrpcpb::{ApiVersion, Context, DiskFullOpt}, metapb::{self, Buckets, PeerRole, RegionEpoch, StoreLabel}, pdpb::{self, CheckPolicy, StoreReport}, raft_cmdpb::*, @@ -34,6 +34,7 @@ use kvproto::{ use pd_client::{BucketStat, PdClient}; use raft::eraftpb::ConfChangeType; use raftstore::{ + router::RaftStoreRouter, store::{ fsm::{ create_raft_batch_system, @@ -45,7 +46,9 @@ use raftstore::{ }, Error, Result, }; +use 
resource_control::ResourceGroupManager; use tempfile::TempDir; +use test_pd_client::TestPdClient; use tikv::server::Result as ServerResult; use tikv_util::{ thread_group::GroupProperties, @@ -53,6 +56,7 @@ use tikv_util::{ worker::LazyWorker, HandyRwLock, }; +use txn_types::WriteBatchFlags; use super::*; use crate::Config; @@ -77,6 +81,7 @@ pub trait Simulator { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, + resource_manager: &Option>, ) -> ServerResult; fn stop_node(&mut self, node_id: u64); fn get_node_ids(&self) -> HashSet; @@ -110,7 +115,7 @@ pub trait Simulator { } fn read( - &self, + &mut self, batch_id: Option, request: RaftCmdRequest, timeout: Duration, @@ -123,7 +128,7 @@ pub trait Simulator { } fn async_read( - &self, + &mut self, node_id: u64, batch_id: Option, request: RaftCmdRequest, @@ -160,15 +165,18 @@ pub struct Cluster { pub dbs: Vec>, pub store_metas: HashMap>>, key_managers: Vec>>, - pub io_rate_limiter: Option>, + pub io_rate_limiter: Option>, pub engines: HashMap>, key_managers_map: HashMap>>, pub labels: HashMap>, group_props: HashMap, pub sst_workers: Vec>, pub sst_workers_map: HashMap, + pub kv_statistics: Vec>, + pub raft_statistics: Vec>>, pub sim: Arc>, pub pd_client: Arc, + resource_manager: Option>, } impl Cluster { @@ -180,7 +188,8 @@ impl Cluster { pd_client: Arc, api_version: ApiVersion, ) -> Cluster { - // TODO: In the future, maybe it's better to test both case where `use_delete_range` is true and false + // TODO: In the future, maybe it's better to test both case where + // `use_delete_range` is true and false Cluster { cfg: Config { tikv: new_tikv_config_with_api_ver(id, api_version), @@ -201,6 +210,9 @@ impl Cluster { pd_client, sst_workers: vec![], sst_workers_map: HashMap::default(), + resource_manager: Some(Arc::new(ResourceGroupManager::default())), + kv_statistics: vec![], + raft_statistics: vec![], } } @@ -221,11 +233,12 @@ impl Cluster { Ok(()) } - /// Engines in a just created cluster are not 
bootstraped, which means they are not associated - /// with a `node_id`. Call `Cluster::start` can bootstrap all nodes in the cluster. + /// Engines in a just created cluster are not bootstrapped, which means they + /// are not associated with a `node_id`. Call `Cluster::start` can bootstrap + /// all nodes in the cluster. /// - /// However sometimes a node can be bootstrapped externally. This function can be called to - /// mark them as bootstrapped in `Cluster`. + /// However sometimes a node can be bootstrapped externally. This function + /// can be called to mark them as bootstrapped in `Cluster`. pub fn set_bootstrapped(&mut self, node_id: u64, offset: usize) { let engines = self.dbs[offset].clone(); let key_mgr = self.key_managers[offset].clone(); @@ -235,12 +248,14 @@ impl Cluster { } fn create_engine(&mut self, router: Option>) { - let (engines, key_manager, dir, sst_worker) = + let (engines, key_manager, dir, sst_worker, kv_statistics, raft_statistics) = create_test_engine(router, self.io_rate_limiter.clone(), &self.cfg); self.dbs.push(engines); self.key_managers.push(key_manager); self.paths.push(dir); self.sst_workers.push(sst_worker); + self.kv_statistics.push(kv_statistics); + self.raft_statistics.push(raft_statistics); } pub fn create_engines(&mut self) { @@ -248,7 +263,7 @@ impl Cluster { self.cfg .storage .io_rate_limit - .build(true /*enable_statistics*/), + .build(true /* enable_statistics */), )); for _ in 0..self.count { self.create_engine(None); @@ -264,7 +279,8 @@ impl Cluster { // Try start new nodes. 
for _ in 0..self.count - self.engines.len() { - let (router, system) = create_raft_batch_system(&self.cfg.raft_store); + let (router, system) = + create_raft_batch_system(&self.cfg.raft_store, &self.resource_manager); self.create_engine(Some(router.clone())); let engines = self.dbs.last().unwrap().clone(); @@ -283,6 +299,7 @@ impl Cluster { key_mgr.clone(), router, system, + &self.resource_manager, )?; self.group_props.insert(node_id, props); self.engines.insert(node_id, engines); @@ -297,14 +314,15 @@ impl Cluster { pub fn compact_data(&self) { for engine in self.engines.values() { let db = &engine.kv; - db.compact_range(CF_DEFAULT, None, None, false, 1).unwrap(); + db.compact_range_cf(CF_DEFAULT, None, None, false, 1) + .unwrap(); } } pub fn flush_data(&self) { for engine in self.engines.values() { let db = &engine.kv; - db.flush_cf(CF_DEFAULT, true /*sync*/).unwrap(); + db.flush_cf(CF_DEFAULT, true /* sync */).unwrap(); } } @@ -333,7 +351,8 @@ impl Cluster { debug!("starting node {}", node_id); let engines = self.engines[&node_id].clone(); let key_mgr = self.key_managers_map[&node_id].clone(); - let (router, system) = create_raft_batch_system(&self.cfg.raft_store); + let (router, system) = + create_raft_batch_system(&self.cfg.raft_store, &self.resource_manager); let mut cfg = self.cfg.clone(); if let Some(labels) = self.labels.get(&node_id) { cfg.server.labels = labels.to_owned(); @@ -353,9 +372,16 @@ impl Cluster { tikv_util::thread_group::set_properties(Some(props)); debug!("calling run node"; "node_id" => node_id); // FIXME: rocksdb event listeners may not work, because we change the router. 
- self.sim - .wl() - .run_node(node_id, cfg, engines, store_meta, key_mgr, router, system)?; + self.sim.wl().run_node( + node_id, + cfg, + engines, + store_meta, + key_mgr, + router, + system, + &self.resource_manager, + )?; debug!("node {} started", node_id); Ok(()) } @@ -363,16 +389,21 @@ impl Cluster { pub fn stop_node(&mut self, node_id: u64) { debug!("stopping node {}", node_id); self.group_props[&node_id].mark_shutdown(); + // Simulate shutdown behavior of server shutdown. It's not enough to just set + // the map above as current thread may also query properties during shutdown. + let previous_prop = tikv_util::thread_group::current_properties(); + tikv_util::thread_group::set_properties(Some(self.group_props[&node_id].clone())); match self.sim.write() { Ok(mut sim) => sim.stop_node(node_id), Err(_) => safe_panic!("failed to acquire write lock."), } self.pd_client.shutdown_store(node_id); debug!("node {} stopped", node_id); + tikv_util::thread_group::set_properties(previous_prop); } - pub fn get_engine(&self, node_id: u64) -> Arc { - Arc::clone(self.engines[&node_id].kv.as_inner()) + pub fn get_engine(&self, node_id: u64) -> RocksEngine { + self.engines[&node_id].kv.clone() } pub fn get_raft_engine(&self, node_id: u64) -> RaftTestEngine { @@ -412,7 +443,7 @@ impl Cluster { request: RaftCmdRequest, timeout: Duration, ) -> Result { - match self.sim.rl().read(batch_id, request.clone(), timeout) { + match self.sim.wl().read(batch_id, request.clone(), timeout) { Err(e) => { warn!("failed to read {:?}: {:?}", request, e); Err(e) @@ -436,7 +467,7 @@ impl Cluster { } } let ret = if is_read { - self.sim.rl().read(None, request.clone(), timeout) + self.sim.wl().read(None, request.clone(), timeout) } else { self.sim.rl().call_command(request.clone(), timeout) }; @@ -605,9 +636,9 @@ impl Cluster { assert_eq!(self.pd_client.get_regions_number() as u32, len) } - // For test when a node is already bootstraped the cluster with the first region - // But another node may 
request bootstrap at same time and get is_bootstrap false - // Add Region but not set bootstrap to true + // For test when a node is already bootstrapped the cluster with the first + // region But another node may request bootstrap at same time and get + // is_bootstrap false Add Region but not set bootstrap to true pub fn add_first_region(&self) -> Result<()> { let mut region = metapb::Region::default(); let region_id = self.pd_client.alloc_id().unwrap(); @@ -736,14 +767,14 @@ impl Cluster { self.leaders.remove(®ion_id); } - pub fn assert_quorum) -> bool>(&self, mut condition: F) { + pub fn assert_quorum bool>(&self, mut condition: F) { if self.engines.is_empty() { return; } let half = self.engines.len() / 2; let mut qualified_cnt = 0; for (id, engines) in &self.engines { - if !condition(engines.kv.as_inner()) { + if !condition(&engines.kv) { debug!("store {} is not qualified yet.", id); continue; } @@ -1134,6 +1165,23 @@ impl Cluster { } } + pub fn wait_applied_index(&mut self, region_id: u64, store_id: u64, index: u64) { + let timer = Instant::now(); + loop { + let applied_index = self.apply_state(region_id, store_id).applied_index; + if applied_index >= index { + return; + } + if timer.saturating_elapsed() >= Duration::from_secs(5) { + panic!( + "[region {}] log is still not applied to {}: {} on store {}", + region_id, index, applied_index, store_id, + ); + } + thread::sleep(Duration::from_millis(10)); + } + } + pub fn wait_tombstone(&self, region_id: u64, peer: metapb::Peer, check_exist: bool) { let timer = Instant::now(); let mut state; @@ -1178,10 +1226,9 @@ impl Cluster { pub fn apply_state(&self, region_id: u64, store_id: u64) -> RaftApplyState { let key = keys::apply_state_key(region_id); self.get_engine(store_id) - .c() .get_msg_cf::(engine_traits::CF_RAFT, &key) .unwrap() - .unwrap() + .unwrap_or_default() } pub fn get_raft_local_state(&self, region_id: u64, store_id: u64) -> Option { @@ -1197,7 +1244,6 @@ impl Cluster { pub fn region_local_state(&self, 
region_id: u64, store_id: u64) -> RegionLocalState { self.get_engine(store_id) - .c() .get_msg_cf::( engine_traits::CF_RAFT, &keys::region_state_key(region_id), @@ -1210,7 +1256,6 @@ impl Cluster { for _ in 0..100 { let state = self .get_engine(store_id) - .c() .get_msg_cf::( engine_traits::CF_RAFT, &keys::region_state_key(region_id), @@ -1260,12 +1305,12 @@ impl Cluster { let mut kv_wb = self.engines[&store_id].kv.write_batch(); self.engines[&store_id] .kv - .scan_cf(CF_RAFT, &meta_start, &meta_end, false, |k, _| { + .scan(CF_RAFT, &meta_start, &meta_end, false, |k, _| { kv_wb.delete(k).unwrap(); Ok(true) }) .unwrap(); - snap.scan_cf(CF_RAFT, &meta_start, &meta_end, false, |k, v| { + snap.scan(CF_RAFT, &meta_start, &meta_end, false, |k, v| { kv_wb.put(k, v).unwrap(); Ok(true) }) @@ -1277,12 +1322,12 @@ impl Cluster { ); self.engines[&store_id] .kv - .scan_cf(CF_RAFT, &raft_start, &raft_end, false, |k, _| { + .scan(CF_RAFT, &raft_start, &raft_end, false, |k, _| { kv_wb.delete(k).unwrap(); Ok(true) }) .unwrap(); - snap.scan_cf(CF_RAFT, &raft_start, &raft_end, false, |k, v| { + snap.scan(CF_RAFT, &raft_start, &raft_end, false, |k, v| { kv_wb.put(k, v).unwrap(); Ok(true) }) @@ -1290,6 +1335,18 @@ impl Cluster { kv_wb.write().unwrap(); } + pub fn add_send_filter_on_node(&mut self, node_id: u64, filter: Box) { + self.sim.wl().add_send_filter(node_id, filter); + } + + pub fn clear_send_filter_on_node(&mut self, node_id: u64) { + self.sim.wl().clear_send_filters(node_id); + } + + pub fn add_recv_filter_on_node(&mut self, node_id: u64, filter: Box) { + self.sim.wl().add_recv_filter(node_id, filter); + } + pub fn add_send_filter(&self, factory: F) { let mut sim = self.sim.wl(); for node_id in sim.get_node_ids() { @@ -1299,6 +1356,10 @@ impl Cluster { } } + pub fn clear_recv_filter_on_node(&mut self, node_id: u64) { + self.sim.wl().clear_recv_filters(node_id); + } + pub fn transfer_leader(&mut self, region_id: u64, leader: metapb::Peer) { let epoch = 
self.get_region_epoch(region_id); let transfer_leader = new_admin_request(region_id, &epoch, new_transfer_leader_cmd(leader)); @@ -1335,6 +1396,13 @@ impl Cluster { } } + pub fn try_transfer_leader(&mut self, region_id: u64, leader: metapb::Peer) -> RaftCmdResponse { + let epoch = self.get_region_epoch(region_id); + let transfer_leader = new_admin_request(region_id, &epoch, new_transfer_leader_cmd(leader)); + self.call_command_on_leader(transfer_leader, Duration::from_secs(5)) + .unwrap() + } + pub fn get_snap_dir(&self, node_id: u64) -> String { self.sim.rl().get_snap_dir(node_id) } @@ -1350,8 +1418,8 @@ impl Cluster { } } - // It's similar to `ask_split`, the difference is the msg, it sends, is `Msg::SplitRegion`, - // and `region` will not be embedded to that msg. + // It's similar to `ask_split`, the difference is the msg, it sends, is + // `Msg::SplitRegion`, and `region` will not be embedded to that msg. // Caller must ensure that the `split_key` is in the `region`. pub fn split_region( &mut self, @@ -1412,6 +1480,82 @@ impl Cluster { .unwrap(); } + pub fn must_send_flashback_msg( + &mut self, + region_id: u64, + cmd_type: AdminCmdType, + cb: Callback, + ) { + let leader = self.leader_of_region(region_id).unwrap(); + let store_id = leader.get_store_id(); + let region_epoch = self.get_region_epoch(region_id); + let mut admin = AdminRequest::default(); + admin.set_cmd_type(cmd_type); + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header().set_region_epoch(region_epoch); + req.mut_header().set_peer(leader); + req.set_admin_request(admin); + req.mut_header() + .set_flags(WriteBatchFlags::FLASHBACK.bits()); + let router = self.sim.rl().get_router(store_id).unwrap(); + if let Err(e) = router.send_command( + req, + cb, + RaftCmdExtraOpts { + deadline: None, + disk_full_opt: DiskFullOpt::AllowedOnAlmostFull, + }, + ) { + panic!( + "router send flashback msg {:?} failed, error: {}", + cmd_type, e + ); + } + } + + pub 
fn must_send_wait_flashback_msg(&mut self, region_id: u64, cmd_type: AdminCmdType) { + self.wait_applied_to_current_term(region_id, Duration::from_secs(3)); + let (result_tx, result_rx) = oneshot::channel(); + self.must_send_flashback_msg( + region_id, + cmd_type, + Callback::write(Box::new(move |resp| { + if resp.response.get_header().has_error() { + result_tx + .send(Some(resp.response.get_header().get_error().clone())) + .unwrap(); + return; + } + result_tx.send(None).unwrap(); + })), + ); + if let Some(e) = block_on(result_rx).unwrap() { + panic!("call flashback msg {:?} failed, error: {:?}", cmd_type, e); + } + } + + pub fn wait_applied_to_current_term(&mut self, region_id: u64, timeout: Duration) { + let mut now = Instant::now(); + let deadline = now + timeout; + while now < deadline { + if let Some(leader) = self.leader_of_region(region_id) { + let raft_apply_state = self.apply_state(region_id, leader.get_store_id()); + let raft_local_state = self.raft_local_state(region_id, leader.get_store_id()); + // If term matches and apply to commit index, then it must apply to current + // term. 
+ if raft_apply_state.applied_index == raft_apply_state.commit_index + && raft_apply_state.commit_term == raft_local_state.get_hard_state().get_term() + { + return; + } + } + thread::sleep(Duration::from_millis(10)); + now = Instant::now(); + } + panic!("region {} is not applied to current term", region_id,); + } + pub fn must_split(&mut self, region: &metapb::Region, split_key: &[u8]) { let mut try_cnt = 0; let split_count = self.pd_client.get_split_count(); @@ -1694,6 +1838,10 @@ impl Cluster { ctx } + pub fn get_router(&self, node_id: u64) -> Option> { + self.sim.rl().get_router(node_id) + } + pub fn refresh_region_bucket_keys( &mut self, region: &metapb::Region, @@ -1761,6 +1909,8 @@ impl Cluster { region.get_id(), CasualMessage::HalfSplitRegion { region_epoch: region.get_region_epoch().clone(), + start_key: None, + end_key: None, policy: CheckPolicy::Scan, source: "test", cb, @@ -1769,6 +1919,25 @@ impl Cluster { .unwrap(); rx.recv_timeout(Duration::from_secs(5)).unwrap(); } + + pub fn scan( + &self, + store_id: u64, + cf: &str, + start_key: &[u8], + end_key: &[u8], + fill_cache: bool, + f: F, + ) -> engine_traits::Result<()> + where + F: FnMut(&[u8], &[u8]) -> engine_traits::Result, + { + self.engines[&store_id] + .kv + .scan(cf, start_key, end_key, fill_cache, f)?; + + Ok(()) + } } impl Drop for Cluster { @@ -1777,3 +1946,31 @@ impl Drop for Cluster { self.shutdown(); } } + +pub trait RawEngine: + Peekable + SyncMutable +{ + fn region_local_state(&self, region_id: u64) + -> engine_traits::Result>; + + fn raft_apply_state(&self, _region_id: u64) -> engine_traits::Result>; + + fn raft_local_state(&self, _region_id: u64) -> engine_traits::Result>; +} + +impl RawEngine for RocksEngine { + fn region_local_state( + &self, + region_id: u64, + ) -> engine_traits::Result> { + self.get_msg_cf(CF_RAFT, &keys::region_state_key(region_id)) + } + + fn raft_apply_state(&self, region_id: u64) -> engine_traits::Result> { + self.get_msg_cf(CF_RAFT, 
&keys::apply_state_key(region_id)) + } + + fn raft_local_state(&self, region_id: u64) -> engine_traits::Result> { + self.get_msg_cf(CF_RAFT, &keys::raft_state_key(region_id)) + } +} diff --git a/components/test_raftstore/src/common-test.toml b/components/test_raftstore/src/common-test.toml index 6b179081def..334291f7213 100644 --- a/components/test_raftstore/src/common-test.toml +++ b/components/test_raftstore/src/common-test.toml @@ -24,7 +24,8 @@ grpc-raft-conn-num = 1 # Disable stats concurrency. procinfo performs too bad without optimization, # disable it to save CPU for real tests. stats-concurrency = 0 -raft-client-backoff-step = "5ms" +raft-client-max-backoff = "100ms" +raft-client-initial-reconnect-backoff = "100ms" [server.labels] @@ -33,7 +34,6 @@ scheduler-concurrency = 10 scheduler-worker-pool-size = 1 [storage.block-cache] -shared = true capacity = "64MB" [pd] @@ -65,6 +65,7 @@ raft-store-max-leader-lease = "240ms" allow-remove-leader = true merge-check-tick-interval = "100ms" pd-heartbeat-tick-interval = "20ms" +max-entry-cache-warmup-duration = "0ms" dev-assert = true hibernate-regions = true store-io-pool-size = 0 diff --git a/components/test_raftstore/src/config.rs b/components/test_raftstore/src/config.rs index 15748773409..a86b8eb1bf0 100644 --- a/components/test_raftstore/src/config.rs +++ b/components/test_raftstore/src/config.rs @@ -2,25 +2,25 @@ use std::ops::{Deref, DerefMut}; -use tikv::config::TiKvConfig; +use tikv::config::TikvConfig; #[derive(Clone)] pub struct Config { - pub tikv: TiKvConfig, + pub tikv: TikvConfig, pub prefer_mem: bool, } impl Deref for Config { - type Target = TiKvConfig; + type Target = TikvConfig; #[inline] - fn deref(&self) -> &TiKvConfig { + fn deref(&self) -> &TikvConfig { &self.tikv } } impl DerefMut for Config { #[inline] - fn deref_mut(&mut self) -> &mut TiKvConfig { + fn deref_mut(&mut self) -> &mut TikvConfig { &mut self.tikv } } diff --git a/components/test_raftstore/src/lib.rs 
b/components/test_raftstore/src/lib.rs index 82695be12ba..950581a6ce8 100644 --- a/components/test_raftstore/src/lib.rs +++ b/components/test_raftstore/src/lib.rs @@ -8,13 +8,11 @@ extern crate tikv_util; mod cluster; mod config; mod node; -mod pd; mod router; mod server; mod transport_simulate; -mod util; +pub mod util; pub use crate::{ - cluster::*, config::Config, node::*, pd::*, router::*, server::*, transport_simulate::*, - util::*, + cluster::*, config::Config, node::*, router::*, server::*, transport_simulate::*, util::*, }; diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index 27cbd367ba7..75ab0064a17 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -17,6 +17,7 @@ use kvproto::{ raft_cmdpb::*, raft_serverpb::{self, RaftMessage}, }; +use protobuf::Message; use raft::{eraftpb::MessageType, SnapshotStatus}; use raftstore::{ coprocessor::{config::SplitCheckConfigManager, CoprocessorHost}, @@ -29,8 +30,10 @@ use raftstore::{ }, Result, }; +use resource_control::ResourceGroupManager; use resource_metering::CollectorRegHandle; use tempfile::TempDir; +use test_pd_client::TestPdClient; use tikv::{ config::{ConfigController, Module}, import::SstImporter, @@ -94,7 +97,10 @@ impl Transport for ChannelTransport { Some(p) => { p.0.register(key.clone(), SnapEntry::Receiving); let data = msg.get_message().get_snapshot().get_data(); - p.0.get_snapshot_for_receiving(&key, data).unwrap() + let mut snapshot_data = raft_serverpb::RaftSnapshotData::default(); + snapshot_data.merge_from_bytes(data).unwrap(); + p.0.get_snapshot_for_receiving(&key, snapshot_data.take_meta()) + .unwrap() } None => return Err(box_err!("missing temp dir for store {}", to_store)), }; @@ -187,8 +193,8 @@ impl NodeCluster { .unwrap() } - // Set a function that will be invoked after creating each CoprocessorHost. The first argument - // of `op` is the node_id. 
+ // Set a function that will be invoked after creating each CoprocessorHost. The + // first argument of `op` is the node_id. // Set this before invoking `run_node`. #[allow(clippy::type_complexity)] pub fn post_create_coprocessor_host( @@ -224,6 +230,7 @@ impl Simulator for NodeCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, + _resource_manager: &Option>, ) -> ServerResult { assert!(node_id == 0 || !self.nodes.contains_key(&node_id)); let pd_worker = LazyWorker::new("test-pd-worker"); @@ -232,8 +239,8 @@ impl Simulator for NodeCluster { let mut raft_store = cfg.raft_store.clone(); raft_store .validate( - cfg.coprocessor.region_split_size, - cfg.coprocessor.enable_region_bucket, + cfg.coprocessor.region_split_size(), + cfg.coprocessor.enable_region_bucket(), cfg.coprocessor.region_bucket_size, ) .unwrap(); @@ -247,6 +254,7 @@ impl Simulator for NodeCluster { Arc::default(), bg_worker.clone(), None, + None, ); let (snap_mgr, snap_mgr_path) = if node_id == 0 @@ -260,10 +268,12 @@ impl Simulator for NodeCluster { { let tmp = test_util::temp_dir("test_cluster", cfg.prefer_mem); let snap_mgr = SnapManagerBuilder::default() - .max_write_bytes_per_sec(cfg.server.snap_max_write_bytes_per_sec.0 as i64) + .max_write_bytes_per_sec(cfg.server.snap_io_max_bytes_per_sec.0 as i64) .max_total_size(cfg.server.snap_max_total_size.0) .encryption_key_manager(key_manager) .max_per_file_size(cfg.raft_store.max_snapshot_file_raw_size.0) + .enable_multi_snapshot_files(true) + .enable_receive_tablet_snapshot(cfg.raft_store.enable_v2_compatible_learner) .build(tmp.path().to_str().unwrap()); (snap_mgr, Some(tmp)) } else { @@ -290,7 +300,11 @@ impl Simulator for NodeCluster { Arc::new(SstImporter::new(&cfg.import, dir, None, cfg.storage.api_version()).unwrap()) }; - let local_reader = LocalReader::new(engines.kv.clone(), store_meta.clone(), router.clone()); + let local_reader = LocalReader::new( + engines.kv.clone(), + StoreMetaDelegate::new(store_meta.clone(), 
engines.kv.clone()), + router.clone(), + ); let cfg_controller = ConfigController::new(cfg.tikv.clone()); let split_check_runner = @@ -314,6 +328,7 @@ impl Simulator for NodeCluster { AutoSplitController::default(), cm, CollectorRegHandle::new_for_test(), + None, )?; assert!( engines @@ -333,8 +348,8 @@ impl Simulator for NodeCluster { .map(|p| p.path().to_str().unwrap().to_owned()) ); - let region_split_size = cfg.coprocessor.region_split_size; - let enable_region_bucket = cfg.coprocessor.enable_region_bucket; + let region_split_size = cfg.coprocessor.region_split_size(); + let enable_region_bucket = cfg.coprocessor.enable_region_bucket(); let region_bucket_size = cfg.coprocessor.region_bucket_size; let mut raftstore_cfg = cfg.tikv.raft_store; raftstore_cfg @@ -433,7 +448,7 @@ impl Simulator for NodeCluster { } fn async_read( - &self, + &mut self, node_id: u64, batch_id: Option, request: RaftCmdRequest, @@ -491,12 +506,16 @@ impl Simulator for NodeCluster { } } +// Compare to server cluster, node cluster does not have server layer and +// storage layer. 
pub fn new_node_cluster(id: u64, count: usize) -> Cluster { let pd_client = Arc::new(TestPdClient::new(id, false)); let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); Cluster::new(id, count, sim, pd_client, ApiVersion::V1) } +// This cluster does not support batch split, we expect it to transfer the +// `BatchSplit` request to `split` request pub fn new_incompatible_node_cluster(id: u64, count: usize) -> Cluster { let pd_client = Arc::new(TestPdClient::new(id, true)); let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index d156ab77adb..ec6cb0a235c 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -9,7 +9,7 @@ use std::{ }; use api_version::{dispatch_api_version, KvFormat}; -use causal_ts::CausalTsProvider; +use causal_ts::CausalTsProviderImpl; use collections::{HashMap, HashSet}; use concurrency_manager::ConcurrencyManager; use encryption_export::DataKeyManager; @@ -33,39 +33,47 @@ use pd_client::PdClient; use raftstore::{ coprocessor::{CoprocessorHost, RegionInfoAccessor}, errors::Error as RaftError, - router::{LocalReadRouter, RaftStoreBlackHole, RaftStoreRouter, ServerRaftStoreRouter}, + router::{CdcRaftRouter, LocalReadRouter, RaftStoreRouter, ServerRaftStoreRouter}, store::{ fsm::{store::StoreMeta, ApplyRouter, RaftBatchSystem, RaftRouter}, msg::RaftCmdExtraOpts, AutoSplitController, Callback, CheckLeaderRunner, LocalReader, RegionSnapshot, SnapManager, - SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, + SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, }, Result, }; +use resource_control::ResourceGroupManager; use resource_metering::{CollectorRegHandle, ResourceTagFactory}; use security::SecurityManager; use tempfile::TempDir; +use test_pd_client::TestPdClient; use tikv::{ config::ConfigController, coprocessor, coprocessor_v2, 
import::{ImportSstService, SstImporter}, read_pool::ReadPool, server::{ - create_raft_storage, gc_worker::GcWorker, load_statistics::ThreadLoadPool, lock_manager::LockManager, raftkv::ReplicaReadLockChecker, resolve::{self, StoreAddrResolver}, service::DebugService, + tablet_snap::NoSnapshotCache, ConnectionBuilder, Error, Node, PdStoreAddrResolver, RaftClient, RaftKv, Result as ServerResult, Server, ServerTransport, }, - storage::{self, kv::SnapContext, txn::flow_controller::FlowController, Engine}, + storage::{ + self, + kv::{FakeExtension, LocalTablets, SnapContext}, + txn::flow_controller::{EngineFlowController, FlowController}, + Engine, Storage, + }, }; use tikv_util::{ config::VersionTrack, quota_limiter::QuotaLimiter, + sys::thread::ThreadBuildWrapper, time::ThreadReadId, worker::{Builder as WorkerBuilder, LazyWorker}, HandyRwLock, @@ -77,10 +85,11 @@ use super::*; use crate::Config; type SimulateStoreTransport = SimulateTransport>; -type SimulateServerTransport = - SimulateTransport>; pub type SimulateEngine = RaftKv; +type SimulateRaftExtension = ::RaftExtension; +type SimulateServerTransport = + SimulateTransport>; #[derive(Default, Clone)] pub struct AddressMap { @@ -118,13 +127,13 @@ impl StoreAddrResolver for AddressMap { struct ServerMeta { node: Node, - server: Server, + server: Server, sim_router: SimulateStoreTransport, sim_trans: SimulateServerTransport, raw_router: RaftRouter, raw_apply_router: ApplyRouter, - gc_worker: GcWorker, SimulateStoreTransport>, - rts_worker: Option>>, + gc_worker: GcWorker>, + rts_worker: Option>, rsmeter_cleanup: Box, } @@ -145,10 +154,11 @@ pub struct ServerCluster { snap_paths: HashMap, snap_mgrs: HashMap, pd_client: Arc, - raft_client: RaftClient, + raft_clients: HashMap>, + conn_builder: ConnectionBuilder, concurrency_managers: HashMap, env: Arc, - pub causal_ts_providers: HashMap>, + pub causal_ts_providers: HashMap>, } impl ServerCluster { @@ -161,18 +171,18 @@ impl ServerCluster { ); let security_mgr = 
Arc::new(SecurityManager::new(&Default::default()).unwrap()); let map = AddressMap::default(); - // We don't actually need to handle snapshot message, just create a dead worker to make it compile. + // We don't actually need to handle snapshot message, just create a dead worker + // to make it compile. let worker = LazyWorker::new("snap-worker"); let conn_builder = ConnectionBuilder::new( env.clone(), Arc::default(), security_mgr.clone(), map.clone(), - RaftStoreBlackHole, + FakeExtension, worker.scheduler(), Arc::new(ThreadLoadPool::with_threshold(usize::MAX)), ); - let raft_client = RaftClient::new(conn_builder); ServerCluster { metas: HashMap::default(), addrs: map, @@ -186,7 +196,8 @@ impl ServerCluster { pending_services: HashMap::default(), coprocessor_hooks: HashMap::default(), health_services: HashMap::default(), - raft_client, + raft_clients: HashMap::default(), + conn_builder, concurrency_managers: HashMap::default(), env, txn_extra_schedulers: HashMap::default(), @@ -210,7 +221,7 @@ impl ServerCluster { pub fn get_gc_worker( &self, node_id: u64, - ) -> &GcWorker, SimulateStoreTransport> { + ) -> &GcWorker> { &self.metas.get(&node_id).unwrap().gc_worker } @@ -218,7 +229,7 @@ impl ServerCluster { self.concurrency_managers.get(&node_id).unwrap().clone() } - pub fn get_causal_ts_provider(&self, node_id: u64) -> Option> { + pub fn get_causal_ts_provider(&self, node_id: u64) -> Option> { self.causal_ts_providers.get(&node_id).cloned() } @@ -256,6 +267,7 @@ impl ServerCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, + resource_manager: &Option>, ) -> ServerResult { let (tmp_str, tmp) = if node_id == 0 || !self.snap_paths.contains_key(&node_id) { let p = test_util::temp_dir("test_cluster", cfg.prefer_mem); @@ -277,16 +289,24 @@ impl ServerCluster { } } - let local_reader = LocalReader::new(engines.kv.clone(), store_meta.clone(), router.clone()); - let raft_router = ServerRaftStoreRouter::new(router.clone(), local_reader); - let 
sim_router = SimulateTransport::new(raft_router.clone()); - - let raft_engine = RaftKv::new(sim_router.clone(), engines.kv.clone()); + let local_reader = LocalReader::new( + engines.kv.clone(), + StoreMetaDelegate::new(store_meta.clone(), engines.kv.clone()), + router.clone(), + ); // Create coprocessor. let mut coprocessor_host = CoprocessorHost::new(router.clone(), cfg.coprocessor.clone()); let region_info_accessor = RegionInfoAccessor::new(&mut coprocessor_host); + let raft_router = ServerRaftStoreRouter::new(router.clone(), local_reader); + let sim_router = SimulateTransport::new(raft_router.clone()); + let raft_engine = RaftKv::new( + sim_router.clone(), + engines.kv.clone(), + region_info_accessor.region_leaders(), + ); + if let Some(hooks) = self.coprocessor_hooks.get(&node_id) { for hook in hooks { hook(&mut coprocessor_host); @@ -302,7 +322,11 @@ impl ServerCluster { raft_engine.clone(), )); - let mut engine = RaftKv::new(sim_router.clone(), engines.kv.clone()); + let mut engine = RaftKv::new( + sim_router.clone(), + engines.kv.clone(), + region_info_accessor.region_leaders(), + ); if let Some(scheduler) = self.txn_extra_schedulers.remove(&node_id) { engine.set_txn_extra_scheduler(scheduler); } @@ -314,32 +338,30 @@ impl ServerCluster { let (tx, _rx) = std::sync::mpsc::channel(); let mut gc_worker = GcWorker::new( engine.clone(), - sim_router.clone(), tx, cfg.gc.clone(), Default::default(), + Arc::new(region_info_accessor.clone()), ); - gc_worker.start().unwrap(); - gc_worker - .start_observe_lock_apply(&mut coprocessor_host, concurrency_manager.clone()) - .unwrap(); + gc_worker.start(node_id).unwrap(); let rts_worker = if cfg.resolved_ts.enable { // Resolved ts worker let mut rts_worker = LazyWorker::new("resolved-ts"); let rts_ob = resolved_ts::Observer::new(rts_worker.scheduler()); rts_ob.register_to(&mut coprocessor_host); + // resolved ts endpoint needs store id. 
+ store_meta.lock().unwrap().store_id = Some(node_id); // Resolved ts endpoint let rts_endpoint = resolved_ts::Endpoint::new( &cfg.resolved_ts, rts_worker.scheduler(), - raft_router.clone(), + CdcRaftRouter(raft_router), store_meta.clone(), self.pd_client.clone(), concurrency_manager.clone(), self.env.clone(), self.security_mgr.clone(), - resolved_ts::DummySinker::new(), ); // Start the worker rts_worker.start(rts_endpoint); @@ -349,25 +371,26 @@ impl ServerCluster { }; if ApiVersion::V2 == F::TAG { - let causal_ts_provider = Arc::new( + let causal_ts_provider: Arc = Arc::new( block_on(causal_ts::BatchTsoProvider::new_opt( self.pd_client.clone(), cfg.causal_ts.renew_interval.0, + cfg.causal_ts.alloc_ahead_buffer.0, cfg.causal_ts.renew_batch_min_size, + cfg.causal_ts.renew_batch_max_size, )) - .unwrap(), + .unwrap() + .into(), ); - self.causal_ts_providers - .insert(node_id, causal_ts_provider.clone()); - let causal_ob = causal_ts::CausalObserver::new(causal_ts_provider); - causal_ob.register_to(&mut coprocessor_host); + self.causal_ts_providers.insert(node_id, causal_ts_provider); } // Start resource metering. 
let (res_tag_factory, collector_reg_handle, rsmeter_cleanup) = self.init_resource_metering(&cfg.resource_metering); - let check_leader_runner = CheckLeaderRunner::new(store_meta.clone()); + let check_leader_runner = + CheckLeaderRunner::new(store_meta.clone(), coprocessor_host.clone()); let check_leader_scheduler = bg_worker.start("check-leader", check_leader_runner); let mut lock_mgr = LockManager::new(&cfg.pessimistic_txn); @@ -375,20 +398,29 @@ impl ServerCluster { cfg.quota.foreground_cpu_time, cfg.quota.foreground_write_bandwidth, cfg.quota.foreground_read_bandwidth, + cfg.quota.background_cpu_time, + cfg.quota.background_write_bandwidth, + cfg.quota.background_read_bandwidth, cfg.quota.max_delay_duration, + cfg.quota.enable_auto_tune, )); - let store = create_raft_storage::<_, _, _, F>( - engine, + let extension = engine.raft_extension(); + let store = Storage::<_, _, F>::from_engine( + engine.clone(), &cfg.storage, storage_read_pool.handle(), lock_mgr.clone(), concurrency_manager.clone(), lock_mgr.get_storage_dynamic_configs(), - Arc::new(FlowController::empty()), + Arc::new(FlowController::Singleton(EngineFlowController::empty())), pd_sender, res_tag_factory.clone(), quota_limiter.clone(), self.pd_client.feature_gate().clone(), + self.get_causal_ts_provider(node_id), + resource_manager + .as_ref() + .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), )?; self.storages.insert(node_id, raft_engine); @@ -410,8 +442,8 @@ impl ServerCluster { let import_service = ImportSstService::new( cfg.import.clone(), cfg.raft_store.raft_entry_max_size, - sim_router.clone(), - engines.kv.clone(), + engine, + LocalTablets::Singleton(engines.kv.clone()), Arc::clone(&importer), ); @@ -420,12 +452,14 @@ impl ServerCluster { // Create pd client, snapshot manager, server. 
let (resolver, state) = - resolve::new_resolver(Arc::clone(&self.pd_client), &bg_worker, router.clone()); + resolve::new_resolver(Arc::clone(&self.pd_client), &bg_worker, extension.clone()); let snap_mgr = SnapManagerBuilder::default() - .max_write_bytes_per_sec(cfg.server.snap_max_write_bytes_per_sec.0 as i64) + .max_write_bytes_per_sec(cfg.server.snap_io_max_bytes_per_sec.0 as i64) .max_total_size(cfg.server.snap_max_total_size.0) .encryption_key_manager(key_manager) .max_per_file_size(cfg.raft_store.max_snapshot_file_raw_size.0) + .enable_multi_snapshot_files(true) + .enable_receive_tablet_snapshot(cfg.raft_store.enable_v2_compatible_learner) .build(tmp_str); self.snap_mgrs.insert(node_id, snap_mgr.clone()); let server_cfg = Arc::new(VersionTrack::new(cfg.server.clone())); @@ -448,14 +482,18 @@ impl ServerCluster { TokioBuilder::new_multi_thread() .thread_name(thd_name!("debugger")) .worker_threads(1) + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) .build() .unwrap(), ); let debug_thread_handle = debug_thread_pool.handle().clone(); let debug_service = DebugService::new( engines.clone(), + None, + None, debug_thread_handle, - raft_router, + extension, ConfigController::default(), ); @@ -464,8 +502,8 @@ impl ServerCluster { let mut raft_store = cfg.raft_store.clone(); raft_store .validate( - cfg.coprocessor.region_split_size, - cfg.coprocessor.enable_region_bucket, + cfg.coprocessor.region_split_size(), + cfg.coprocessor.enable_region_bucket(), cfg.coprocessor.region_bucket_size, ) .unwrap(); @@ -479,6 +517,7 @@ impl ServerCluster { state, bg_worker.clone(), Some(health_service.clone()), + None, ); node.try_bootstrap_store(engines.clone())?; let node_id = node.id(); @@ -491,15 +530,15 @@ impl ServerCluster { store.clone(), copr.clone(), copr_v2.clone(), - sim_router.clone(), resolver.clone(), - snap_mgr.clone(), + tikv_util::Either::Left(snap_mgr.clone()), gc_worker.clone(), check_leader_scheduler.clone(), self.env.clone(), None, 
debug_thread_pool.clone(), health_service.clone(), + resource_manager.clone(), ) .unwrap(); svr.register_service(create_import_sst(import_service.clone())); @@ -529,11 +568,13 @@ impl ServerCluster { cfg.server.addr = format!("{}", addr); let trans = server.transport(); let simulate_trans = SimulateTransport::new(trans); + let max_grpc_thread_count = cfg.server.grpc_concurrency; let server_cfg = Arc::new(VersionTrack::new(cfg.server.clone())); // Register the role change observer of the lock manager. lock_mgr.register_detector_role_change_observer(&mut coprocessor_host); + let max_unified_read_pool_thread_count = cfg.readpool.unified.max_thread_count; let pessimistic_txn_cfg = cfg.tikv.pessimistic_txn; let split_check_runner = @@ -541,7 +582,14 @@ impl ServerCluster { let split_check_scheduler = bg_worker.start("split-check", split_check_runner); let split_config_manager = SplitConfigManager::new(Arc::new(VersionTrack::new(cfg.tikv.split))); - let auto_split_controller = AutoSplitController::new(split_config_manager); + let auto_split_controller = AutoSplitController::new( + split_config_manager, + max_grpc_thread_count, + max_unified_read_pool_thread_count, + None, + ); + + let causal_ts_provider = self.get_causal_ts_provider(node_id); node.start( engines, simulate_trans.clone(), @@ -554,6 +602,7 @@ impl ServerCluster { auto_split_controller, concurrency_manager.clone(), collector_reg_handle, + causal_ts_provider, )?; assert!(node_id == 0 || node_id == node.id()); let node_id = node.id(); @@ -575,7 +624,9 @@ impl ServerCluster { ) .unwrap(); - server.start(server_cfg, security_mgr).unwrap(); + server + .start(server_cfg, security_mgr, NoSnapshotCache) + .unwrap(); self.metas.insert( node_id, @@ -595,6 +646,8 @@ impl ServerCluster { self.concurrency_managers .insert(node_id, concurrency_manager); + let client = RaftClient::new(node_id, self.conn_builder.clone()); + self.raft_clients.insert(node_id, client); Ok(node_id) } } @@ -609,6 +662,7 @@ impl Simulator for 
ServerCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, + resource_manager: &Option>, ) -> ServerResult { dispatch_api_version!( cfg.storage.api_version(), @@ -620,6 +674,7 @@ impl Simulator for ServerCluster { key_manager, router, system, + resource_manager, ) ) } @@ -646,6 +701,7 @@ impl Simulator for ServerCluster { } (meta.rsmeter_cleanup)(); } + let _ = self.raft_clients.remove(&node_id); } fn get_node_ids(&self) -> HashSet { @@ -667,13 +723,13 @@ impl Simulator for ServerCluster { } fn async_read( - &self, + &mut self, node_id: u64, batch_id: Option, request: RaftCmdRequest, cb: Callback, ) { - match self.metas.get(&node_id) { + match self.metas.get_mut(&node_id) { None => { let e: RaftError = box_err!("missing sender for store {}", node_id); let mut resp = RaftCmdResponse::default(); @@ -687,8 +743,12 @@ impl Simulator for ServerCluster { } fn send_raft_msg(&mut self, raft_msg: raft_serverpb::RaftMessage) -> Result<()> { - self.raft_client.send(raft_msg).unwrap(); - self.raft_client.flush(); + let from_store = raft_msg.get_from_peer().store_id; + assert_ne!(from_store, 0); + if let Some(client) = self.raft_clients.get_mut(&from_store) { + client.send(raft_msg).unwrap(); + client.flush(); + } Ok(()) } @@ -740,7 +800,7 @@ impl Cluster { ctx.set_peer(leader); ctx.set_region_epoch(epoch); - let storage = self.sim.rl().storages.get(&store_id).unwrap().clone(); + let mut storage = self.sim.rl().storages.get(&store_id).unwrap().clone(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -755,6 +815,27 @@ impl Cluster { } panic!("failed to get snapshot of region {}", region_id); } + + pub fn raft_extension(&self, node_id: u64) -> SimulateRaftExtension { + self.sim.rl().storages[&node_id].raft_extension() + } + + pub fn get_addr(&self, node_id: u64) -> String { + self.sim.rl().get_addr(node_id) + } + + pub fn register_hook( + &self, + node_id: u64, + register: Box)>, + ) { + self.sim + .wl() + .coprocessor_hooks + 
.entry(node_id) + .or_default() + .push(register); + } } pub fn new_server_cluster(id: u64, count: usize) -> Cluster { diff --git a/components/test_raftstore/src/transport_simulate.rs b/components/test_raftstore/src/transport_simulate.rs index 9ebba64aa48..ef569e3987a 100644 --- a/components/test_raftstore/src/transport_simulate.rs +++ b/components/test_raftstore/src/transport_simulate.rs @@ -162,7 +162,7 @@ impl SimulateTransport { } } -fn filter_send( +pub fn filter_send( filters: &Arc>>>, msg: RaftMessage, mut h: H, @@ -251,7 +251,7 @@ impl> RaftStoreRouter for SimulateT impl> LocalReadRouter for SimulateTransport { fn read( - &self, + &mut self, read_id: Option, req: RaftCmdRequest, cb: Callback, @@ -259,7 +259,7 @@ impl> LocalReadRouter for SimulateT self.ch.read(read_id, req, cb) } - fn release_snapshot_cache(&self) { + fn release_snapshot_cache(&mut self) { self.ch.release_snapshot_cache() } } @@ -273,7 +273,7 @@ pub struct DefaultFilterFactory(PhantomData); impl FilterFactory for DefaultFilterFactory { fn generate(&self, _: u64) -> Vec> { - vec![Box::new(F::default())] + vec![Box::::default()] } } @@ -314,9 +314,9 @@ impl FilterFactory for PartitionFilterFactory { node_ids: self.s2.clone(), })]; } - return vec![Box::new(PartitionFilter { + vec![Box::new(PartitionFilter { node_ids: self.s1.clone(), - })]; + })] } } @@ -507,10 +507,11 @@ impl Filter for SnapshotFilter { } } -/// `CollectSnapshotFilter` is a simulation transport filter to simulate the simultaneous delivery -/// of multiple snapshots from different peers. It collects the snapshots from different -/// peers and drop the subsequent snapshots from the same peers. Currently, if there are -/// more than 1 snapshots in this filter, all the snapshots will be dilivered at once. +/// `CollectSnapshotFilter` is a simulation transport filter to simulate the +/// simultaneous delivery of multiple snapshots from different peers. 
It +/// collects the snapshots from different peers and drop the subsequent +/// snapshots from the same peers. Currently, if there are more than 1 snapshots +/// in this filter, all the snapshots will be delivered at once. pub struct CollectSnapshotFilter { dropped: AtomicBool, stale: AtomicBool, @@ -753,10 +754,11 @@ impl Filter for LeadingDuplicatedSnapshotFilter { } } -/// `RandomLatencyFilter` is a transport filter to simulate randomized network latency. -/// Based on a randomized rate, `RandomLatencyFilter` will decide whether to delay -/// the sending of any message. It's could be used to simulate the message sending -/// in a network with random latency, where messages could be delayed, disordered or lost. +/// `RandomLatencyFilter` is a transport filter to simulate randomized network +/// latency. Based on a randomized rate, `RandomLatencyFilter` will decide +/// whether to delay the sending of any message. It's could be used to simulate +/// the message sending in a network with random latency, where messages could +/// be delayed, disordered or lost. 
pub struct RandomLatencyFilter { delay_rate: u32, delayed_msgs: Mutex>, @@ -829,18 +831,18 @@ impl Filter for LeaseReadFilter { #[derive(Clone)] pub struct DropMessageFilter { - ty: MessageType, + retain: Arc bool + Sync + Send>, } impl DropMessageFilter { - pub fn new(ty: MessageType) -> DropMessageFilter { - DropMessageFilter { ty } + pub fn new(retain: Arc bool + Sync + Send>) -> DropMessageFilter { + DropMessageFilter { retain } } } impl Filter for DropMessageFilter { fn before(&self, msgs: &mut Vec) -> Result<()> { - msgs.retain(|m| m.get_message().get_msg_type() != self.ty); + msgs.retain(|m| (self.retain)(m)); Ok(()) } } diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 96082bc6fbb..cdfe5c8f475 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -4,7 +4,7 @@ use std::{ fmt::Write, path::Path, str::FromStr, - sync::{mpsc, Arc}, + sync::{mpsc, Arc, Mutex}, thread, time::Duration, }; @@ -13,23 +13,19 @@ use collections::HashMap; use encryption_export::{ data_key_manager_from_config, DataKeyManager, FileConfig, MasterKeyConfig, }; -use engine_rocks::{config::BlobRunMode, raw::DB, Compat, RocksEngine, RocksSnapshot}; +use engine_rocks::{config::BlobRunMode, RocksEngine, RocksSnapshot, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ - Engines, Iterable, Peekable, RaftEngineDebug, RaftEngineReadOnly, TabletFactory, ALL_CFS, + CfNamesExt, Engines, Iterable, KvEngine, Peekable, RaftEngineDebug, RaftEngineReadOnly, CF_DEFAULT, CF_RAFT, }; -use file_system::IORateLimiter; +use file_system::IoRateLimiter; use futures::executor::block_on; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ encryptionpb::EncryptionMethod, - kvrpcpb::*, + kvrpcpb::{PrewriteRequestPessimisticAction::*, *}, metapb::{self, RegionEpoch}, - pdpb::{ - ChangePeer, ChangePeerV2, CheckPolicy, Merge, RegionHeartbeatResponse, SplitRegion, - TransferLeader, - }, 
raft_cmdpb::{ AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, CmdType, RaftCmdRequest, RaftCmdResponse, Request, StatusCmdType, StatusRequest, @@ -40,24 +36,38 @@ use kvproto::{ tikvpb::TikvClient, }; use pd_client::PdClient; +use protobuf::RepeatedField; use raft::eraftpb::ConfChangeType; -pub use raftstore::store::util::{find_peer, new_learner_peer, new_peer}; use raftstore::{ store::{fsm::RaftRouter, *}, - Result, + RaftRouterCompactedEventSender, Result, }; use rand::RngCore; -use server::server::ConfiguredRaftEngine; +use server::common::ConfiguredRaftEngine; use tempfile::TempDir; -use tikv::{config::*, server::KvEngineFactoryBuilder, storage::point_key_range}; +use test_pd_client::TestPdClient; +use tikv::{ + config::*, + server::KvEngineFactoryBuilder, + storage::{ + kv::{SnapContext, SnapshotExt}, + point_key_range, Engine, Snapshot, + }, +}; +pub use tikv_util::store::{find_peer, new_learner_peer, new_peer}; use tikv_util::{config::*, escape, time::ThreadReadId, worker::LazyWorker, HandyRwLock}; use txn_types::Key; -use crate::{Cluster, Config, ServerCluster, Simulator, TestPdClient}; +use crate::{Cluster, Config, RawEngine, ServerCluster, Simulator}; -pub fn must_get(engine: &Arc, cf: &str, key: &[u8], value: Option<&[u8]>) { +pub fn must_get( + engine: &impl RawEngine, + cf: &str, + key: &[u8], + value: Option<&[u8]>, +) { for _ in 1..300 { - let res = engine.c().get_value_cf(cf, &keys::data_key(key)).unwrap(); + let res = engine.get_value_cf(cf, &keys::data_key(key)).unwrap(); if let (Some(value), Some(res)) = (value, res.as_ref()) { assert_eq!(value, &res[..]); return; @@ -68,7 +78,7 @@ pub fn must_get(engine: &Arc, cf: &str, key: &[u8], value: Option<&[u8]>) { thread::sleep(Duration::from_millis(20)); } debug!("last try to get {}", log_wrappers::hex_encode_upper(key)); - let res = engine.c().get_value_cf(cf, &keys::data_key(key)).unwrap(); + let res = engine.get_value_cf(cf, &keys::data_key(key)).unwrap(); if value.is_none() && 
res.is_none() || value.is_some() && res.is_some() && value.unwrap() == &*res.unwrap() { @@ -81,19 +91,24 @@ pub fn must_get(engine: &Arc, cf: &str, key: &[u8], value: Option<&[u8]>) { ) } -pub fn must_get_equal(engine: &Arc, key: &[u8], value: &[u8]) { +pub fn must_get_equal(engine: &impl RawEngine, key: &[u8], value: &[u8]) { must_get(engine, "default", key, Some(value)); } -pub fn must_get_none(engine: &Arc, key: &[u8]) { +pub fn must_get_none(engine: &impl RawEngine, key: &[u8]) { must_get(engine, "default", key, None); } -pub fn must_get_cf_equal(engine: &Arc, cf: &str, key: &[u8], value: &[u8]) { +pub fn must_get_cf_equal( + engine: &impl RawEngine, + cf: &str, + key: &[u8], + value: &[u8], +) { must_get(engine, cf, key, Some(value)); } -pub fn must_get_cf_none(engine: &Arc, cf: &str, key: &[u8]) { +pub fn must_get_cf_none(engine: &impl RawEngine, cf: &str, key: &[u8]) { must_get(engine, cf, key, None); } @@ -104,10 +119,10 @@ pub fn must_region_cleared(engine: &Engines, region assert_eq!(state.get_state(), PeerState::Tombstone, "{:?}", state); let start_key = keys::data_key(region.get_start_key()); let end_key = keys::data_key(region.get_end_key()); - for cf in ALL_CFS { + for cf in engine.kv.cf_names() { engine .kv - .scan_cf(cf, &start_key, &end_key, false, |k, v| { + .scan(cf, &start_key, &end_key, false, |k, v| { panic!( "[region {}] unexpected ({:?}, {:?}) in cf {:?}", id, k, v, cf @@ -131,10 +146,10 @@ pub fn must_region_cleared(engine: &Engines, region } lazy_static! { - static ref TEST_CONFIG: TiKvConfig = { + pub static ref TEST_CONFIG: TikvConfig = { let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); let common_test_cfg = manifest_dir.join("src/common-test.toml"); - TiKvConfig::from_file(&common_test_cfg, None).unwrap_or_else(|e| { + TikvConfig::from_file(&common_test_cfg, None).unwrap_or_else(|e| { panic!( "invalid auto generated configuration file {}, err {}", manifest_dir.display(), @@ -144,13 +159,13 @@ lazy_static! 
{ }; } -pub fn new_tikv_config(cluster_id: u64) -> TiKvConfig { +pub fn new_tikv_config(cluster_id: u64) -> TikvConfig { let mut cfg = TEST_CONFIG.clone(); cfg.server.cluster_id = cluster_id; cfg } -pub fn new_tikv_config_with_api_ver(cluster_id: u64, api_ver: ApiVersion) -> TiKvConfig { +pub fn new_tikv_config_with_api_ver(cluster_id: u64, api_ver: ApiVersion) -> TikvConfig { let mut cfg = TEST_CONFIG.clone(); cfg.server.cluster_id = cluster_id; cfg.storage.set_api_version(api_ver); @@ -303,7 +318,6 @@ pub fn new_transfer_leader_cmd(peer: metapb::Peer) -> AdminRequest { cmd } -#[allow(dead_code)] pub fn new_prepare_merge(target_region: metapb::Region) -> AdminRequest { let mut cmd = AdminRequest::default(); cmd.set_cmd_type(AdminCmdType::PrepareMerge); @@ -333,59 +347,6 @@ pub fn is_error_response(resp: &RaftCmdResponse) -> bool { resp.get_header().has_error() } -pub fn new_pd_change_peer( - change_type: ConfChangeType, - peer: metapb::Peer, -) -> RegionHeartbeatResponse { - let mut change_peer = ChangePeer::default(); - change_peer.set_change_type(change_type); - change_peer.set_peer(peer); - - let mut resp = RegionHeartbeatResponse::default(); - resp.set_change_peer(change_peer); - resp -} - -pub fn new_pd_change_peer_v2(changes: Vec) -> RegionHeartbeatResponse { - let mut change_peer = ChangePeerV2::default(); - change_peer.set_changes(changes.into()); - - let mut resp = RegionHeartbeatResponse::default(); - resp.set_change_peer_v2(change_peer); - resp -} - -pub fn new_split_region(policy: CheckPolicy, keys: Vec>) -> RegionHeartbeatResponse { - let mut split_region = SplitRegion::default(); - split_region.set_policy(policy); - split_region.set_keys(keys.into()); - let mut resp = RegionHeartbeatResponse::default(); - resp.set_split_region(split_region); - resp -} - -pub fn new_pd_transfer_leader( - peer: metapb::Peer, - peers: Vec, -) -> RegionHeartbeatResponse { - let mut transfer_leader = TransferLeader::default(); - transfer_leader.set_peer(peer); - 
transfer_leader.set_peers(peers.into()); - - let mut resp = RegionHeartbeatResponse::default(); - resp.set_transfer_leader(transfer_leader); - resp -} - -pub fn new_pd_merge_region(target_region: metapb::Region) -> RegionHeartbeatResponse { - let mut merge = Merge::default(); - merge.set_target(target_region); - - let mut resp = RegionHeartbeatResponse::default(); - resp.set_merge(merge); - resp -} - #[derive(Default)] struct CallbackLeakDetector { called: bool, @@ -420,7 +381,7 @@ pub fn make_cb(cmd: &RaftCmdRequest) -> (Callback, mpsc::Receiver let (tx, rx) = mpsc::channel(); let mut detector = CallbackLeakDetector::default(); let cb = if is_read { - Callback::Read(Box::new(move |resp: ReadResponse| { + Callback::read(Box::new(move |resp: ReadResponse| { detector.called = true; // we don't care error actually. let _ = tx.send(resp.response); @@ -485,7 +446,7 @@ pub fn async_read_on_peer( request.mut_header().set_peer(peer); request.mut_header().set_replica_read(replica_read); let (tx, rx) = mpsc::sync_channel(1); - let cb = Callback::Read(Box::new(move |resp| drop(tx.send(resp.response)))); + let cb = Callback::read(Box::new(move |resp| drop(tx.send(resp.response)))); cluster.sim.wl().async_read(node_id, None, request, cb); rx } @@ -508,7 +469,7 @@ pub fn batch_read_on_peer( ); request.mut_header().set_peer(peer.clone()); let t = tx.clone(); - let cb = Callback::Read(Box::new(move |resp| { + let cb = Callback::read(Box::new(move |resp| { t.send((len, resp)).unwrap(); })); cluster @@ -562,7 +523,7 @@ pub fn async_read_index_on_peer( ); request.mut_header().set_peer(peer); let (tx, rx) = mpsc::sync_channel(1); - let cb = Callback::Read(Box::new(move |resp| drop(tx.send(resp.response)))); + let cb = Callback::read(Box::new(move |resp| drop(tx.send(resp.response)))); cluster.sim.wl().async_read(node_id, None, request, cb); rx } @@ -625,13 +586,15 @@ pub fn must_contains_error(resp: &RaftCmdResponse, msg: &str) { pub fn create_test_engine( // TODO: pass it in for all 
cases. router: Option>, - limiter: Option>, + limiter: Option>, cfg: &Config, ) -> ( Engines, Option>, TempDir, LazyWorker, + Arc, + Option>, ) { let dir = test_util::temp_dir("test_cluster", cfg.prefer_mem); let mut cfg = cfg.clone(); @@ -642,7 +605,10 @@ pub fn create_test_engine( data_key_manager_from_config(&cfg.security.encryption, dir.path().to_str().unwrap()) .unwrap() .map(Arc::new); - let cache = cfg.storage.block_cache.build_shared_cache(); + let cache = cfg + .storage + .block_cache + .build_shared_cache(cfg.storage.engine); let env = cfg .build_shared_rocks_env(key_manager.clone(), limiter) .unwrap(); @@ -650,20 +616,26 @@ pub fn create_test_engine( let sst_worker = LazyWorker::new("sst-recovery"); let scheduler = sst_worker.scheduler(); - let raft_engine = RaftTestEngine::build(&cfg, &env, &key_manager, &cache); + let (raft_engine, raft_statistics) = RaftTestEngine::build(&cfg, &env, &key_manager, &cache); let mut builder = - KvEngineFactoryBuilder::new(env, &cfg, dir.path()).sst_recovery_sender(Some(scheduler)); - if let Some(cache) = cache { - builder = builder.block_cache(cache); - } + KvEngineFactoryBuilder::new(env, &cfg, cache).sst_recovery_sender(Some(scheduler)); if let Some(router) = router { - builder = builder.compaction_filter_router(router); + builder = builder.compaction_event_sender(Arc::new(RaftRouterCompactedEventSender { + router: Mutex::new(router), + })); } let factory = builder.build(); - let engine = factory.create_tablet().unwrap(); + let engine = factory.create_shared_db(dir.path()).unwrap(); let engines = Engines::new(engine, raft_engine); - (engines, key_manager, dir, sst_worker) + ( + engines, + key_manager, + dir, + sst_worker, + factory.rocks_statistics(), + raft_statistics, + ) } pub fn configure_for_request_snapshot(cluster: &mut Cluster) { @@ -680,54 +652,54 @@ pub fn configure_for_hibernate(cluster: &mut Cluster) { cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::secs(10); } -pub fn 
configure_for_snapshot(cluster: &mut Cluster) { +pub fn configure_for_snapshot(config: &mut Config) { // Truncate the log quickly so that we can force sending snapshot. - cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(20); - cluster.cfg.raft_store.raft_log_gc_count_limit = Some(2); - cluster.cfg.raft_store.merge_max_log_gap = 1; - cluster.cfg.raft_store.snap_mgr_gc_tick_interval = ReadableDuration::millis(50); + config.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(20); + config.raft_store.raft_log_gc_count_limit = Some(2); + config.raft_store.merge_max_log_gap = 1; + config.raft_store.snap_mgr_gc_tick_interval = ReadableDuration::millis(50); } -pub fn configure_for_merge(cluster: &mut Cluster) { +pub fn configure_for_merge(config: &mut Config) { // Avoid log compaction which will prevent merge. - cluster.cfg.raft_store.raft_log_gc_threshold = 1000; - cluster.cfg.raft_store.raft_log_gc_count_limit = Some(1000); - cluster.cfg.raft_store.raft_log_gc_size_limit = Some(ReadableSize::mb(20)); + config.raft_store.raft_log_gc_threshold = 1000; + config.raft_store.raft_log_gc_count_limit = Some(1000); + config.raft_store.raft_log_gc_size_limit = Some(ReadableSize::mb(20)); // Make merge check resume quickly. - cluster.cfg.raft_store.merge_check_tick_interval = ReadableDuration::millis(100); + config.raft_store.merge_check_tick_interval = ReadableDuration::millis(100); // When isolated, follower relies on stale check tick to detect failure leader, // choose a smaller number to make it recover faster. 
- cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(500); + config.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(500); } -pub fn ignore_merge_target_integrity(cluster: &mut Cluster) { - cluster.cfg.raft_store.dev_assert = false; - cluster.pd_client.ignore_merge_target_integrity(); +pub fn ignore_merge_target_integrity(config: &mut Config, pd_client: &TestPdClient) { + config.raft_store.dev_assert = false; + pd_client.ignore_merge_target_integrity(); } -pub fn configure_for_lease_read( - cluster: &mut Cluster, +pub fn configure_for_lease_read( + cfg: &mut Config, base_tick_ms: Option, election_ticks: Option, ) -> Duration { if let Some(base_tick_ms) = base_tick_ms { - cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(base_tick_ms); + cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(base_tick_ms); } - let base_tick_interval = cluster.cfg.raft_store.raft_base_tick_interval.0; + let base_tick_interval = cfg.raft_store.raft_base_tick_interval.0; if let Some(election_ticks) = election_ticks { - cluster.cfg.raft_store.raft_election_timeout_ticks = election_ticks; + cfg.raft_store.raft_election_timeout_ticks = election_ticks; } - let election_ticks = cluster.cfg.raft_store.raft_election_timeout_ticks as u32; + let election_ticks = cfg.raft_store.raft_election_timeout_ticks as u32; let election_timeout = base_tick_interval * election_ticks; // Adjust max leader lease. - cluster.cfg.raft_store.raft_store_max_leader_lease = + cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(election_timeout - base_tick_interval); - // Use large peer check interval, abnormal and max leader missing duration to make a valid config, - // that is election timeout x 2 < peer stale state check < abnormal < max leader missing duration. 
- cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration(election_timeout * 3); - cluster.cfg.raft_store.abnormal_leader_missing_duration = - ReadableDuration(election_timeout * 4); - cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration(election_timeout * 5); + // Use large peer check interval, abnormal and max leader missing duration to + // make a valid config, that is election timeout x 2 < peer stale state + // check < abnormal < max leader missing duration. + cfg.raft_store.peer_stale_state_check_interval = ReadableDuration(election_timeout * 3); + cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration(election_timeout * 4); + cfg.raft_store.max_leader_missing_duration = ReadableDuration(election_timeout * 5); election_timeout } @@ -870,6 +842,41 @@ pub fn must_kv_read_equal(client: &TikvClient, ctx: Context, key: Vec, val: assert_eq!(get_resp.take_value(), val); } +pub fn write_and_read_key( + client: &TikvClient, + ctx: &Context, + ts: &mut u64, + k: Vec, + v: Vec, +) { + // Prewrite + let prewrite_start_version = *ts + 1; + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(k.clone()); + mutation.set_value(v.clone()); + must_kv_prewrite( + client, + ctx.clone(), + vec![mutation], + k.clone(), + prewrite_start_version, + ); + // Commit + let commit_version = *ts + 2; + must_kv_commit( + client, + ctx.clone(), + vec![k.clone()], + prewrite_start_version, + commit_version, + commit_version, + ); + // Get + *ts += 3; + must_kv_read_equal(client, ctx.clone(), k, v, *ts); +} + pub fn kv_read(client: &TikvClient, ctx: Context, key: Vec, ts: u64) -> GetResponse { let mut get_req = GetRequest::default(); get_req.set_context(ctx); @@ -878,6 +885,19 @@ pub fn kv_read(client: &TikvClient, ctx: Context, key: Vec, ts: u64) -> GetR client.kv_get(&get_req).unwrap() } +pub fn kv_batch_read( + client: &TikvClient, + ctx: Context, + keys: Vec>, + ts: u64, +) -> BatchGetResponse { + let mut 
batch_get_req = BatchGetRequest::default(); + batch_get_req.set_context(ctx); + batch_get_req.set_keys(RepeatedField::from(keys)); + batch_get_req.set_version(ts); + client.kv_batch_get(&batch_get_req).unwrap() +} + pub fn must_kv_prewrite_with( client: &TikvClient, ctx: Context, @@ -891,7 +911,7 @@ pub fn must_kv_prewrite_with( let mut prewrite_req = PrewriteRequest::default(); prewrite_req.set_context(ctx); if for_update_ts != 0 { - prewrite_req.is_pessimistic_lock = vec![true; muts.len()]; + prewrite_req.pessimistic_actions = vec![DoPessimisticCheck; muts.len()]; } prewrite_req.set_mutations(muts.into_iter().collect()); prewrite_req.primary_lock = pk; @@ -914,7 +934,6 @@ pub fn must_kv_prewrite_with( ); } -// Disk full test interface. pub fn try_kv_prewrite_with( client: &TikvClient, ctx: Context, @@ -928,7 +947,7 @@ pub fn try_kv_prewrite_with( let mut prewrite_req = PrewriteRequest::default(); prewrite_req.set_context(ctx); if for_update_ts != 0 { - prewrite_req.is_pessimistic_lock = vec![true; muts.len()]; + prewrite_req.pessimistic_actions = vec![DoPessimisticCheck; muts.len()]; } prewrite_req.set_mutations(muts.into_iter().collect()); prewrite_req.primary_lock = pk; @@ -1028,6 +1047,39 @@ pub fn kv_pessimistic_lock( kv_pessimistic_lock_with_ttl(client, ctx, keys, ts, for_update_ts, return_values, 20) } +pub fn kv_pessimistic_lock_resumable( + client: &TikvClient, + ctx: Context, + keys: Vec>, + ts: u64, + for_update_ts: u64, + wait_timeout: Option, + return_values: bool, + check_existence: bool, +) -> PessimisticLockResponse { + let mut req = PessimisticLockRequest::default(); + req.set_context(ctx); + let primary = keys[0].clone(); + let mut mutations = vec![]; + for key in keys { + let mut mutation = Mutation::default(); + mutation.set_op(Op::PessimisticLock); + mutation.set_key(key); + mutations.push(mutation); + } + req.set_mutations(mutations.into()); + req.primary_lock = primary; + req.start_version = ts; + req.for_update_ts = for_update_ts; + 
req.lock_ttl = 20; + req.is_first_lock = false; + req.wait_timeout = wait_timeout.unwrap_or(-1); + req.set_wake_up_mode(PessimisticLockWakeUpMode::WakeUpModeForceLock); + req.return_values = return_values; + req.check_existence = check_existence; + client.kv_pessimistic_lock(&req).unwrap() +} + pub fn kv_pessimistic_lock_with_ttl( client: &TikvClient, ctx: Context, @@ -1063,12 +1115,18 @@ pub fn must_kv_pessimistic_lock(client: &TikvClient, ctx: Context, key: Vec, assert!(resp.errors.is_empty(), "{:?}", resp.get_errors()); } -pub fn must_kv_pessimistic_rollback(client: &TikvClient, ctx: Context, key: Vec, ts: u64) { +pub fn must_kv_pessimistic_rollback( + client: &TikvClient, + ctx: Context, + key: Vec, + ts: u64, + for_update_ts: u64, +) { let mut req = PessimisticRollbackRequest::default(); req.set_context(ctx); req.set_keys(vec![key].into_iter().collect()); req.start_version = ts; - req.for_update_ts = ts; + req.for_update_ts = for_update_ts; let resp = client.kv_pessimistic_rollback(&req).unwrap(); assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); assert!(resp.errors.is_empty(), "{:?}", resp.get_errors()); @@ -1095,57 +1153,6 @@ pub fn must_check_txn_status( resp } -pub fn must_physical_scan_lock( - client: &TikvClient, - ctx: Context, - max_ts: u64, - start_key: &[u8], - limit: usize, -) -> Vec { - let mut req = PhysicalScanLockRequest::default(); - req.set_context(ctx); - req.set_max_ts(max_ts); - req.set_start_key(start_key.to_owned()); - req.set_limit(limit as _); - let mut resp = client.physical_scan_lock(&req).unwrap(); - resp.take_locks().into() -} - -pub fn register_lock_observer(client: &TikvClient, max_ts: u64) -> RegisterLockObserverResponse { - let mut req = RegisterLockObserverRequest::default(); - req.set_max_ts(max_ts); - client.register_lock_observer(&req).unwrap() -} - -pub fn must_register_lock_observer(client: &TikvClient, max_ts: u64) { - let resp = register_lock_observer(client, max_ts); - 
assert!(resp.get_error().is_empty(), "{:?}", resp.get_error()); -} - -pub fn check_lock_observer(client: &TikvClient, max_ts: u64) -> CheckLockObserverResponse { - let mut req = CheckLockObserverRequest::default(); - req.set_max_ts(max_ts); - client.check_lock_observer(&req).unwrap() -} - -pub fn must_check_lock_observer(client: &TikvClient, max_ts: u64, clean: bool) -> Vec { - let mut resp = check_lock_observer(client, max_ts); - assert!(resp.get_error().is_empty(), "{:?}", resp.get_error()); - assert_eq!(resp.get_is_clean(), clean); - resp.take_locks().into() -} - -pub fn remove_lock_observer(client: &TikvClient, max_ts: u64) -> RemoveLockObserverResponse { - let mut req = RemoveLockObserverRequest::default(); - req.set_max_ts(max_ts); - client.remove_lock_observer(&req).unwrap() -} - -pub fn must_remove_lock_observer(client: &TikvClient, max_ts: u64) { - let resp = remove_lock_observer(client, max_ts); - assert!(resp.get_error().is_empty(), "{:?}", resp.get_error()); -} - pub fn get_tso(pd_client: &TestPdClient) -> u64 { block_on(pd_client.get_tso()).unwrap().into_inner() } @@ -1167,7 +1174,8 @@ pub fn check_compacted( compact_count: u64, must_compacted: bool, ) -> bool { - // Every peer must have compacted logs, so the truncate log state index/term must > than before. + // Every peer must have compacted logs, so the truncate log state index/term + // must > than before. 
let mut compacted_idx = HashMap::default(); for (&id, engines) in all_engines { @@ -1257,6 +1265,48 @@ pub fn must_raw_get(client: &TikvClient, ctx: Context, key: Vec) -> Option, ts: u64) { - must_kv_pessimistic_rollback(&self.cli, self.ctx.clone(), key, ts) + must_kv_pessimistic_rollback(&self.cli, self.ctx.clone(), key, ts, ts) } } @@ -1350,3 +1400,33 @@ pub fn peer_on_store(region: &metapb::Region, store_id: u64) -> metapb::Peer { .unwrap() .clone() } + +pub fn wait_for_synced(cluster: &mut Cluster, node_id: u64, region_id: u64) { + let mut storage = cluster + .sim + .read() + .unwrap() + .storages + .get(&node_id) + .unwrap() + .clone(); + let leader = cluster.leader_of_region(region_id).unwrap(); + let epoch = cluster.get_region_epoch(region_id); + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + ctx.set_peer(leader); + ctx.set_region_epoch(epoch); + let snap_ctx = SnapContext { + pb_ctx: &ctx, + ..Default::default() + }; + let snapshot = storage.snapshot(snap_ctx).unwrap(); + let txn_ext = snapshot.txn_ext.clone().unwrap(); + for retry in 0..10 { + if txn_ext.is_max_ts_synced() { + break; + } + thread::sleep(Duration::from_millis(1 << retry)); + } + assert!(snapshot.ext().is_max_ts_synced()); +} diff --git a/components/match_template/Cargo.toml b/components/test_raftstore_macro/Cargo.toml similarity index 50% rename from components/match_template/Cargo.toml rename to components/test_raftstore_macro/Cargo.toml index 1f5f683ee92..7a05f56ed3d 100644 --- a/components/match_template/Cargo.toml +++ b/components/test_raftstore_macro/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "match_template" +name = "test_raftstore_macro" version = "0.0.1" edition = "2018" publish = false @@ -8,6 +8,6 @@ publish = false proc-macro = true [dependencies] -proc-macro2 = "1" +proc-macro2 = "1.0" quote = "1" -syn = { version = "1", features = ["full", "extra-traits", "fold"] } +syn = { version = "1", features = ["full", "extra-traits"] } diff --git 
a/components/test_raftstore_macro/src/lib.rs b/components/test_raftstore_macro/src/lib.rs new file mode 100644 index 00000000000..3c8239d9f3b --- /dev/null +++ b/components/test_raftstore_macro/src/lib.rs @@ -0,0 +1,153 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use proc_macro::TokenStream; +use proc_macro2::{TokenStream as TokenStream2, TokenTree}; +use quote::{quote, ToTokens}; +use syn::{parse_macro_input, parse_quote, Ident, ItemFn, Path}; + +/// test_case generates test cases using the cluster creation method provided. +/// It also imports the package related util module, which means we should locate +/// methods using Cluster in the related util modules. +/// +/// ex: +/// #[test_case(test_raftstore::new_node_cluster)] +/// #[test_case(test_raftstore::new_server_cluster)] +/// #[test_case(test_raftstore_v2::new_node_cluster)] +/// fn test_something() { +/// let cluster = new_cluster(...) +/// } +/// +/// It generates three test cases as following: +/// +/// #[cfg(test)] +/// mod test_something { +/// #[test] +/// fn test_raftstore_new_node_cluster() { +/// use test_raftstore::{util::*, new_node_cluster as new_cluster}; +/// let mut cluster = new_cluster(0, 1); +/// } +/// +/// #[test] +/// fn test_raftstore_new_server_cluster() { +/// use test_raftstore::{util::*, new_server_cluster as new_cluster}; +/// let mut cluster = new_cluster(0, 1); +/// } +/// +/// #[test] +/// fn test_raftstore_v2_new_node_cluster() { +/// use test_raftstore_v2::{util::*, new_node_cluster as new_cluster}; +/// let mut cluster = new_cluster(0, 1); +/// } +/// } +#[proc_macro_attribute] +pub fn test_case(arg: TokenStream, input: TokenStream) -> TokenStream { + let mut fn_item = parse_macro_input!(input as ItemFn); + let mut test_cases = vec![TokenStream2::from(arg)]; + let mut attrs_to_remove = vec![]; + + let legal_test_case_name: Path = parse_quote!(test_case); + for (idx, attr) in fn_item.attrs.iter().enumerate() { + if legal_test_case_name == attr.path { +
test_cases.push(attr.into_token_stream()); + attrs_to_remove.push(idx); + } + } + + for i in attrs_to_remove.into_iter().rev() { + fn_item.attrs.swap_remove(i); + } + + render_test_cases(test_cases, fn_item.clone()) +} + +fn render_test_cases(test_cases: Vec, fn_item: ItemFn) -> TokenStream { + let mut rendered_test_cases: Vec = vec![]; + for case in test_cases { + let mut item = fn_item.clone(); + + // Parse test case to get the package name and the method name + let (package, method) = parse_test_case(case); + let test_name = format!("{}_{}", package, method); + // Insert a use statement at the beginning of the test, + // ex: " use test_raftstore::new_node_cluster as new_cluster ", so we can use + // new_cluster in all situations. + item.block.stmts.insert( + 0, + syn::parse( + quote! { + use #package::{util::*, #method as new_cluster}; + } + .into(), + ) + .unwrap(), + ); + item.attrs.insert(0, parse_quote! { #[test] }); + let method_name = Ident::new(&test_name, item.sig.ident.span()); + item.sig.ident = method_name; + + rendered_test_cases.push(item.to_token_stream()); + } + + let mod_name = fn_item.sig.ident; + let output = quote! { + #[cfg(test)] + mod #mod_name { + #[allow(unused_imports)] + use super::*; + + #(#rendered_test_cases)* + } + }; + + output.into() +} + +// Parsing test case to get package name and method name. +// There are two cases that need to be considered +// 1. the first token is Ident type +// 2. the first token is Punct type +// +// use the following case as an example +// #[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore::new_server_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] +// fn test_something() {} +// +// The first case ( #[test_case(test_raftstore::new_node_cluster)] ) +// will be passed to the proc-macro "test_case" as the first argument and the +// #[test_case(...)] will be stripped off automatically. So the first token is +// the Ident type, namely "test_raftstore".
+// +// The other two cases are in the `attrs` field of ItemFn, and +// #[test_case(...)] are untouched. So the first token is Punct type. +fn parse_test_case(test_case: TokenStream2) -> (Ident, Ident) { + let mut iter = test_case.into_iter(); + let package = match iter.next().unwrap() { + // ex: test_raftstore::new_node_cluster + TokenTree::Ident(package) => package, + // ex: #[test_case(test_raftstore::new_node_cluster)] + TokenTree::Punct(_) => match iter.next().unwrap() { + TokenTree::Group(group) => { + let mut iter = group.stream().into_iter(); + iter.next(); + match iter.next().unwrap() { + TokenTree::Group(group) => { + let stream = group.stream(); + return parse_test_case(stream); + } + _ => panic!("Invalid token stream"), + } + } + _ => panic!("Invalid token stream"), + }, + _ => panic!("Invalid token stream"), + }; + // Skip two ':' + iter.next(); + iter.next(); + let method = match iter.next().unwrap() { + TokenTree::Ident(method) => method, + _ => panic!("Invalid token stream"), + }; + (package, method) +} diff --git a/components/test_sst_importer/Cargo.toml b/components/test_sst_importer/Cargo.toml index 71b8a69cf75..f951a6755e6 100644 --- a/components/test_sst_importer/Cargo.toml +++ b/components/test_sst_importer/Cargo.toml @@ -10,8 +10,8 @@ test = false [dependencies] crc32fast = "1.2" -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -keys = { path = "../keys", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } +keys = { workspace = true } +kvproto = { workspace = true } uuid = { version = "0.8.1", features = ["serde", "v4"] } diff --git a/components/test_sst_importer/src/lib.rs b/components/test_sst_importer/src/lib.rs index 9c9ef0496e9..2f8c195a6bf 100644 --- a/components/test_sst_importer/src/lib.rs +++ b/components/test_sst_importer/src/lib.rs @@
-3,12 +3,9 @@ use std::{collections::HashMap, fs, path::Path, sync::Arc}; use engine_rocks::{ - raw::{ - ColumnFamilyOptions, DBEntryType, DBOptions, Env, TablePropertiesCollector, - TablePropertiesCollectorFactory, - }, - raw_util::{new_engine, CFOptions}, - RocksEngine, RocksSstReader, RocksSstWriterBuilder, + raw::{DBEntryType, Env, TablePropertiesCollector, TablePropertiesCollectorFactory}, + util::new_engine_opt, + RocksCfOptions, RocksDbOptions, RocksEngine, RocksSstReader, RocksSstWriterBuilder, }; pub use engine_rocks::{RocksEngine as TestEngine, RocksSstWriter}; use engine_traits::{KvEngine, SstWriter, SstWriterBuilder}; @@ -32,36 +29,35 @@ pub fn new_test_engine_with_options_and_env( env: Option>, ) -> RocksEngine where - F: FnMut(&str, &mut ColumnFamilyOptions), + F: FnMut(&str, &mut RocksCfOptions), { let cf_opts = cfs .iter() .map(|cf| { - let mut opt = ColumnFamilyOptions::new(); + let mut opt = RocksCfOptions::default(); if let Some(ref env) = env { opt.set_env(env.clone()); } - apply(*cf, &mut opt); + apply(cf, &mut opt); opt.add_table_properties_collector_factory( "tikv.test_properties", TestPropertiesCollectorFactory::new(*cf), ); - CFOptions::new(*cf, opt) + (*cf, opt) }) .collect(); - let db_opts = env.map(|e| { - let mut opts = DBOptions::default(); + let db_opts = env.map_or_else(RocksDbOptions::default, |e| { + let mut opts = RocksDbOptions::default(); opts.set_env(e); opts }); - let db = new_engine(path, db_opts, cfs, Some(cf_opts)).expect("rocks test engine"); - RocksEngine::from_db(Arc::new(db)) + new_engine_opt(path, db_opts, cf_opts).expect("rocks test engine") } pub fn new_test_engine_with_options(path: &str, cfs: &[&str], apply: F) -> RocksEngine where - F: FnMut(&str, &mut ColumnFamilyOptions), + F: FnMut(&str, &mut RocksCfOptions), { new_test_engine_with_options_and_env(path, cfs, apply, None) } diff --git a/components/test_storage/Cargo.toml b/components/test_storage/Cargo.toml index 9a2c26aad22..b1172b5d559 100644 --- 
a/components/test_storage/Cargo.toml +++ b/components/test_storage/Cargo.toml @@ -21,13 +21,14 @@ test-engines-panic = [ ] [dependencies] -api_version = { path = "../api_version" } -collections = { path = "../collections" } +api_version = { workspace = true } +collections = { workspace = true } futures = "0.3" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } -pd_client = { path = "../pd_client", default-features = false } -raftstore = { path = "../raftstore", default-features = false } -test_raftstore = { path = "../test_raftstore", default-features = false } -tikv = { path = "../../", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } -txn_types = { path = "../txn_types", default-features = false } +kvproto = { workspace = true } +pd_client = { workspace = true } +raftstore = { workspace = true } +test_raftstore = { workspace = true } +tikv = { workspace = true } +tikv_util = { workspace = true } +tracker = { workspace = true } +txn_types = { workspace = true } diff --git a/components/test_storage/src/assert_storage.rs b/components/test_storage/src/assert_storage.rs index 7f057971785..3a641a322a2 100644 --- a/components/test_storage/src/assert_storage.rs +++ b/components/test_storage/src/assert_storage.rs @@ -1,7 +1,10 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
use api_version::{ApiV1, KvFormat}; -use kvproto::kvrpcpb::{Context, KeyRange, LockInfo}; +use kvproto::{ + kvrpcpb::{Context, KeyRange, LockInfo}, + metapb, +}; use test_raftstore::{Cluster, ServerCluster, SimulateEngine}; use tikv::storage::{ self, @@ -27,7 +30,7 @@ impl Default for AssertionStorage { fn default() -> Self { AssertionStorage { ctx: Context::default(), - store: SyncTestStorageBuilder::default().build().unwrap(), + store: SyncTestStorageBuilder::default().build(0).unwrap(), } } } @@ -36,7 +39,7 @@ impl AssertionStorage { pub fn new() -> Self { AssertionStorage { ctx: Context::default(), - store: SyncTestStorageBuilder::new().build().unwrap(), + store: SyncTestStorageBuilder::new().build(0).unwrap(), } } } @@ -51,19 +54,27 @@ impl AssertionStorage { (cluster, storage) } - pub fn update_with_key_byte(&mut self, cluster: &mut Cluster, key: &[u8]) { + pub fn update_with_key_byte( + &mut self, + cluster: &mut Cluster, + key: &[u8], + ) -> metapb::Region { // ensure the leader of range which contains current key has been elected cluster.must_get(key); let region = cluster.get_region(key); let leader = cluster.leader_of_region(region.get_id()).unwrap(); if leader.get_store_id() == self.ctx.get_peer().get_store_id() { - return; + return region; } + let store_id = leader.store_id; let engine = cluster.sim.rl().storages[&leader.get_id()].clone(); self.ctx.set_region_id(region.get_id()); self.ctx.set_region_epoch(region.get_region_epoch().clone()); self.ctx.set_peer(leader); - self.store = SyncTestStorageBuilder::from_engine(engine).build().unwrap(); + self.store = SyncTestStorageBuilder::from_engine(engine) + .build(store_id) + .unwrap(); + region } pub fn delete_ok_for_cluster( @@ -173,7 +184,7 @@ impl AssertionStorage { break; } self.expect_not_leader_or_stale_command(res.unwrap_err()); - self.update_with_key_byte(cluster, key) + self.update_with_key_byte(cluster, key); } assert!(success); @@ -188,7 +199,7 @@ impl AssertionStorage { break; } 
self.expect_not_leader_or_stale_command(res.unwrap_err()); - self.update_with_key_byte(cluster, key) + self.update_with_key_byte(cluster, key); } assert!(success); } @@ -197,16 +208,17 @@ impl AssertionStorage { &mut self, cluster: &mut Cluster, region_key: &[u8], + mut region: metapb::Region, safe_point: impl Into, ) { let safe_point = safe_point.into(); for _ in 0..3 { - let ret = self.store.gc(self.ctx.clone(), safe_point); + let ret = self.store.gc(region, self.ctx.clone(), safe_point); if ret.is_ok() { return; } self.expect_not_leader_or_stale_command(ret.unwrap_err()); - self.update_with_key_byte(cluster, region_key); + region = self.update_with_key_byte(cluster, region_key); } panic!("failed with 3 retry!"); } @@ -224,7 +236,9 @@ impl AssertionStorage { self.delete_ok_for_cluster(cluster, &key, 1000, 1050); self.get_none_from_cluster(cluster, &key, 2000); - self.gc_ok_for_cluster(cluster, &key, 2000); + + let region = cluster.get_region(&key); + self.gc_ok_for_cluster(cluster, &key, region, 2000); self.get_none_from_cluster(cluster, &key, 3000); } } @@ -240,7 +254,9 @@ impl AssertionStorage { pub fn get_err(&self, key: &[u8], ts: impl Into) { let key = Key::from_raw(key); - assert!(self.store.get(self.ctx.clone(), &key, ts.into()).is_err()); + self.store + .get(self.ctx.clone(), &key, ts.into()) + .unwrap_err(); } pub fn get_ok(&self, key: &[u8], ts: impl Into, expect: &[u8]) { @@ -271,11 +287,9 @@ impl AssertionStorage { pub fn batch_get_err(&self, keys: &[&[u8]], ts: impl Into) { let keys: Vec = keys.iter().map(|x| Key::from_raw(x)).collect(); - assert!( - self.store - .batch_get(self.ctx.clone(), &keys, ts.into()) - .is_err() - ); + self.store + .batch_get(self.ctx.clone(), &keys, ts.into()) + .unwrap_err(); } pub fn batch_get_command_ok(&self, keys: &[&[u8]], ts: u64, expect: Vec<&[u8]>) { @@ -293,11 +307,9 @@ impl AssertionStorage { } pub fn batch_get_command_err(&self, keys: &[&[u8]], ts: u64) { - assert!( - self.store - 
.batch_get_command(self.ctx.clone(), keys, ts) - .is_err() - ); + self.store + .batch_get_command(self.ctx.clone(), keys, ts) + .unwrap_err(); } fn expect_not_leader_or_stale_command(&self, err: storage::Error) { @@ -332,7 +344,6 @@ impl AssertionStorage { ) where T: std::fmt::Debug, { - assert!(resp.is_err()); let err = resp.unwrap_err(); match err { StorageError(box StorageErrorInner::Txn(TxnError( @@ -384,16 +395,14 @@ impl AssertionStorage { _commit_ts: impl Into, ) { let start_ts = start_ts.into(); - assert!( - self.store - .prewrite( - self.ctx.clone(), - vec![Mutation::make_put(Key::from_raw(key), value.to_vec())], - key.to_vec(), - start_ts, - ) - .is_err() - ); + self.store + .prewrite( + self.ctx.clone(), + vec![Mutation::make_put(Key::from_raw(key), value.to_vec())], + key.to_vec(), + start_ts, + ) + .unwrap_err(); } pub fn delete_ok( @@ -683,16 +692,14 @@ impl AssertionStorage { start_ts: impl Into, current_ts: impl Into, ) { - assert!( - self.store - .cleanup( - self.ctx.clone(), - Key::from_raw(key), - start_ts.into(), - current_ts.into() - ) - .is_err() - ); + self.store + .cleanup( + self.ctx.clone(), + Key::from_raw(key), + start_ts.into(), + current_ts.into(), + ) + .unwrap_err(); } pub fn rollback_ok(&self, keys: Vec<&[u8]>, start_ts: impl Into) { @@ -704,11 +711,9 @@ impl AssertionStorage { pub fn rollback_err(&self, keys: Vec<&[u8]>, start_ts: impl Into) { let keys: Vec = keys.iter().map(|x| Key::from_raw(x)).collect(); - assert!( - self.store - .rollback(self.ctx.clone(), keys, start_ts.into()) - .is_err() - ); + self.store + .rollback(self.ctx.clone(), keys, start_ts.into()) + .unwrap_err(); } pub fn scan_locks_ok( @@ -802,8 +807,10 @@ impl AssertionStorage { self.expect_invalid_tso_err(resp, start_ts, commit_ts.unwrap()) } - pub fn gc_ok(&self, safe_point: impl Into) { - self.store.gc(self.ctx.clone(), safe_point.into()).unwrap(); + pub fn gc_ok(&self, region: metapb::Region, safe_point: impl Into) { + self.store + .gc(region, 
self.ctx.clone(), safe_point.into()) + .unwrap(); } pub fn delete_range_ok(&self, start_key: &[u8], end_key: &[u8]) { @@ -890,11 +897,9 @@ impl AssertionStorage { } pub fn raw_batch_get_command_err(&self, cf: String, keys: Vec>) { - assert!( - self.store - .raw_batch_get_command(self.ctx.clone(), cf, keys) - .is_err() - ); + self.store + .raw_batch_get_command(self.ctx.clone(), cf, keys) + .unwrap_err(); } pub fn raw_put_ok(&self, cf: String, key: Vec, value: Vec) { @@ -1080,11 +1085,11 @@ impl AssertionStorage { .unwrap_err(); } - pub fn test_txn_store_gc(&self, key: &str) { + pub fn test_txn_store_gc(&self, key: &str, region: metapb::Region) { let key_bytes = key.as_bytes(); self.put_ok(key_bytes, b"v1", 5, 10); self.put_ok(key_bytes, b"v2", 15, 20); - self.gc_ok(30); + self.gc_ok(region, 30); self.get_none(key_bytes, 15); self.get_ok(key_bytes, 25, b"v2"); } @@ -1097,7 +1102,7 @@ impl AssertionStorage { } self.delete_ok(&key, 1000, 1050); self.get_none(&key, 2000); - self.gc_ok(2000); + self.gc_ok(metapb::Region::default(), 2000); self.get_none(&key, 3000); } } diff --git a/components/test_storage/src/sync_storage.rs b/components/test_storage/src/sync_storage.rs index af8a079a4de..3d6e1e139e5 100644 --- a/components/test_storage/src/sync_storage.rs +++ b/components/test_storage/src/sync_storage.rs @@ -8,17 +8,21 @@ use std::{ use api_version::{ApiV1, KvFormat}; use collections::HashMap; use futures::executor::block_on; -use kvproto::kvrpcpb::{ChecksumAlgorithm, Context, GetRequest, KeyRange, LockInfo, RawGetRequest}; -use raftstore::{coprocessor::RegionInfoProvider, router::RaftStoreBlackHole}; +use kvproto::{ + kvrpcpb::{ChecksumAlgorithm, Context, GetRequest, KeyRange, LockInfo, RawGetRequest}, + metapb, +}; +use raftstore::coprocessor::{region_info_accessor::MockRegionInfoProvider, RegionInfoProvider}; use tikv::{ server::gc_worker::{AutoGcConfig, GcConfig, GcSafePointProvider, GcWorker}, storage::{ - config::Config, kv::RocksEngine, 
lock_manager::DummyLockManager, test_util::GetConsumer, + config::Config, kv::RocksEngine, lock_manager::MockLockManager, test_util::GetConsumer, txn::commands, Engine, KvGetStatistics, PrewriteResult, Result, Storage, TestEngineBuilder, TestStorageBuilder, TxnStatus, }, }; use tikv_util::time::Instant; +use tracker::INVALID_TRACKER_TOKEN; use txn_types::{Key, KvPair, Mutation, TimeStamp, Value}; /// A builder to build a `SyncTestStorage`. @@ -77,16 +81,20 @@ impl SyncTestStorageBuilder { self } - pub fn build(mut self) -> Result> { + pub fn build(mut self, store_id: u64) -> Result> { let mut builder = TestStorageBuilder::<_, _, F>::from_engine_and_lock_mgr( self.engine.clone(), - DummyLockManager, + MockLockManager::new(), ); if let Some(config) = self.config.take() { builder = builder.config(config); } builder = builder.set_api_version(F::TAG); - SyncTestStorage::from_storage(builder.build()?, self.gc_config.unwrap_or_default()) + SyncTestStorage::from_storage( + store_id, + builder.build()?, + self.gc_config.unwrap_or_default(), + ) } } @@ -95,8 +103,8 @@ impl SyncTestStorageBuilder { /// Only used for test purpose. 
#[derive(Clone)] pub struct SyncTestStorage { - gc_worker: GcWorker, - store: Storage, + gc_worker: GcWorker, + store: Storage, } /// SyncTestStorage for Api V1 @@ -105,18 +113,19 @@ pub type SyncTestStorageApiV1 = SyncTestStorage; impl SyncTestStorage { pub fn from_storage( - storage: Storage, + store_id: u64, + storage: Storage, config: GcConfig, ) -> Result { let (tx, _rx) = std::sync::mpsc::channel(); let mut gc_worker = GcWorker::new( storage.get_engine(), - RaftStoreBlackHole, tx, config, Default::default(), + Arc::new(MockRegionInfoProvider::new(Vec::new())), ); - gc_worker.start()?; + gc_worker.start(store_id)?; Ok(Self { gc_worker, store: storage, @@ -132,7 +141,7 @@ impl SyncTestStorage { .unwrap(); } - pub fn get_storage(&self) -> Storage { + pub fn get_storage(&self) -> Storage { self.store.clone() } @@ -179,10 +188,11 @@ impl SyncTestStorage { req }) .collect(); + let trackers = keys.iter().map(|_| INVALID_TRACKER_TOKEN).collect(); let p = GetConsumer::new(); block_on( self.store - .batch_get_command(requests, ids, p.clone(), Instant::now()), + .batch_get_command(requests, ids, trackers, p.clone(), Instant::now()), )?; let mut values = vec![]; for value in p.take_data().into_iter() { @@ -332,8 +342,13 @@ impl SyncTestStorage { .unwrap() } - pub fn gc(&self, _: Context, safe_point: impl Into) -> Result<()> { - wait_op!(|cb| self.gc_worker.gc(safe_point.into(), cb)).unwrap() + pub fn gc( + &self, + region: metapb::Region, + _: Context, + safe_point: impl Into, + ) -> Result<()> { + wait_op!(|cb| self.gc_worker.gc(region, safe_point.into(), cb)).unwrap() } pub fn delete_range( diff --git a/components/test_storage/src/util.rs b/components/test_storage/src/util.rs index 62b46ffd082..032fe24c60c 100644 --- a/components/test_storage/src/util.rs +++ b/components/test_storage/src/util.rs @@ -36,7 +36,9 @@ pub fn new_raft_storage_with_store_count( let (cluster, engine, ctx) = new_raft_engine(count, key); ( cluster, - 
SyncTestStorageBuilder::from_engine(engine).build().unwrap(), + SyncTestStorageBuilder::from_engine(engine) + .build(ctx.peer.as_ref().unwrap().store_id) + .unwrap(), ctx, ) } diff --git a/components/test_util/Cargo.toml b/components/test_util/Cargo.toml index c5dc5dfd1d2..64dbb2456ce 100644 --- a/components/test_util/Cargo.toml +++ b/components/test_util/Cargo.toml @@ -12,16 +12,16 @@ cloud-azure = ["encryption_export/cloud-azure"] [dependencies] backtrace = "0.3" -collections = { path = "../collections" } -encryption_export = { path = "../encryption/export", default-features = false } +collections = { workspace = true } +encryption_export = { workspace = true } fail = "0.5" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +grpcio = { workspace = true } +kvproto = { workspace = true } rand = "0.8" rand_isaac = "0.3" -security = { path = "../security", default-features = false } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +security = { workspace = true } +slog = { workspace = true } +slog-global = { workspace = true } tempfile = "3.0" -tikv_util = { path = "../tikv_util", default-features = false } +tikv_util = { workspace = true } time = "0.1" diff --git a/components/test_util/src/encryption.rs b/components/test_util/src/encryption.rs index ba6ab56cc52..e09c0ce7cbb 100644 --- a/components/test_util/src/encryption.rs +++ b/components/test_util/src/encryption.rs @@ -1,6 +1,6 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{fs::File, io::Write, time::Duration}; +use std::{fs::File, io::Write, path::Path, time::Duration}; use encryption_export::{ create_backend, DataKeyManager, DataKeyManagerArgs, EncryptionConfig, FileConfig, @@ -15,15 +15,15 @@ pub fn create_test_key_file(path: &str) { .unwrap(); } -fn new_test_file_master_key(tmp: &tempfile::TempDir) -> MasterKeyConfig { - let key_path = tmp.path().join("test_key").to_str().unwrap().to_owned(); +fn new_test_file_master_key(tmp: &Path) -> MasterKeyConfig { + let key_path = tmp.join("test_key").to_str().unwrap().to_owned(); create_test_key_file(&key_path); MasterKeyConfig::File { config: FileConfig { path: key_path }, } } -pub fn new_file_security_config(dir: &tempfile::TempDir) -> EncryptionConfig { +pub fn new_file_security_config(dir: &Path) -> EncryptionConfig { let master_key_cfg = new_test_file_master_key(dir); EncryptionConfig { data_encryption_method: EncryptionMethod::Aes256Ctr, @@ -41,7 +41,7 @@ pub fn new_test_key_manager( master_key: Option, previous_master_key: Option, ) -> Result> { - let default_config = new_test_file_master_key(tmp_dir); + let default_config = new_test_file_master_key(tmp_dir.path()); let master_key = master_key.unwrap_or_else(|| default_config.clone()); let previous_master_key = previous_master_key.unwrap_or(default_config); DataKeyManager::new( @@ -52,7 +52,7 @@ pub fn new_test_key_manager( rotation_period: Duration::from_secs(60), enable_file_dictionary_log: true, file_dictionary_rewrite_threshold: 2, - dict_path: tmp_dir.path().as_os_str().to_str().unwrap().to_string(), + dict_path: tmp_dir.path().to_str().unwrap().to_string(), }, ) } diff --git a/components/test_util/src/lib.rs b/components/test_util/src/lib.rs index 9dca2ee2111..453ed7fb7f1 100644 --- a/components/test_util/src/lib.rs +++ b/components/test_util/src/lib.rs @@ -15,11 +15,13 @@ mod security; use std::{ env, + fmt::Debug, sync::atomic::{AtomicU16, Ordering}, thread, }; use rand::Rng; +use 
tikv_util::sys::thread::StdThreadBuildWrapper; pub use crate::{ encryption::*, @@ -31,12 +33,12 @@ pub use crate::{ }; pub fn setup_for_ci() { - // We use backtrace in tests to record suspicious problems. And loading backtrace - // the first time can take several seconds. Spawning a thread and load it ahead - // of time to avoid causing timeout. + // We use backtrace in tests to record suspicious problems. And loading + // backtrace the first time can take several seconds. Spawning a thread and + // load it ahead of time to avoid causing timeout. thread::Builder::new() .name(tikv_util::thd_name!("backtrace-loader")) - .spawn(::backtrace::Backtrace::new) + .spawn_wrapper(::backtrace::Backtrace::new) .unwrap(); if env::var("CI").is_ok() { @@ -117,3 +119,38 @@ pub fn temp_dir(prefix: impl Into>, prefer_mem: bool) -> te _ => builder.tempdir().unwrap(), } } + +/// Compare two structs and provide more helpful debug difference. +#[track_caller] +pub fn assert_eq_debug(lhs: &C, rhs: &C) { + if lhs == rhs { + return; + } + let lhs_str = format!("{:?}", lhs); + let rhs_str = format!("{:?}", rhs); + + fn find_index(l: impl Iterator) -> usize { + let it = l + .enumerate() + .take_while(|(_, (l, r))| l == r) + .filter(|(_, (l, _))| *l == b' '); + let mut last = None; + let mut second = None; + for a in it { + second = last; + last = Some(a); + } + second.map_or(0, |(i, _)| i) + } + let cpl = find_index(lhs_str.bytes().zip(rhs_str.bytes())); + let csl = find_index(lhs_str.bytes().rev().zip(rhs_str.bytes().rev())); + if cpl + csl > lhs_str.len() || cpl + csl > rhs_str.len() { + assert_eq!(lhs, rhs); + } + let lhs_diff = String::from_utf8_lossy(&lhs_str.as_bytes()[cpl..lhs_str.len() - csl]); + let rhs_diff = String::from_utf8_lossy(&rhs_str.as_bytes()[cpl..rhs_str.len() - csl]); + panic!( + "config not matched:\nlhs: ...{}...,\nrhs: ...{}...", + lhs_diff, rhs_diff + ); +} diff --git a/components/test_util/src/runner.rs b/components/test_util/src/runner.rs index 
e3d6cad5979..ee2b6548c23 100644 --- a/components/test_util/src/runner.rs +++ b/components/test_util/src/runner.rs @@ -57,15 +57,15 @@ pub fn run_test_with_hook(cases: &[&TestDescAndFn], hook: impl TestHook + Send + .iter() .map(|case| { let name = case.desc.name.as_slice().to_owned(); - let h = hook.clone(); + let hook = hook.clone(); let f = match case.testfn { TestFn::StaticTestFn(f) => TestFn::DynTestFn(Box::new(move || { - let _watcher = CaseLifeWatcher::new(name, h); - f(); + let _watcher = CaseLifeWatcher::new(name.clone(), hook.clone()); + f() })), - TestFn::StaticBenchFn(f) => TestFn::DynTestFn(Box::new(move || { - let _watcher = CaseLifeWatcher::new(name, h); - bench::run_once(move |b| f(b)); + TestFn::StaticBenchFn(f) => TestFn::DynBenchFn(Box::new(move |b| { + let _watcher = CaseLifeWatcher::new(name.clone(), hook.clone()); + f(b) })), ref f => panic!("unexpected testfn {:?}", f), }; @@ -99,9 +99,9 @@ impl TestHook for FailpointHook { } } -/// During panic, due to drop order, failpoints will not be cleared before tests exit. -/// If tests wait for a sleep failpoint, the whole tests will hang. So we need a method -/// to clear failpoints explicitly besides teardown. +/// During panic, due to drop order, failpoints will not be cleared before tests +/// exit. If tests wait for a sleep failpoint, the whole tests will hang. So we +/// need a method to clear failpoints explicitly besides teardown. 
pub fn clear_failpoints() { FS.with(|s| s.borrow_mut().take()); } diff --git a/components/tidb_query_aggr/Cargo.toml b/components/tidb_query_aggr/Cargo.toml index 71025327e9a..facc9d32f36 100644 --- a/components/tidb_query_aggr/Cargo.toml +++ b/components/tidb_query_aggr/Cargo.toml @@ -6,14 +6,14 @@ publish = false description = "Vector aggr functions of query engine to run TiDB pushed down executors" [dependencies] -match_template = { path = "../match_template" } -tidb_query_codegen = { path = "../tidb_query_codegen" } -tidb_query_common = { path = "../tidb_query_common", default-features = false } -tidb_query_datatype = { path = "../tidb_query_datatype", default-features = false } -tidb_query_expr = { path = "../tidb_query_expr", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } -tipb = { git = "https://github.com/pingcap/tipb.git" } +match-template = "0.0.1" +tidb_query_codegen = { workspace = true } +tidb_query_common = { workspace = true } +tidb_query_datatype = { workspace = true } +tidb_query_expr = { workspace = true } +tikv_util = { workspace = true } +tipb = { workspace = true } [dev-dependencies] -panic_hook = { path = "../panic_hook" } -tipb_helper = { path = "../tipb_helper", default-features = false } +panic_hook = { workspace = true } +tipb_helper = { workspace = true } diff --git a/components/tidb_query_aggr/src/impl_avg.rs b/components/tidb_query_aggr/src/impl_avg.rs index ec4784b24e4..6337c8de6c5 100644 --- a/components/tidb_query_aggr/src/impl_avg.rs +++ b/components/tidb_query_aggr/src/impl_avg.rs @@ -73,7 +73,8 @@ impl super::AggrDefinitionParser for AggrFnDefinitionParserAvg { /// The AVG aggregate function. /// -/// Note that there are `AVG(Decimal) -> (Int, Decimal)` and `AVG(Double) -> (Int, Double)`. +/// Note that there are `AVG(Decimal) -> (Int, Decimal)` and `AVG(Double) -> +/// (Int, Double)`. 
#[derive(Debug, AggrFunction)] #[aggr_function(state = AggrFnStateAvg::::new())] pub struct AggrFnAvg diff --git a/components/tidb_query_aggr/src/impl_count.rs b/components/tidb_query_aggr/src/impl_count.rs index 0e17f1adfb6..3d49d8b25af 100644 --- a/components/tidb_query_aggr/src/impl_count.rs +++ b/components/tidb_query_aggr/src/impl_count.rs @@ -111,9 +111,10 @@ impl AggrFnStateCount { } } -// Here we manually implement `AggrFunctionStateUpdatePartial` so that `update_repeat` and -// `update_vector` can be faster. Also note that we support all kind of -// `AggrFunctionStateUpdatePartial` for the COUNT aggregate function. +// Here we manually implement `AggrFunctionStateUpdatePartial` so that +// `update_repeat` and `update_vector` can be faster. Also note that we support +// all kind of `AggrFunctionStateUpdatePartial` for the COUNT aggregate +// function. impl super::AggrFunctionStateUpdatePartial for AggrFnStateCount where diff --git a/components/tidb_query_aggr/src/impl_first.rs b/components/tidb_query_aggr/src/impl_first.rs index f01546cc5ef..b7ccd077598 100644 --- a/components/tidb_query_aggr/src/impl_first.rs +++ b/components/tidb_query_aggr/src/impl_first.rs @@ -155,19 +155,22 @@ where } } -// Here we manually implement `AggrFunctionStateUpdatePartial` instead of implementing -// `ConcreteAggrFunctionState` so that `update_repeat` and `update_vector` can be faster. +// Here we manually implement `AggrFunctionStateUpdatePartial` instead of +// implementing `ConcreteAggrFunctionState` so that `update_repeat` and +// `update_vector` can be faster. impl super::AggrFunctionStateUpdatePartial for AggrFnStateFirst where T: EvaluableRef<'static> + 'static, VectorValue: VectorValueExt, { - // ChunkedType has been implemented in AggrFunctionStateUpdatePartial for AggrFnStateFirst + // ChunkedType has been implemented in AggrFunctionStateUpdatePartial for + // AggrFnStateFirst impl_state_update_partial! 
{ T } } -// In order to make `AggrFnStateFirst` satisfy the `AggrFunctionState` trait, we default impl all -// `AggrFunctionStateUpdatePartial` of `Evaluable` for all `AggrFnStateFirst`. +// In order to make `AggrFnStateFirst` satisfy the `AggrFunctionState` trait, we +// default impl all `AggrFunctionStateUpdatePartial` of `Evaluable` for all +// `AggrFnStateFirst`. impl_unmatched_function_state! { AggrFnStateFirst } impl super::AggrFunctionState for AggrFnStateFirst diff --git a/components/tidb_query_aggr/src/impl_max_min.rs b/components/tidb_query_aggr/src/impl_max_min.rs index 49eb4d911b8..c18710b3645 100644 --- a/components/tidb_query_aggr/src/impl_max_min.rs +++ b/components/tidb_query_aggr/src/impl_max_min.rs @@ -242,9 +242,9 @@ where /// # Notes /// - /// For MAX(), MySQL currently compares ENUM and SET columns by their string value rather - /// than by the string's relative position in the set. This differs from how ORDER BY - /// compares them. + /// For MAX(), MySQL currently compares ENUM and SET columns by their string + /// value rather than by the string's relative position in the set. This + /// differs from how ORDER BY compares them. /// /// ref: https://dev.mysql.com/doc/refman/5.7/en/aggregate-functions.html#function_max #[inline] @@ -331,9 +331,9 @@ where /// # Notes /// - /// For MAX(), MySQL currently compares ENUM and SET columns by their string value rather - /// than by the string's relative position in the set. This differs from how ORDER BY - /// compares them. + /// For MAX(), MySQL currently compares ENUM and SET columns by their string + /// value rather than by the string's relative position in the set. This + /// differs from how ORDER BY compares them. 
/// /// ref: https://dev.mysql.com/doc/refman/5.7/en/aggregate-functions.html#function_max #[inline] @@ -514,10 +514,10 @@ where self.extremum = value.copied() } } else { - let v1 = self.extremum.map(|x| x as i64); - let v2 = value.map(|x| *x as i64); + let v1: Option = self.extremum; + let v2: Option = value.copied(); if v1.cmp(&v2) == E::ORD { - self.extremum = value.copied() + self.extremum = v2; } } } @@ -937,7 +937,7 @@ mod tests { min_state.push_result(&mut ctx, &mut aggr_result).unwrap(); } - assert_eq!(aggr_result[0].to_int_vec(), &(*expected_res)); + assert_eq!(aggr_result[0].to_int_vec(), expected_res); } #[test] diff --git a/components/tidb_query_aggr/src/impl_sum.rs b/components/tidb_query_aggr/src/impl_sum.rs index 5b0e8334e86..85f31b8f459 100644 --- a/components/tidb_query_aggr/src/impl_sum.rs +++ b/components/tidb_query_aggr/src/impl_sum.rs @@ -52,7 +52,8 @@ impl super::parser::AggrDefinitionParser for AggrFnDefinitionParserSum { out_schema.push(out_ft); out_exp.push(exp); - // Choose a type-aware SUM implementation based on the eval type after rewriting exp. + // Choose a type-aware SUM implementation based on the eval type after rewriting + // exp. Ok(match rewritten_eval_type { EvalType::Decimal => Box::new(AggrFnSum::::new()), EvalType::Real => Box::new(AggrFnSum::::new()), @@ -190,8 +191,9 @@ where /// # Notes /// - /// Functions such as SUM() or AVG() that expect a numeric argument cast the argument to a - /// number if necessary. For ENUM values, the index number is used in the calculation. + /// Functions such as SUM() or AVG() that expect a numeric argument cast the + /// argument to a number if necessary. For ENUM values, the index number is + /// used in the calculation. /// /// ref: https://dev.mysql.com/doc/refman/8.0/en/enum.html #[inline] @@ -266,8 +268,9 @@ where /// # Notes /// - /// Functions such as SUM() or AVG() that expect a numeric argument cast the argument to a - /// number if necessary. 
For ENUM values, the index number is used in the calculation. + /// Functions such as SUM() or AVG() that expect a numeric argument cast the + /// argument to a number if necessary. For ENUM values, the index number is + /// used in the calculation. /// /// ref: https://dev.mysql.com/doc/refman/8.0/en/enum.html #[inline] diff --git a/components/tidb_query_aggr/src/impl_variance.rs b/components/tidb_query_aggr/src/impl_variance.rs index f5b7fcc3bc8..190446c3809 100644 --- a/components/tidb_query_aggr/src/impl_variance.rs +++ b/components/tidb_query_aggr/src/impl_variance.rs @@ -80,7 +80,8 @@ impl super::AggrDefinitionParser for AggrFnDefinitionParserVari let out_ft = root_expr.take_field_type(); let out_et = box_try!(EvalType::try_from(out_ft.as_accessor().tp())); - // Rewrite expression to insert CAST() if needed. The rewrite should always succeed. + // Rewrite expression to insert CAST() if needed. The rewrite should always + // succeed. super::util::rewrite_exp_for_sum_avg(src_schema, &mut exp).unwrap(); let rewritten_eval_type = @@ -103,7 +104,8 @@ impl super::AggrDefinitionParser for AggrFnDefinitionParserVari out_schema.push(out_ft); out_exp.push(exp); - // Choose a type-aware VARIANCE implementation based on the eval type after rewriting exp. + // Choose a type-aware VARIANCE implementation based on the eval type after + // rewriting exp. Ok(match rewritten_eval_type { EvalType::Decimal => Box::new(AggrFnVariance::::new()), EvalType::Real => Box::new(AggrFnVariance::::new()), @@ -117,7 +119,8 @@ impl super::AggrDefinitionParser for AggrFnDefinitionParserVari /// The VARIANCE aggregate function. /// -/// Note that there are `VARIANCE(Decimal) -> Decimal` and `VARIANCE(Double) -> Double`. +/// Note that there are `VARIANCE(Decimal) -> Decimal` and `VARIANCE(Double) -> +/// Double`. 
#[derive(Debug, AggrFunction)] #[aggr_function(state = AggrFnStateVariance::::new())] pub struct AggrFnVariance @@ -276,9 +279,9 @@ where /// # Notes /// - /// Functions such as SUM() or AVG() or VARIANCE() that expect a numeric argument cast the - /// argument to a number if necessary. For ENUM values, the index number is used in the - /// calculation. + /// Functions such as SUM() or AVG() or VARIANCE() that expect a numeric + /// argument cast the argument to a number if necessary. For ENUM values, + /// the index number is used in the calculation. /// /// ref: https://dev.mysql.com/doc/refman/8.0/en/enum.html #[inline] @@ -387,9 +390,9 @@ where /// # Notes /// - /// Functions such as SUM() or AVG() or VARIANCE() that expect a numeric argument cast the - /// argument to a number if necessary. For ENUM values, the index number is used in the - /// calculation. + /// Functions such as SUM() or AVG() or VARIANCE() that expect a numeric + /// argument cast the argument to a number if necessary. For ENUM values, + /// the index number is used in the calculation. /// /// ref: https://dev.mysql.com/doc/refman/8.0/en/enum.html #[inline] diff --git a/components/tidb_query_aggr/src/lib.rs b/components/tidb_query_aggr/src/lib.rs index 65b2da55d03..c6ddfb96d2f 100644 --- a/components/tidb_query_aggr/src/lib.rs +++ b/components/tidb_query_aggr/src/lib.rs @@ -30,16 +30,18 @@ pub use self::parser::{AggrDefinitionParser, AllAggrDefinitionParser}; /// A trait for all single parameter aggregate functions. /// -/// Unlike ordinary function, aggregate function calculates a summary value over multiple rows. To -/// save memory, this functionality is provided via an incremental update model: +/// Unlike ordinary function, aggregate function calculates a summary value over +/// multiple rows. To save memory, this functionality is provided via an +/// incremental update model: /// -/// 1. 
Each aggregate function associates a state structure, storing partially computed aggregate -/// results. +/// - Each aggregate function associates a state structure, storing partially +/// computed aggregate results. /// -/// 2. The caller calls `update()` or `update_vector()` for each row to update the state. +/// - The caller calls `update()` or `update_vector()` for each row to update +/// the state. /// -/// 3. The caller finally calls `push_result()` to aggregate a summary value and push it into the -/// given data container. +/// - The caller finally calls `push_result()` to aggregate a summary value and +/// push it into the given data container. /// /// This trait can be auto derived by using `tidb_query_codegen::AggrFunction`. pub trait AggrFunction: std::fmt::Debug + Send + 'static { @@ -52,13 +54,15 @@ pub trait AggrFunction: std::fmt::Debug + Send + 'static { /// A trait for all single parameter aggregate function states. /// -/// Aggregate function states are created by corresponding aggregate functions. For each state, -/// it can be updated or aggregated (to finalize a result) independently. +/// Aggregate function states are created by corresponding aggregate functions. +/// For each state, it can be updated or aggregated (to finalize a result) +/// independently. /// -/// Note that aggregate function states are strongly typed, that is, the caller must provide the -/// parameter in the correct data type for an aggregate function states that calculates over this -/// data type. To be safely boxed and placed in a vector, interfaces are provided in a form that -/// accept all kinds of data type. However, unmatched types will result in panics in runtime. +/// Note that aggregate function states are strongly typed, that is, the caller +/// must provide the parameter in the correct data type for an aggregate +/// function states that calculates over this data type. 
To be safely boxed and +/// placed in a vector, interfaces are provided in a form that accept all kinds +/// of data type. However, unmatched types will result in panics in runtime. pub trait AggrFunctionState: std::fmt::Debug + Send @@ -73,17 +77,19 @@ pub trait AggrFunctionState: + AggrFunctionStateUpdatePartial> + AggrFunctionStateUpdatePartial> { - // TODO: A better implementation is to specialize different push result targets. However - // current aggregation executor cannot utilize it. + // TODO: A better implementation is to specialize different push result targets. + // However current aggregation executor cannot utilize it. fn push_result(&self, ctx: &mut EvalContext, target: &mut [VectorValue]) -> Result<()>; } -/// A helper trait for single parameter aggregate function states that only work over concrete eval -/// types. This is the actual and only trait that normal aggregate function states will implement. +/// A helper trait for single parameter aggregate function states that only work +/// over concrete eval types. This is the actual and only trait that normal +/// aggregate function states will implement. /// -/// Unlike `AggrFunctionState`, this trait only provides specialized `update()` and `push_result()` -/// functions according to the associated type. `update()` and `push_result()` functions that accept -/// any eval types (but will panic when eval type does not match expectation) will be generated via +/// Unlike `AggrFunctionState`, this trait only provides specialized `update()` +/// and `push_result()` functions according to the associated type. `update()` +/// and `push_result()` functions that accept any eval types (but will panic +/// when eval type does not match expectation) will be generated via /// implementations over this trait. 
pub trait ConcreteAggrFunctionState: std::fmt::Debug + Send + 'static { type ParameterType: EvaluableRef<'static>; @@ -102,14 +108,14 @@ pub trait ConcreteAggrFunctionState: std::fmt::Debug + Send + 'static { #[macro_export] macro_rules! update_concrete { - ( $state:expr, $ctx:expr, $value:expr ) => { + ($state:expr, $ctx:expr, $value:expr) => { unsafe { $state.update_concrete_unsafe($ctx, $value.unsafe_into()) } }; } #[macro_export] macro_rules! update_vector { - ( $state:expr, $ctx:expr, $physical_values:expr, $logical_rows:expr ) => { + ($state:expr, $ctx:expr, $physical_values:expr, $logical_rows:expr) => { unsafe { $state.update_vector_unsafe( $ctx, @@ -123,21 +129,21 @@ macro_rules! update_vector { #[macro_export] macro_rules! update_repeat { - ( $state:expr, $ctx:expr, $value:expr, $repeat_times:expr ) => { + ($state:expr, $ctx:expr, $value:expr, $repeat_times:expr) => { unsafe { $state.update_repeat_unsafe($ctx, $value.unsafe_into(), $repeat_times) } }; } #[macro_export] macro_rules! update { - ( $state:expr, $ctx:expr, $value:expr ) => { + ($state:expr, $ctx:expr, $value:expr) => { unsafe { $state.update_unsafe($ctx, $value.unsafe_into()) } }; } #[macro_export] macro_rules! impl_state_update_partial { - ( $ty:tt ) => { + ($ty:tt) => { #[inline] unsafe fn update_unsafe( &mut self, @@ -172,7 +178,7 @@ macro_rules! impl_state_update_partial { #[macro_export] macro_rules! impl_concrete_state { - ( $ty:ty ) => { + ($ty:ty) => { #[inline] unsafe fn update_concrete_unsafe( &mut self, @@ -186,7 +192,7 @@ macro_rules! impl_concrete_state { #[macro_export] macro_rules! impl_unmatched_function_state { - ( $ty:ty ) => { + ($ty:ty) => { impl super::AggrFunctionStateUpdatePartial for $ty where T1: EvaluableRef<'static> + 'static, @@ -226,15 +232,15 @@ macro_rules! impl_unmatched_function_state { }; } -/// A helper trait that provides `update()` and `update_vector()` over a concrete type, which will -/// be relied in `AggrFunctionState`. 
+/// A helper trait that provides `update()` and `update_vector()` over a +/// concrete type, which will be relied in `AggrFunctionState`. pub trait AggrFunctionStateUpdatePartial> { /// Updates the internal state giving one row data. /// /// # Panics /// - /// Panics if the aggregate function does not support the supplied concrete data type as its - /// parameter. + /// Panics if the aggregate function does not support the supplied concrete + /// data type as its parameter. /// /// # Safety /// @@ -245,8 +251,8 @@ pub trait AggrFunctionStateUpdatePartial> { /// /// # Panics /// - /// Panics if the aggregate function does not support the supplied concrete data type as its - /// parameter. + /// Panics if the aggregate function does not support the supplied concrete + /// data type as its parameter. /// /// # Safety /// @@ -262,8 +268,8 @@ pub trait AggrFunctionStateUpdatePartial> { /// /// # Panics /// - /// Panics if the aggregate function does not support the supplied concrete data type as its - /// parameter. + /// Panics if the aggregate function does not support the supplied concrete + /// data type as its parameter. /// /// # Safety /// @@ -281,8 +287,9 @@ impl, State> AggrFunctionStateUpdatePartial for Stat where State: ConcreteAggrFunctionState, { - // All `ConcreteAggrFunctionState` implement `AggrFunctionStateUpdatePartial`, which is - // one of the trait bound that `AggrFunctionState` requires. + // All `ConcreteAggrFunctionState` implement + // `AggrFunctionStateUpdatePartial`, which is one of the trait bound that + // `AggrFunctionState` requires. #[inline] default unsafe fn update_unsafe( @@ -409,22 +416,18 @@ mod tests { let mut s = AggrFnStateFoo::new(); // Update using `Int` should success. 
- assert!( - update!( - &mut s as &mut dyn AggrFunctionStateUpdatePartial<_>, - &mut ctx, - Some(&1) - ) - .is_ok() - ); - assert!( - update!( - &mut s as &mut dyn AggrFunctionStateUpdatePartial<_>, - &mut ctx, - Some(&3) - ) - .is_ok() - ); + update!( + &mut s as &mut dyn AggrFunctionStateUpdatePartial<_>, + &mut ctx, + Some(&1) + ) + .unwrap(); + update!( + &mut s as &mut dyn AggrFunctionStateUpdatePartial<_>, + &mut ctx, + Some(&3) + ) + .unwrap(); // Update using other data type should panic. let result = panic_hook::recover_safe(|| { @@ -435,7 +438,7 @@ mod tests { Real::new(1.0).ok().as_ref() ); }); - assert!(result.is_err()); + result.unwrap_err(); let result = panic_hook::recover_safe(|| { let mut s = s.clone(); @@ -445,32 +448,26 @@ mod tests { Some(&[1u8] as BytesRef<'_>) ); }); - assert!(result.is_err()); + result.unwrap_err(); // Push result to Real VectorValue should success. let mut target = vec![VectorValue::with_capacity(0, EvalType::Real)]; - assert!( - (&mut s as &mut dyn AggrFunctionState) - .push_result(&mut ctx, &mut target) - .is_ok() - ); + (&mut s as &mut dyn AggrFunctionState) + .push_result(&mut ctx, &mut target) + .unwrap(); assert_eq!(target[0].to_real_vec(), &[Real::new(4.0).ok()]); // Calling push result multiple times should also success. 
- assert!( - update!( - &mut s as &mut dyn AggrFunctionStateUpdatePartial<_>, - &mut ctx, - Some(&1) - ) - .is_ok() - ); - assert!( - (&mut s as &mut dyn AggrFunctionState) - .push_result(&mut ctx, &mut target) - .is_ok() - ); + update!( + &mut s as &mut dyn AggrFunctionStateUpdatePartial<_>, + &mut ctx, + Some(&1) + ) + .unwrap(); + (&mut s as &mut dyn AggrFunctionState) + .push_result(&mut ctx, &mut target) + .unwrap(); assert_eq!( target[0].to_real_vec(), &[Real::new(4.0).ok(), Real::new(5.0).ok()] @@ -482,13 +479,13 @@ mod tests { let mut target: Vec = Vec::new(); let _ = (&mut s as &mut dyn AggrFunctionState).push_result(&mut ctx, &mut target[..]); }); - assert!(result.is_err()); + result.unwrap_err(); let result = panic_hook::recover_safe(|| { let mut s = s.clone(); let mut target: Vec = vec![VectorValue::with_capacity(0, EvalType::Int)]; let _ = (&mut s as &mut dyn AggrFunctionState).push_result(&mut ctx, &mut target[..]); }); - assert!(result.is_err()); + result.unwrap_err(); } } diff --git a/components/tidb_query_aggr/src/parser.rs b/components/tidb_query_aggr/src/parser.rs index 5cbc19961d8..600326edb2f 100644 --- a/components/tidb_query_aggr/src/parser.rs +++ b/components/tidb_query_aggr/src/parser.rs @@ -9,26 +9,29 @@ use crate::{impl_bit_op::*, impl_max_min::*, impl_variance::*, AggrFunction}; /// Parse a specific aggregate function definition from protobuf. /// -/// All aggregate function implementations should include an impl for this trait as well as -/// add a match arm in `map_pb_sig_to_aggr_func_parser` so that the aggregate function can be -/// actually utilized. +/// All aggregate function implementations should include an impl for this trait +/// as well as add a match arm in `map_pb_sig_to_aggr_func_parser` so that the +/// aggregate function can be actually utilized. pub trait AggrDefinitionParser { - /// Checks whether the inner expression of the aggregate function definition is supported. 
- /// It is ensured that `aggr_def.tp` maps the current parser instance. + /// Checks whether the inner expression of the aggregate function definition + /// is supported. It is ensured that `aggr_def.tp` maps the current + /// parser instance. fn check_supported(&self, aggr_def: &Expr) -> Result<()>; /// Parses and transforms the aggregate function definition. /// - /// The schema of this aggregate function will be appended in `out_schema` and the final - /// RPN expression (maybe wrapped by some casting according to types) will be appended in - /// `out_exp`. + /// The schema of this aggregate function will be appended in `out_schema` + /// and the final RPN expression (maybe wrapped by some casting + /// according to types) will be appended in `out_exp`. /// - /// The parser may choose particular aggregate function implementation based on the data - /// type, so `schema` is also needed in case of data type depending on the column. + /// The parser may choose particular aggregate function implementation based + /// on the data type, so `schema` is also needed in case of data type + /// depending on the column. /// /// # Panic /// - /// May panic if the aggregate function definition is not supported by this parser. + /// May panic if the aggregate function definition is not supported by this + /// parser. fn parse( &self, mut aggr_def: Expr, @@ -100,8 +103,8 @@ impl AggrDefinitionParser for AllAggrDefinitionParser { }) } - /// Parses and transforms the aggregate function definition to generate corresponding - /// `AggrFunction` instance. + /// Parses and transforms the aggregate function definition to generate + /// corresponding `AggrFunction` instance. 
/// /// # Panic /// diff --git a/components/tidb_query_aggr/src/util.rs b/components/tidb_query_aggr/src/util.rs index 0e9ae390cf1..c4ba7a05766 100644 --- a/components/tidb_query_aggr/src/util.rs +++ b/components/tidb_query_aggr/src/util.rs @@ -7,7 +7,8 @@ use tidb_query_datatype::{builder::FieldTypeBuilder, EvalType, FieldTypeAccessor use tidb_query_expr::{impl_cast::get_cast_fn_rpn_node, RpnExpression, RpnExpressionBuilder}; use tipb::{Expr, FieldType}; -/// Checks whether or not there is only one child and the child expression is supported. +/// Checks whether or not there is only one child and the child expression is +/// supported. pub fn check_aggr_exp_supported_one_child(aggr_def: &Expr) -> Result<()> { if aggr_def.get_children().len() != 1 { return Err(other_err!( @@ -23,7 +24,8 @@ pub fn check_aggr_exp_supported_one_child(aggr_def: &Expr) -> Result<()> { Ok(()) } -/// Rewrites the expression to insert necessary cast functions for SUM and AVG aggregate functions. +/// Rewrites the expression to insert necessary cast functions for SUM and AVG +/// aggregate functions. /// /// See `typeInfer4Sum` and `typeInfer4Avg` in TiDB. /// @@ -63,7 +65,8 @@ pub fn rewrite_exp_for_sum_avg(schema: &[FieldType], exp: &mut RpnExpression) -> Ok(()) } -/// Rewrites the expression to insert necessary cast functions for Bit operation family functions. +/// Rewrites the expression to insert necessary cast functions for Bit operation +/// family functions. pub fn rewrite_exp_for_bit_op(schema: &[FieldType], exp: &mut RpnExpression) -> Result<()> { let ret_field_type = exp.ret_field_type(schema); let ret_eval_type = box_try!(EvalType::try_from(ret_field_type.as_accessor().tp())); diff --git a/components/tidb_query_codegen/src/lib.rs b/components/tidb_query_codegen/src/lib.rs index baa9d8522ab..feee1c6afb3 100644 --- a/components/tidb_query_codegen/src/lib.rs +++ b/components/tidb_query_codegen/src/lib.rs @@ -8,8 +8,8 @@ //! //! 
This crate exports a custom derive for [`AggrFunction`](https://github.com/tikv/tikv/blob/master/components/tidb_query_aggr/src/mod.rs) //! and an attribute macro called `rpn_fn` for use on functions which provide -//! coprocessor functionality. `rpn_fn` is documented in the [rpn_function](rpn_function.rs) -//! module. +//! coprocessor functionality. `rpn_fn` is documented in the +//! [rpn_function](rpn_function.rs) module. #![feature(proc_macro_diagnostic)] #![feature(iter_order_by)] diff --git a/components/tidb_query_codegen/src/rpn_function.rs b/components/tidb_query_codegen/src/rpn_function.rs index 8025fc01588..dfdede3a3b3 100644 --- a/components/tidb_query_codegen/src/rpn_function.rs +++ b/components/tidb_query_codegen/src/rpn_function.rs @@ -16,13 +16,13 @@ //! ## Arguments to macro //! //! If neither `varg` or `raw_varg` are supplied, then the generated arguments -//! follow from the supplied function's arguments. Each argument must have a type -//! `Option<&T>` for some `T`. +//! follow from the supplied function's arguments. Each argument must have a +//! type `Option<&T>` for some `T`. //! //! ### `varg` //! -//! The RPN operator takes a variable number of arguments. The arguments are passed -//! as a `&[Option<&T>]`. E.g., +//! The RPN operator takes a variable number of arguments. The arguments are +//! passed as a `&[Option<&T>]`. E.g., //! //! ```ignore //! #[rpn_fn(varg)] @@ -33,8 +33,8 @@ //! //! ### `raw_varg` //! -//! The RPN operator takes a variable number of arguments. The arguments are passed -//! as a `&[ScalarValueRef]`. E.g., +//! The RPN operator takes a variable number of arguments. The arguments are +//! passed as a `&[ScalarValueRef]`. E.g., //! //! ```ignore //! #[rpn_fn(raw_varg)] @@ -43,8 +43,8 @@ //! } //! ``` //! -//! Use `raw_varg` where the function takes a variable number of arguments and the types -//! are not the same, for example, RPN function `case_when`. +//! 
Use `raw_varg` where the function takes a variable number of arguments and +//! the types are not the same, for example, RPN function `case_when`. //! //! ### `max_args` //! @@ -61,34 +61,40 @@ //! ### `extra_validator` //! //! A function name for custom validation code to be run when an operation is -//! validated. The validator function should have the signature `&tipb::Expr -> Result<()>`. -//! E.g., `#[rpn_fn(raw_varg, extra_validator = json_object_validator)]` +//! validated. The validator function should have the signature `&tipb::Expr -> +//! Result<()>`. E.g., `#[rpn_fn(raw_varg, extra_validator = +//! json_object_validator)]` //! //! ### `metadata_type` //! //! The type of the metadata structure defined in tipb. -//! If `metadata_mapper` is not specified, the protobuf metadata structure will be used as the metadata directly. +//! If `metadata_mapper` is not specified, the protobuf metadata structure will +//! be used as the metadata directly. //! //! ### `metadata_mapper` //! -//! A function name to construct a new metadata or transform a protobuf metadata structure into a desired form. -//! The function signatures varies according to the existence of `metadata_mapper` and `metadata_type` as follows. +//! A function name to construct a new metadata or transform a protobuf metadata +//! structure into a desired form. The function signature varies according to +//! the existence of `metadata_mapper` and `metadata_type` as follows. //! -//! - `metadata_mapper ` exists, `metadata_type` missing: `fn(&mut tipb::Expr) -> T` +//! - `metadata_mapper` exists, `metadata_type` missing: `fn(&mut tipb::Expr) +//! -> T` //! //! Constructs a new metadata in type `T`. //! -//! - `metadata_mapper ` exists, `metadata_type` exists: `fn(MetaDataType, &mut tipb::Expr) -> T` +//! - `metadata_mapper` exists, `metadata_type` exists: `fn(MetaDataType, &mut +//! tipb::Expr) -> T` //! -//! 
Transforms a protobuf metadata type `MetaDataType` specified by `metadata_type` into a new type `T`. +//! Transforms a protobuf metadata type `MetaDataType` specified by +//! `metadata_type` into a new type `T`. //! //! ### `capture` //! //! An array of argument names which are passed from the caller to the expanded -//! function. The argument names must be in scope in the generated `eval` or `run` -//! methods. Currently, that includes the following arguments (the supplied -//! function must accept these arguments with the corresponding types, in -//! addition to any other arguments): +//! function. The argument names must be in scope in the generated `eval` or +//! `run` methods. Currently, that includes the following arguments (the +//! supplied function must accept these arguments with the corresponding types, +//! in addition to any other arguments): //! //! * `ctx: &mut expr::EvalContext` //! * `output_rows: usize` @@ -111,35 +117,42 @@ //! This includes `varg` and `raw_varg`. //! //! The supplied function is preserved and a constructor function is generated -//! with a `_fn_meta` suffix, e.g., `#[rpn_fn] fn foo ...` will preserve `foo` and -//! generate `foo_fn_meta`. The constructor function returns an `rpn_expr::RpnFnMeta` -//! value. +//! with a `_fn_meta` suffix, e.g., `#[rpn_fn] fn foo ...` will preserve `foo` +//! and generate `foo_fn_meta`. The constructor function returns an +//! `rpn_expr::RpnFnMeta` value. //! -//! The constructor function will include code for validating the runtime arguments -//! and running the function, pointers to these functions are stored in the result. +//! The constructor function will include code for validating the runtime +//! arguments and running the function, pointers to these functions are stored +//! in the result. //! //! ### Non-vararg functions //! -//! Generate the following (examples assume a supplied function called `foo_bar`: +//! Generate the following (examples assume a supplied function called +//! 
`foo_bar`): //! -//! * A trait to represent the function (`FooBar_Fn`) with a single function `eval`. +//! * A trait to represent the function (`FooBar_Fn`) with a single function +//! `eval`. //! - An impl of that trait for all argument types which panics -//! - An impl of that trait for the supported argument type which calls the supplied function. -//! * An evaluator struct (`FooBar_Evaluator`) which implements `rpn_expr::function::Evaluator`, -//! which includes an `eval` method which dispatches to `FooBar_Fn::eval`. +//! - An impl of that trait for the supported argument type which calls the +//! supplied function. +//! * An evaluator struct (`FooBar_Evaluator`) which implements +//! `rpn_expr::function::Evaluator`, which includes an `eval` method which +//! dispatches to `FooBar_Fn::eval`. //! * A constructor function similar to the vararg case. //! //! The supplied function is preserved. //! -//! The supported argument type is represented as a type-level list, for example, a -//! a function which takes two unsigned ints has an argument representation -//! something like `Arg>`. See documentation in -//! `components/tidb_query_expr/src/types/function.rs` for more details. +//! The supported argument type is represented as a type-level list, for +//! example, a function which takes two unsigned ints has an argument +//! representation something like `Arg>`. See +//! documentation in `components/tidb_query_expr/src/types/function.rs` for more +//! details. //! -//! The `_Fn` trait can be customised by implementing it manually. -//! For example, you are going to implement an RPN function called `regex_match` taking two -//! arguments, the regex and the string to match. You want to build the regex only once if the -//! first argument is a scalar. The code may look like: +//! The `_Fn` trait can be customized by implementing it manually. +//! For example, you are going to implement an RPN function called `regex_match` +//! 
taking two arguments, the regex and the string to match. You want to build +//! the regex only once if the first argument is a scalar. The code may look +//! like: //! //! ```ignore //! fn regex_match_impl(regex: &Regex, text: Option<&Bytes>) -> Result> { @@ -175,8 +188,9 @@ //! } //! ``` //! -//! If the RPN function accepts variable number of arguments and all arguments have the same eval -//! type, like RPN function `coalesce`, you can use `#[rpn_fn(varg)]` like: +//! If the RPN function accepts variable number of arguments and all arguments +//! have the same eval type, like RPN function `coalesce`, you can use +//! `#[rpn_fn(varg)]` like: //! //! ```ignore //! #[rpn_fn(varg)] @@ -220,10 +234,12 @@ mod kw { /// Parses an attribute like `#[rpn_fn(varg, capture = [ctx, output_rows])`. #[derive(Debug)] struct RpnFnAttr { - /// Whether or not the function is a varg function. Varg function accepts `&[&Option]`. + /// Whether or not the function is a varg function. Varg function accepts + /// `&[&Option]`. is_varg: bool, - /// Whether or not the function is a raw varg function. Raw varg function accepts `&[ScalarValueRef]`. + /// Whether or not the function is a raw varg function. Raw varg function + /// accepts `&[ScalarValueRef]`. is_raw_varg: bool, /// Whether or not the function needs extra logic on `None` value. @@ -234,8 +250,9 @@ struct RpnFnAttr { /// The maximum accepted arguments, which will be checked by the validator. /// - /// Only varg or raw_varg function accepts a range of number of arguments. Other kind of - /// function strictly stipulates number of arguments according to the function definition. + /// Only varg or raw_varg function accepts a range of number of arguments. + /// Other kind of function strictly stipulates number of arguments + /// according to the function definition. max_args: Option, /// The minimal accepted arguments, which will be checked by the validator. 
@@ -368,7 +385,7 @@ impl parse::Parse for RpnFnAttr { )); } - if !is_varg && !is_raw_varg && (min_args != None || max_args != None) { + if !is_varg && !is_raw_varg && (min_args.is_some() || max_args.is_some()) { return Err(Error::new_spanned( config_items, "`min_args` or `max_args` is only available when `varg` or `raw_varg` presents", @@ -411,7 +428,8 @@ impl parse::Parse for RpnFnAttr { } } -/// Parses an evaluable type like `Option<&T>`, `Option`, `Option`, `Option` or `Option`. +/// Parses an evaluable type like `Option<&T>`, `Option`, +/// `Option`, `Option` or `Option`. struct RpnFnRefEvaluableTypeWithOption(RpnFnRefEvaluableType); impl parse::Parse for RpnFnRefEvaluableTypeWithOption { @@ -504,8 +522,8 @@ impl parse::Parse for RpnFnRefEvaluableType { } /// Parses a function signature parameter like `val: &Option` or `val: &T`. -/// If input has &Option, set has_option to true; otherwise, set has_option to false. -/// Caller can use has_option to check if input is valid. +/// If input has &Option, set has_option to true; otherwise, set has_option +/// to false. Caller can use has_option to check if input is valid. struct RpnFnSignatureParam { _pat: Pat, has_option: bool, @@ -531,9 +549,9 @@ impl parse::Parse for RpnFnSignatureParam { } } -/// Parses a function signature parameter like `val: &[&Option]` or `val: &[&T]`. -/// If input has &Option, set has_option to true; otherwise, set has_option to false. -/// Caller can use has_option to check if input is valid. +/// Parses a function signature parameter like `val: &[&Option]` or `val: +/// &[&T]`. If input has &Option, set has_option to true; otherwise, set +/// has_option to false. Caller can use has_option to check if input is valid. 
struct VargsRpnFnSignatureParam { _pat: Pat, has_option: bool, diff --git a/components/tidb_query_common/Cargo.toml b/components/tidb_query_common/Cargo.toml index 2f42c226327..f192b22a5f6 100644 --- a/components/tidb_query_common/Cargo.toml +++ b/components/tidb_query_common/Cargo.toml @@ -7,17 +7,21 @@ description = "Common utility of a query engine to run TiDB pushed down executor [dependencies] anyhow = "1.0" +api_version = { workspace = true } +async-trait = "0.1" derive_more = "0.99.3" -error_code = { path = "../error_code", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +error_code = { workspace = true } +futures = "0.3" +kvproto = { workspace = true } lazy_static = "1.3" -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" serde_json = "1.0" thiserror = "1.0" -tikv_util = { path = "../tikv_util", default-features = false } +tikv_util = { workspace = true } time = "0.1" +yatp = { workspace = true } [dev-dependencies] byteorder = "1.2" diff --git a/components/tidb_query_common/src/error.rs b/components/tidb_query_common/src/error.rs index 8697413f69c..046e2f02059 100644 --- a/components/tidb_query_common/src/error.rs +++ b/components/tidb_query_common/src/error.rs @@ -90,8 +90,9 @@ impl ErrorCodeExt for EvaluateError { #[error(transparent)] pub struct StorageError(#[from] pub anyhow::Error); -/// We want to restrict the type of errors to be either a `StorageError` or `EvaluateError`, thus -/// `failure::Error` is not used. Instead, we introduce our own error enum. +/// We want to restrict the type of errors to be either a `StorageError` or +/// `EvaluateError`, thus `failure::Error` is not used. Instead, we introduce +/// our own error enum. 
#[derive(Debug, Error)] pub enum ErrorInner { #[error("Storage error: {0}")] diff --git a/components/tidb_query_common/src/execute_stats.rs b/components/tidb_query_common/src/execute_stats.rs index 2318ad43e16..122363eed98 100644 --- a/components/tidb_query_common/src/execute_stats.rs +++ b/components/tidb_query_common/src/execute_stats.rs @@ -4,7 +4,7 @@ use derive_more::{Add, AddAssign}; /// Execution summaries to support `EXPLAIN ANALYZE` statements. We don't use /// `ExecutorExecutionSummary` directly since it is less efficient. -#[derive(Debug, Default, Copy, Clone, Add, AddAssign, PartialEq, Eq)] +#[derive(Debug, Default, Copy, Clone, Add, AddAssign, PartialEq)] pub struct ExecSummary { /// Total time cost in this executor. pub time_processed_ns: usize, @@ -18,7 +18,7 @@ pub struct ExecSummary { /// A trait for all execution summary collectors. pub trait ExecSummaryCollector: Send { - type DurationRecorder; + type DurationRecorder: Send; /// Creates a new instance with specified output slot index. fn new(output_index: usize) -> Self @@ -76,7 +76,8 @@ impl ExecSummaryCollector for ExecSummaryCollectorEnabled { } } -/// A `ExecSummaryCollector` that does not collect anything. Acts like `collect = false`. +/// A `ExecSummaryCollector` that does not collect anything. Acts like `collect +/// = false`. pub struct ExecSummaryCollectorDisabled; impl ExecSummaryCollector for ExecSummaryCollectorDisabled { @@ -105,11 +106,11 @@ pub struct WithSummaryCollector { pub inner: T, } -/// Execution statistics to be flowed between parent and child executors at once during -/// `collect_exec_stats()` invocation. +/// Execution statistics to be flowed between parent and child executors at once +/// during `collect_exec_stats()` invocation. pub struct ExecuteStats { - /// The execution summary of each executor. If execution summary is not needed, it will - /// be zero sized. + /// The execution summary of each executor. 
If execution summary is not + /// needed, it will be zero sized. pub summary_per_executor: Vec, /// For each range given in the request, how many rows are scanned. @@ -119,8 +120,8 @@ pub struct ExecuteStats { impl ExecuteStats { /// Creates a new statistics instance. /// - /// If execution summary does not need to be collected, it is safe to pass 0 to the `executors` - /// argument, which will avoid one allocation. + /// If execution summary does not need to be collected, it is safe to pass 0 + /// to the `executors` argument, which will avoid one allocation. pub fn new(executors_len: usize) -> Self { Self { summary_per_executor: vec![ExecSummary::default(); executors_len], diff --git a/components/tidb_query_common/src/storage/mod.rs b/components/tidb_query_common/src/storage/mod.rs index 818b863d0a4..f8d9f37723d 100644 --- a/components/tidb_query_common/src/storage/mod.rs +++ b/components/tidb_query_common/src/storage/mod.rs @@ -11,8 +11,8 @@ pub type Result = std::result::Result; pub type OwnedKvPair = (Vec, Vec); -/// The abstract storage interface. The table scan and index scan executor relies on a `Storage` -/// implementation to provide source data. +/// The abstract storage interface. The table scan and index scan executor +/// relies on a `Storage` implementation to provide source data. pub trait Storage: Send { type Statistics; diff --git a/components/tidb_query_common/src/storage/range.rs b/components/tidb_query_common/src/storage/range.rs index b4075fb3b60..b826f55fe46 100644 --- a/components/tidb_query_common/src/storage/range.rs +++ b/components/tidb_query_common/src/storage/range.rs @@ -4,7 +4,7 @@ use kvproto::coprocessor::KeyRange; // TODO: Remove this module after switching to DAG v2. 
-#[derive(PartialEq, Eq, Clone)] +#[derive(PartialEq, Clone)] pub enum Range { Point(PointRange), Interval(IntervalRange), @@ -41,7 +41,7 @@ impl From for Range { } } -#[derive(Default, PartialEq, Eq, Clone)] +#[derive(Default, PartialEq, Clone)] pub struct IntervalRange { pub lower_inclusive: Vec, pub upper_exclusive: Vec, @@ -87,7 +87,7 @@ impl<'a, 'b> From<(&'a str, &'b str)> for IntervalRange { } } -#[derive(Default, PartialEq, Eq, Clone)] +#[derive(Default, PartialEq, Clone)] pub struct PointRange(pub Vec); impl std::fmt::Debug for PointRange { diff --git a/components/tidb_query_common/src/storage/ranges_iter.rs b/components/tidb_query_common/src/storage/ranges_iter.rs index 88d103a763f..b872d8c5bc5 100644 --- a/components/tidb_query_common/src/storage/ranges_iter.rs +++ b/components/tidb_query_common/src/storage/ranges_iter.rs @@ -2,17 +2,17 @@ use super::range::Range; -#[derive(PartialEq, Eq, Clone, Debug)] +#[derive(PartialEq, Clone, Debug)] pub enum IterStatus { /// All ranges are consumed. Drained, - /// Last range is drained or this iteration is a fresh start so that caller should scan - /// on a new range. + /// Last range is drained or this iteration is a fresh start so that caller + /// should scan on a new range. NewRange(Range), - /// Last interval range is not drained and the caller should continue scanning without changing - /// the scan range. + /// Last interval range is not drained and the caller should continue + /// scanning without changing the scan range. Continue, } @@ -23,13 +23,14 @@ pub enum IterStatus { /// - a flag indicating continuing last interval range /// - a flag indicating that all ranges are consumed /// -/// If a new range is returned, caller can then scan unknown amount of key(s) within this new range. -/// The caller must inform the structure so that it will emit a new range next time by calling -/// `notify_drained()` after current range is drained. Multiple `notify_drained()` without `next()` -/// will have no effect. 
+/// If a new range is returned, caller can then scan unknown amount of key(s) +/// within this new range. The caller must inform the structure so that it will +/// emit a new range next time by calling `notify_drained()` after current range +/// is drained. Multiple `notify_drained()` without `next()` will have no +/// effect. pub struct RangesIterator { - /// Whether or not we are processing a valid range. If we are not processing a range, or there - /// is no range any more, this field is `false`. + /// Whether or not we are processing a valid range. If we are not processing + /// a range, or there is no range any more, this field is `false`. in_range: bool, iter: std::vec::IntoIter, @@ -64,6 +65,12 @@ impl RangesIterator { pub fn notify_drained(&mut self) { self.in_range = false; } + + /// Check drained. + #[inline] + pub fn is_drained(&mut self) -> bool { + self.iter.len() == 0 + } } #[cfg(test)] diff --git a/components/tidb_query_common/src/storage/scanner.rs b/components/tidb_query_common/src/storage/scanner.rs index 6e72ba13fca..d0d2345a09e 100644 --- a/components/tidb_query_common/src/storage/scanner.rs +++ b/components/tidb_query_common/src/storage/scanner.rs @@ -1,13 +1,24 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. +use std::{marker::PhantomData, time::Duration}; + +use api_version::KvFormat; +use tikv_util::time::Instant; +use yatp::task::future::reschedule; + use super::{range::*, ranges_iter::*, OwnedKvPair, Storage}; use crate::error::StorageError; const KEY_BUFFER_CAPACITY: usize = 64; - -/// A scanner that scans over multiple ranges. Each range can be a point range containing only -/// one row, or an interval range containing multiple rows. -pub struct RangesScanner { +/// Batch executors are run in coroutines. `MAX_TIME_SLICE` is the maximum time +/// a coroutine can run without being yielded. +const MAX_TIME_SLICE: Duration = Duration::from_millis(1); +/// the number of scanned keys that should trigger a reschedule. 
+const CHECK_KEYS: usize = 32; + +/// A scanner that scans over multiple ranges. Each range can be a point range +/// containing only one row, or an interval range containing multiple rows. +pub struct RangesScanner { storage: T, ranges_iter: RangesIterator, @@ -23,6 +34,37 @@ pub struct RangesScanner { current_range: IntervalRange, working_range_begin_key: Vec, working_range_end_key: Vec, + rescheduler: RescheduleChecker, + + _phantom: PhantomData, +} + +// TODO: maybe it's better to make it generic to avoid directly depending +// on yatp's rescheduler. +struct RescheduleChecker { + prev_start: Instant, + prev_key_count: usize, +} + +impl RescheduleChecker { + fn new() -> Self { + Self { + prev_start: Instant::now(), + prev_key_count: 0, + } + } + + #[inline(always)] + async fn check_reschedule(&mut self, force_check: bool) { + self.prev_key_count += 1; + if (force_check || self.prev_key_count % CHECK_KEYS == 0) + && self.prev_start.saturating_elapsed() > MAX_TIME_SLICE + { + reschedule().await; + self.prev_start = Instant::now(); + self.prev_key_count = 0; + } + } } pub struct RangesScannerOptions { @@ -33,7 +75,7 @@ pub struct RangesScannerOptions { pub is_scanned_range_aware: bool, // TODO: This can be const generics } -impl RangesScanner { +impl RangesScanner { pub fn new( RangesScannerOptions { storage, @@ -42,7 +84,7 @@ impl RangesScanner { is_key_only, is_scanned_range_aware, }: RangesScannerOptions, - ) -> RangesScanner { + ) -> RangesScanner { let ranges_len = ranges.len(); let ranges_iter = RangesIterator::new(ranges); RangesScanner { @@ -58,14 +100,27 @@ impl RangesScanner { }, working_range_begin_key: Vec::with_capacity(KEY_BUFFER_CAPACITY), working_range_end_key: Vec::with_capacity(KEY_BUFFER_CAPACITY), + rescheduler: RescheduleChecker::new(), + _phantom: PhantomData, } } /// Fetches next row. // Note: This is not implemented over `Iterator` since it can fail. // TODO: Change to use reference to avoid allocation and copy. 
- pub fn next(&mut self) -> Result, StorageError> { + pub async fn next(&mut self) -> Result, StorageError> { + self.next_opt(true).await + } + + /// Fetches next row. + /// Note: `update_scanned_range` controls whether the scanned range is + /// updated when `is_scanned_range_aware` is true. + pub async fn next_opt( + &mut self, + update_scanned_range: bool, + ) -> Result, StorageError> { loop { + let mut force_check = true; let range = self.ranges_iter.next(); let some_row = match range { IterStatus::NewRange(Range::Point(r)) => { @@ -85,7 +140,10 @@ impl RangesScanner { .begin_scan(self.scan_backward_in_range, self.is_key_only, r)?; self.storage.scan_next()? } - IterStatus::Continue => self.storage.scan_next()?, + IterStatus::Continue => { + force_check = false; + self.storage.scan_next()? + } IterStatus::Drained => { if self.is_scanned_range_aware { self.update_working_range_end_key(); @@ -93,16 +151,17 @@ impl RangesScanner { return Ok(None); // drained } }; - if self.is_scanned_range_aware { + if self.is_scanned_range_aware && update_scanned_range { self.update_scanned_range_from_scanned_row(&some_row); } - if some_row.is_some() { + if let Some(row) = some_row { // Retrieved one row from point range or interval range. if let Some(r) = self.scanned_rows_per_range.last_mut() { *r += 1; } - - return Ok(some_row); + self.rescheduler.check_reschedule(force_check).await; + let kv = F::make_kv_pair(row).map_err(|e| StorageError(anyhow::Error::from(e)))?; + return Ok(Some(kv)); } else { // No more row in the range. self.ranges_iter.notify_drained(); @@ -110,14 +169,14 @@ impl RangesScanner { } } - /// Appends storage statistics collected so far to the given container and clears the - /// collected statistics. + /// Appends storage statistics collected so far to the given container and + /// clears the collected statistics. 
pub fn collect_storage_stats(&mut self, dest: &mut T::Statistics) { self.storage.collect_statistics(dest) } - /// Appends scanned rows of each range so far to the given container and clears the - /// collected statistics. + /// Appends scanned rows of each range so far to the given container and + /// clears the collected statistics. pub fn collect_scanned_rows_per_range(&mut self, dest: &mut Vec) { dest.append(&mut self.scanned_rows_per_range); self.scanned_rows_per_range.push(0); @@ -159,31 +218,35 @@ impl RangesScanner { fn update_scanned_range_from_new_point(&mut self, point: &PointRange) { assert!(self.is_scanned_range_aware); - self.update_working_range_end_key(); - self.current_range.lower_inclusive.clear(); - self.current_range.upper_exclusive.clear(); - self.current_range - .lower_inclusive - .extend_from_slice(&point.0); - self.current_range - .upper_exclusive - .extend_from_slice(&point.0); - self.current_range.upper_exclusive.push(0); + // Only update current_range for the first and the last range. + if self.current_range.lower_inclusive.is_empty() || self.ranges_iter.is_drained() { + self.current_range.lower_inclusive.clear(); + self.current_range.upper_exclusive.clear(); + self.current_range + .lower_inclusive + .extend_from_slice(&point.0); + self.current_range + .upper_exclusive + .extend_from_slice(&point.0); + self.current_range.upper_exclusive.push(0); + } self.update_working_range_begin_key(); } fn update_scanned_range_from_new_range(&mut self, range: &IntervalRange) { assert!(self.is_scanned_range_aware); - self.update_working_range_end_key(); - self.current_range.lower_inclusive.clear(); - self.current_range.upper_exclusive.clear(); - self.current_range - .lower_inclusive - .extend_from_slice(&range.lower_inclusive); - self.current_range - .upper_exclusive - .extend_from_slice(&range.upper_exclusive); + // Only update current_range for the first and the last range. 
+ if self.current_range.lower_inclusive.is_empty() || self.ranges_iter.is_drained() { + self.current_range.lower_inclusive.clear(); + self.current_range.upper_exclusive.clear(); + self.current_range + .lower_inclusive + .extend_from_slice(&range.lower_inclusive); + self.current_range + .upper_exclusive + .extend_from_slice(&range.upper_exclusive); + } self.update_working_range_begin_key(); } @@ -229,6 +292,9 @@ impl RangesScanner { #[cfg(test)] mod tests { + use api_version::{keyspace::KvPair, ApiV1}; + use futures::executor::block_on; + use super::*; use crate::storage::{test_fixture::FixtureStorage, IntervalRange, PointRange, Range}; @@ -254,7 +320,7 @@ mod tests { PointRange::from("foo_3").into(), IntervalRange::from(("a", "c")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -262,26 +328,26 @@ mod tests { is_scanned_range_aware: false, }); assert_eq!( - scanner.next().unwrap(), - Some((b"foo".to_vec(), b"1".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo".to_vec(), b"1".to_vec()) ); assert_eq!( - scanner.next().unwrap(), - Some((b"foo_2".to_vec(), b"3".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_2".to_vec(), b"3".to_vec()) ); assert_eq!( - scanner.next().unwrap(), - Some((b"foo_3".to_vec(), b"5".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_3".to_vec(), b"5".to_vec()) ); assert_eq!( - scanner.next().unwrap(), - Some((b"bar".to_vec(), b"2".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar".to_vec(), b"2".to_vec()) ); assert_eq!( - scanner.next().unwrap(), - Some((b"bar_2".to_vec(), b"4".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar_2".to_vec(), b"4".to_vec()) ); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); // Backward in range let ranges: Vec = vec![ 
@@ -290,7 +356,7 @@ mod tests { PointRange::from("foo_3").into(), IntervalRange::from(("a", "bar_2")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -298,22 +364,22 @@ mod tests { is_scanned_range_aware: false, }); assert_eq!( - scanner.next().unwrap(), - Some((b"foo_2".to_vec(), b"3".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_2".to_vec(), b"3".to_vec()) ); assert_eq!( - scanner.next().unwrap(), - Some((b"foo".to_vec(), b"1".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo".to_vec(), b"1".to_vec()) ); assert_eq!( - scanner.next().unwrap(), - Some((b"foo_3".to_vec(), b"5".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_3".to_vec(), b"5".to_vec()) ); assert_eq!( - scanner.next().unwrap(), - Some((b"bar".to_vec(), b"2".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar".to_vec(), b"2".to_vec()) ); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); // Key only let ranges: Vec = vec![ @@ -321,28 +387,34 @@ mod tests { PointRange::from("foo_3").into(), PointRange::from("bar_3").into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: false, is_key_only: true, is_scanned_range_aware: false, }); - assert_eq!(scanner.next().unwrap(), Some((b"bar".to_vec(), Vec::new()))); assert_eq!( - scanner.next().unwrap(), - Some((b"bar_2".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar".to_vec(), Vec::new()) + ); + assert_eq!( + block_on(scanner.next()).unwrap().unwrap(), + (b"bar_2".to_vec(), Vec::new()) + ); + assert_eq!( + block_on(scanner.next()).unwrap().unwrap(), + (b"foo".to_vec(), Vec::new()) ); - assert_eq!(scanner.next().unwrap(), 
Some((b"foo".to_vec(), Vec::new()))); assert_eq!( - scanner.next().unwrap(), - Some((b"foo_2".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_2".to_vec(), Vec::new()) ); assert_eq!( - scanner.next().unwrap(), - Some((b"foo_3".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_3".to_vec(), Vec::new()) ); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); } #[test] @@ -355,7 +427,7 @@ mod tests { PointRange::from("foo_3").into(), IntervalRange::from(("a", "z")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: false, @@ -364,9 +436,9 @@ mod tests { }); let mut scanned_rows_per_range = Vec::new(); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_2"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_3"); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![2, 0, 1]); @@ -376,28 +448,28 @@ mod tests { assert_eq!(scanned_rows_per_range, vec![0]); scanned_rows_per_range.clear(); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"bar"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"bar_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar_2"); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![0, 2]); scanned_rows_per_range.clear(); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo"); + 
assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![1]); scanned_rows_per_range.clear(); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_2"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_3"); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_3"); + assert_eq!(block_on(scanner.next()).unwrap(), None); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![2]); scanned_rows_per_range.clear(); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![0]); @@ -410,7 +482,7 @@ mod tests { // No range let ranges = vec![]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -422,7 +494,7 @@ mod tests { assert_eq!(&r.lower_inclusive, b""); assert_eq!(&r.upper_exclusive, b""); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b""); @@ -430,7 +502,7 @@ mod tests { // Empty interval range let ranges = vec![IntervalRange::from(("x", "xb")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -438,7 +510,7 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = 
scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"x"); @@ -446,7 +518,7 @@ mod tests { // Empty point range let ranges = vec![PointRange::from("x").into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -454,7 +526,7 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"x"); @@ -462,7 +534,7 @@ mod tests { // Filled interval range let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -470,28 +542,28 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); assert_eq!(&r.upper_exclusive, b"foo_2\0"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_3"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_2\0"); assert_eq!(&r.upper_exclusive, b"foo_3\0"); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_3\0"); assert_eq!(&r.upper_exclusive, b"foo_8"); // Multiple ranges - // TODO: caller should not pass in unordered ranges otherwise scanned ranges would be - // 
unsound. + // TODO: caller should not pass in unordered ranges otherwise scanned ranges + // would be unsound. let ranges = vec![ IntervalRange::from(("foo", "foo_3")).into(), IntervalRange::from(("foo_5", "foo_50")).into(), @@ -500,7 +572,7 @@ mod tests { PointRange::from("bar_3").into(), IntervalRange::from(("bar_4", "box")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: false, @@ -508,31 +580,31 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); assert_eq!(&r.upper_exclusive, b"foo\0"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo\0"); assert_eq!(&r.upper_exclusive, b"foo_2\0"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"bar"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_2\0"); assert_eq!(&r.upper_exclusive, b"bar\0"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"bar_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar\0"); assert_eq!(&r.upper_exclusive, b"bar_2\0"); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar_2\0"); @@ -545,7 +617,7 @@ mod tests { // No range let ranges = vec![]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, 
ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -557,7 +629,7 @@ mod tests { assert_eq!(&r.lower_inclusive, b""); assert_eq!(&r.upper_exclusive, b""); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b""); @@ -565,7 +637,7 @@ mod tests { // Empty interval range let ranges = vec![IntervalRange::from(("x", "xb")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -573,7 +645,7 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"x"); @@ -581,7 +653,7 @@ mod tests { // Empty point range let ranges = vec![PointRange::from("x").into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -589,7 +661,7 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"x"); @@ -597,7 +669,7 @@ mod tests { // Filled interval range let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -605,20 +677,20 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_3"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_2"); + 
assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_2"); assert_eq!(&r.upper_exclusive, b"foo_8"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); assert_eq!(&r.upper_exclusive, b"foo_2"); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); @@ -633,7 +705,7 @@ mod tests { IntervalRange::from(("foo_5", "foo_50")).into(), IntervalRange::from(("foo", "foo_3")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: true, @@ -641,29 +713,241 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"bar_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar_2"); assert_eq!(&r.upper_exclusive, b"box"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"bar"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar"); assert_eq!(&r.upper_exclusive, b"bar_2"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_2"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); assert_eq!(&r.upper_exclusive, b"bar"); - 
assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); assert_eq!(&r.upper_exclusive, b"foo"); } + + #[test] + fn test_scanned_range_forward2() { + let storage = create_storage(); + // Filled interval range + let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { + storage: storage.clone(), + ranges, + scan_backward_in_range: false, + is_key_only: false, + is_scanned_range_aware: true, + }); + + // Only lower_inclusive is updated. + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), + b"foo" + ); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b""); + + // Upper_exclusive is updated. + assert_eq!( + &block_on(scanner.next_opt(true)).unwrap().unwrap().key(), + b"foo_2" + ); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b"foo_2\0"); + + // Upper_exclusive is not updated. + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), + b"foo_3" + ); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b"foo_2\0"); + + // Drained. + assert_eq!(block_on(scanner.next_opt(false)).unwrap(), None); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b"foo_8"); + + let r = scanner.take_scanned_range(); + assert_eq!(&r.lower_inclusive, b"foo"); + assert_eq!(&r.upper_exclusive, b"foo_8"); + + // Multiple ranges + // TODO: caller should not pass in unordered ranges otherwise scanned ranges + // would be unsound. 
+ let ranges = vec![ + IntervalRange::from(("foo", "foo_3")).into(), + IntervalRange::from(("foo_5", "foo_50")).into(), + IntervalRange::from(("bar", "bar_")).into(), + PointRange::from("bar_2").into(), + PointRange::from("bar_3").into(), + IntervalRange::from(("bar_4", "box")).into(), + ]; + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { + storage, + ranges, + scan_backward_in_range: false, + is_key_only: false, + is_scanned_range_aware: true, + }); + + // Only lower_inclusive is updated. + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), + b"foo" + ); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b""); + + // Upper_exclusive is updated. Updated by scanned row. + assert_eq!( + &block_on(scanner.next_opt(true)).unwrap().unwrap().key(), + b"foo_2" + ); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b"foo_2\0"); + + // Upper_exclusive is not updated. + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), + b"bar" + ); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b"foo_2\0"); + + // Upper_exclusive is not updated. + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), + b"bar_2" + ); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b"foo_2\0"); + + // Drain. 
+ assert_eq!(block_on(scanner.next_opt(false)).unwrap(), None); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b"box"); + + let r = scanner.take_scanned_range(); + assert_eq!(&r.lower_inclusive, b"foo"); + assert_eq!(&r.upper_exclusive, b"box"); + } + + #[test] + fn test_scanned_range_backward2() { + let storage = create_storage(); + // Filled interval range + let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { + storage: storage.clone(), + ranges, + scan_backward_in_range: true, + is_key_only: false, + is_scanned_range_aware: true, + }); + + // Only lower_inclusive is updated. + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), + b"foo_3" + ); + assert_eq!(&scanner.working_range_begin_key, b"foo_8"); + assert_eq!(&scanner.working_range_end_key, b""); + + // Upper_exclusive is updated. + assert_eq!( + &block_on(scanner.next_opt(true)).unwrap().unwrap().key(), + b"foo_2" + ); + assert_eq!(&scanner.working_range_begin_key, b"foo_8"); + assert_eq!(&scanner.working_range_end_key, b"foo_2"); + + // Upper_exclusive is not updated. + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), + b"foo" + ); + assert_eq!(&scanner.working_range_begin_key, b"foo_8"); + assert_eq!(&scanner.working_range_end_key, b"foo_2"); + + // Drained. 
+ assert_eq!(block_on(scanner.next_opt(false)).unwrap(), None); + assert_eq!(&scanner.working_range_begin_key, b"foo_8"); + assert_eq!(&scanner.working_range_end_key, b"foo"); + + let r = scanner.take_scanned_range(); + assert_eq!(&r.lower_inclusive, b"foo"); + assert_eq!(&r.upper_exclusive, b"foo_8"); + + // Multiple ranges + let ranges = vec![ + IntervalRange::from(("bar_4", "box")).into(), + PointRange::from("bar_3").into(), + PointRange::from("bar_2").into(), + IntervalRange::from(("bar", "bar_")).into(), + IntervalRange::from(("foo_5", "foo_50")).into(), + IntervalRange::from(("foo", "foo_3")).into(), + ]; + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { + storage, + ranges, + scan_backward_in_range: true, + is_key_only: false, + is_scanned_range_aware: true, + }); + + // Lower_inclusive is updated. Upper_exclusive is not update. + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), + b"bar_2" + ); + assert_eq!(&scanner.working_range_begin_key, b"box"); + assert_eq!(&scanner.working_range_end_key, b""); + + // Upper_exclusive is updated. Updated by scanned row. + assert_eq!( + &block_on(scanner.next_opt(true)).unwrap().unwrap().key(), + b"bar" + ); + assert_eq!(&scanner.working_range_begin_key, b"box"); + assert_eq!(&scanner.working_range_end_key, b"bar"); + + // Upper_exclusive is not update. + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), + b"foo_2" + ); + assert_eq!(&scanner.working_range_begin_key, b"box"); + assert_eq!(&scanner.working_range_end_key, b"bar"); + + // Upper_exclusive is not update. + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), + b"foo" + ); + assert_eq!(&scanner.working_range_begin_key, b"box"); + assert_eq!(&scanner.working_range_end_key, b"bar"); + + // Drain. 
+ assert_eq!(block_on(scanner.next_opt(false)).unwrap(), None); + assert_eq!(&scanner.working_range_begin_key, b"box"); + assert_eq!(&scanner.working_range_end_key, b"foo"); + + let r = scanner.take_scanned_range(); + assert_eq!(&r.lower_inclusive, b"foo"); + assert_eq!(&r.upper_exclusive, b"box"); + } } diff --git a/components/tidb_query_common/src/storage/test_fixture.rs b/components/tidb_query_common/src/storage/test_fixture.rs index a10726b5347..305bc5bf168 100644 --- a/components/tidb_query_common/src/storage/test_fixture.rs +++ b/components/tidb_query_common/src/storage/test_fixture.rs @@ -11,7 +11,8 @@ type ErrorBuilder = Box crate::error::StorageError>; type FixtureValue = std::result::Result, ErrorBuilder>; -/// A `Storage` implementation that returns fixed source data (i.e. fixture). Useful in tests. +/// A `Storage` implementation that returns fixed source data (i.e. fixture). +/// Useful in tests. #[derive(Clone)] pub struct FixtureStorage { data: Arc, FixtureValue>>, @@ -69,8 +70,8 @@ impl super::Storage for FixtureStorage { fn scan_next(&mut self) -> Result> { let value = if !self.is_backward_scan { - // During the call of this function, `data` must be valid and we are only returning - // data clones to outside, so this access is safe. + // During the call of this function, `data` must be valid and we are only + // returning data clones to outside, so this access is safe. self.data_view_unsafe.as_mut().unwrap().next() } else { self.data_view_unsafe.as_mut().unwrap().next_back() diff --git a/components/tidb_query_common/src/util.rs b/components/tidb_query_common/src/util.rs index 9ee2a059073..9f9b60bf9f7 100644 --- a/components/tidb_query_common/src/util.rs +++ b/components/tidb_query_common/src/util.rs @@ -40,8 +40,8 @@ pub fn is_prefix_next(key: &[u8], next: &[u8]) -> bool { let mut carry_pos = len; loop { if carry_pos == 0 { - // All bytes of `key` are 255. `next` couldn't be `key`'s prefix_next since their - // lengths are equal. 
+ // All bytes of `key` are 255. `next` couldn't be `key`'s prefix_next since + // their lengths are equal. return false; } @@ -71,8 +71,8 @@ pub fn is_prefix_next(key: &[u8], next: &[u8]) -> bool { && next[carry_pos + 1..].iter().all(|byte| *byte == 0) && key[..carry_pos] == next[..carry_pos] } else if len + 1 == next_len { - // `next` must has one more 0 than `key`, and the first `len` bytes must be all 255. - // The case that `len == 0` is also covered here. + // `next` must has one more 0 than `key`, and the first `len` bytes must be all + // 255. The case that `len == 0` is also covered here. *next.last().unwrap() == 0 && key.iter().all(|byte| *byte == 255) && next.iter().take(len).all(|byte| *byte == 255) diff --git a/components/tidb_query_datatype/Cargo.toml b/components/tidb_query_datatype/Cargo.toml index 698ebc8049c..97fb2d101b6 100644 --- a/components/tidb_query_datatype/Cargo.toml +++ b/components/tidb_query_datatype/Cargo.toml @@ -6,35 +6,38 @@ publish = false description = "Data type of a query engine to run TiDB pushed down executors" [dependencies] +api_version = { workspace = true } +base64 = "0.13" bitfield = "0.13.2" bitflags = "1.0.1" boolinator = "2.4.0" bstr = "0.2.8" chrono = "0.4" chrono-tz = "0.5.1" -codec = { path = "../codec", default-features = false } -collections = { path = "../collections" } +codec = { workspace = true } +collections = { workspace = true } +crc32fast = "1.2" encoding_rs = { git = "https://github.com/xiongjiwei/encoding_rs.git", rev = "68e0bc5a72a37a78228d80cd98047326559cf43c" } -error_code = { path = "../error_code", default-features = false } +error_code = { workspace = true } hex = "0.4" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" -log_wrappers = { path = "../log_wrappers" } -match_template = { path = "../match_template" } -nom = { version = "5.1.0", default-features = false, features = ["std"] } +log_wrappers = { workspace = true } 
+match-template = "0.0.1" +nom = { version = "7.1.0", default-features = false, features = ["std"] } num = { version = "0.3", default-features = false } num-derive = "0.3" num-traits = "0.2" -ordered-float = "1.0" +ordered-float = "2.0" protobuf = "2" regex = "1.1" serde = "1.0" serde_json = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } static_assertions = { version = "1.0", features = ["nightly"] } thiserror = "1.0" -tidb_query_common = { path = "../tidb_query_common", default-features = false } -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } -tipb = { git = "https://github.com/pingcap/tipb.git" } +tidb_query_common = { workspace = true } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } +tipb = { workspace = true } diff --git a/components/tidb_query_datatype/src/codec/batch/lazy_column.rs b/components/tidb_query_datatype/src/codec/batch/lazy_column.rs index dcd6328ca18..11d290f9c31 100644 --- a/components/tidb_query_datatype/src/codec/batch/lazy_column.rs +++ b/components/tidb_query_datatype/src/codec/batch/lazy_column.rs @@ -16,13 +16,14 @@ use crate::{ match_template_evaltype, EvalType, FieldTypeAccessor, }; -/// A container stores an array of datums, which can be either raw (not decoded), or decoded into -/// the `VectorValue` type. +/// A container stores an array of datums, which can be either raw (not +/// decoded), or decoded into the `VectorValue` type. /// /// TODO: -/// Since currently the data format in response can be the same as in storage, we use this structure -/// to avoid unnecessary repeated serialization / deserialization. 
In future, Coprocessor will -/// respond all data in Chunk format which is different to the format in storage. At that time, +/// Since currently the data format in response can be the same as in storage, +/// we use this structure to avoid unnecessary repeated serialization / +/// deserialization. In future, Coprocessor will respond all data in Chunk +/// format which is different to the format in storage. At that time, /// this structure is no longer useful and should be removed. #[derive(Clone, Debug)] pub enum LazyBatchColumn { @@ -42,14 +43,16 @@ impl LazyBatchColumn { #[inline] pub fn raw_with_capacity(capacity: usize) -> Self { use codec::number::MAX_VARINT64_LENGTH; - // We assume that each element *may* has a size of MAX_VAR_INT_LEN + Datum Flag (1 byte). + // We assume that each element *may* has a size of MAX_VAR_INT_LEN + Datum Flag + // (1 byte). LazyBatchColumn::Raw(BufferVec::with_capacity( capacity, capacity * (MAX_VARINT64_LENGTH + 1), )) } - /// Creates a new `LazyBatchColumn::Decoded` with specified capacity and eval type. + /// Creates a new `LazyBatchColumn::Decoded` with specified capacity and + /// eval type. #[inline] pub fn decoded_with_capacity_and_tp(capacity: usize, eval_tp: EvalType) -> Self { LazyBatchColumn::Decoded(VectorValue::with_capacity(capacity, eval_tp)) @@ -150,14 +153,16 @@ impl LazyBatchColumn { } } - /// Decodes this column if the column is not decoded, according to the given logical rows map. - /// After decoding, the decoded column will have the same physical layout as the encoded one - /// (i.e. the same logical rows), but elements in unnecessary positions will not be decoded - /// and will be `None`. + /// Decodes this column if the column is not decoded, according to the given + /// logical rows map. After decoding, the decoded column will have the same + /// physical layout as the encoded one (i.e. the same logical rows), but + /// elements in unnecessary positions will not be decoded and will be + /// `None`. 
/// - /// The field type is needed because we use the same `DateTime` structure when handling - /// Date, Time or Timestamp. - // TODO: Maybe it's a better idea to assign different eval types for different date types. + /// The field type is needed because we use the same `DateTime` structure + /// when handling Date, Time or Timestamp. + // TODO: Maybe it's a better idea to assign different eval types for different + // date types. pub fn ensure_decoded( &mut self, ctx: &mut EvalContext, @@ -358,7 +363,8 @@ mod tests { assert!(col.is_decoded()); assert_eq!(col.len(), 3); assert_eq!(col.capacity(), 3); - // Element 1 is None because it is not referred in `logical_rows` and we don't decode it. + // Element 1 is None because it is not referred in `logical_rows` and we don't + // decode it. assert_eq!(col.decoded().to_int_vec(), &[Some(32), None, Some(10)]); { @@ -370,7 +376,8 @@ mod tests { assert_eq!(col.decoded().to_int_vec(), &[Some(32), None, Some(10)]); } - // Decode a decoded column, even using a different logical rows, does not have effect. + // Decode a decoded column, even using a different logical rows, does not have + // effect. col.ensure_decoded( &mut ctx, &FieldTypeTp::Long.into(), @@ -435,7 +442,8 @@ mod benches { /// Bench performance of decoding a raw batch column. /// - /// Note that there is a clone in the bench suite, whose cost should be excluded. + /// Note that there is a clone in the bench suite, whose cost should be + /// excluded. #[bench] fn bench_lazy_batch_column_clone_and_decode(b: &mut test::Bencher) { use crate::{ @@ -471,7 +479,8 @@ mod benches { /// Bench performance of decoding a decoded lazy batch column. /// - /// Note that there is a clone in the bench suite, whose cost should be excluded. + /// Note that there is a clone in the bench suite, whose cost should be + /// excluded. 
#[bench] fn bench_lazy_batch_column_clone_and_decode_decoded(b: &mut test::Bencher) { use crate::{ diff --git a/components/tidb_query_datatype/src/codec/batch/lazy_column_vec.rs b/components/tidb_query_datatype/src/codec/batch/lazy_column_vec.rs index d4f7ea9044a..55a07e72ae7 100644 --- a/components/tidb_query_datatype/src/codec/batch/lazy_column_vec.rs +++ b/components/tidb_query_datatype/src/codec/batch/lazy_column_vec.rs @@ -13,7 +13,8 @@ use crate::{ /// Stores multiple `LazyBatchColumn`s. Each column has an equal length. #[derive(Clone, Debug)] pub struct LazyBatchColumnVec { - /// Multiple lazy batch columns. Each column is either decoded, or not decoded. + /// Multiple lazy batch columns. Each column is either decoded, or not + /// decoded. /// /// For decoded columns, they may be in different types. If the column is in /// type `LazyBatchColumn::Raw`, it means that it is not decoded. @@ -37,9 +38,11 @@ impl From> for LazyBatchColumnVec { } impl LazyBatchColumnVec { - /// Creates a new empty `LazyBatchColumnVec`, which does not have columns and rows. + /// Creates a new empty `LazyBatchColumnVec`, which does not have columns + /// and rows. /// - /// Because column numbers won't change, it means constructed instance will be always empty. + /// Because column numbers won't change, it means constructed instance will + /// be always empty. #[inline] pub fn empty() -> Self { Self { @@ -47,7 +50,8 @@ impl LazyBatchColumnVec { } } - /// Creates a new empty `LazyBatchColumnVec` with the same number of columns and schema. + /// Creates a new empty `LazyBatchColumnVec` with the same number of columns + /// and schema. #[inline] #[must_use] pub fn clone_empty(&self, capacity: usize) -> Self { @@ -60,7 +64,8 @@ impl LazyBatchColumnVec { } } - /// Creates a new `LazyBatchColumnVec`, which contains `columns_count` number of raw columns. + /// Creates a new `LazyBatchColumnVec`, which contains `columns_count` + /// number of raw columns. 
#[cfg(test)] #[must_use] pub fn with_raw_columns(columns_count: usize) -> Self { @@ -160,8 +165,8 @@ impl LazyBatchColumnVec { Ok(()) } - /// Truncates columns into equal length. The new length of all columns would be the length of - /// the shortest column before calling this function. + /// Truncates columns into equal length. The new length of all columns would + /// be the length of the shortest column before calling this function. pub fn truncate_into_equal_length(&mut self) { let mut min_len = self.rows_len(); for col in &self.columns { @@ -184,8 +189,8 @@ impl LazyBatchColumnVec { } } -// Do not implement Deref, since we want to forbid some misleading function calls like -// `LazyBatchColumnVec.len()`. +// Do not implement Deref, since we want to forbid some misleading function +// calls like `LazyBatchColumnVec.len()`. impl Index for LazyBatchColumnVec { type Output = LazyBatchColumn; diff --git a/components/tidb_query_datatype/src/codec/chunk/chunk.rs b/components/tidb_query_datatype/src/codec/chunk/chunk.rs index 2cf1261f7dc..b4478c8a4d3 100644 --- a/components/tidb_query_datatype/src/codec/chunk/chunk.rs +++ b/components/tidb_query_datatype/src/codec/chunk/chunk.rs @@ -10,8 +10,9 @@ use super::{ use crate::{codec::Datum, FieldTypeAccessor}; /// `Chunk` stores multiple rows of data. -/// Values are appended in compact format and can be directly accessed without decoding. -/// When the chunk is done processing, we can reuse the allocated memory by resetting it. +/// Values are appended in compact format and can be directly accessed without +/// decoding. When the chunk is done processing, we can reuse the allocated +/// memory by resetting it. pub struct Chunk { columns: Vec, } @@ -32,7 +33,8 @@ impl Chunk { } /// Reset the chunk, so the memory it allocated can be reused. - /// Make sure all the data in the chunk is not used anymore before you reuse this chunk. + /// Make sure all the data in the chunk is not used anymore before you reuse + /// this chunk. 
pub fn reset(&mut self) { for column in &mut self.columns { column.reset(); @@ -186,7 +188,7 @@ mod tests { FieldTypeTp::DateTime.into(), FieldTypeTp::Duration.into(), FieldTypeTp::NewDecimal.into(), - FieldTypeTp::JSON.into(), + FieldTypeTp::Json.into(), FieldTypeTp::String.into(), ]; let json: Json = r#"{"k1":"v1"}"#.parse().unwrap(); @@ -227,7 +229,7 @@ mod tests { FieldTypeTp::DateTime.into(), FieldTypeTp::Duration.into(), FieldTypeTp::NewDecimal.into(), - FieldTypeTp::JSON.into(), + FieldTypeTp::Json.into(), FieldTypeTp::String.into(), ]; let json: Json = r#"{"k1":"v1"}"#.parse().unwrap(); @@ -327,7 +329,7 @@ mod tests { fn bench_encode_from_raw_json_datum(b: &mut Bencher) { let json: Json = r#"{"k1":"v1"}"#.parse().unwrap(); let datum = Datum::Json(json); - bench_encode_from_raw_datum_impl(b, datum, FieldTypeTp::JSON); + bench_encode_from_raw_datum_impl(b, datum, FieldTypeTp::Json); } #[test] @@ -339,7 +341,7 @@ mod tests { FieldTypeTp::VarChar.into(), FieldTypeTp::VarChar.into(), FieldTypeTp::NewDecimal.into(), - FieldTypeTp::JSON.into(), + FieldTypeTp::Json.into(), ]; let mut chunk = Chunk::new(&fields, rows); diff --git a/components/tidb_query_datatype/src/codec/chunk/column.rs b/components/tidb_query_datatype/src/codec/chunk/column.rs index b8f7e4b9da6..ef1c2602864 100644 --- a/components/tidb_query_datatype/src/codec/chunk/column.rs +++ b/components/tidb_query_datatype/src/codec/chunk/column.rs @@ -316,7 +316,7 @@ impl Column { } FieldTypeTp::Duration => Datum::Dur(self.get_duration(idx, field_type.decimal())?), FieldTypeTp::NewDecimal => Datum::Dec(self.get_decimal(idx)?), - FieldTypeTp::JSON => Datum::Json(self.get_json(idx)?), + FieldTypeTp::Json => Datum::Json(self.get_json(idx)?), FieldTypeTp::Enum => Datum::Enum(self.get_enum(idx)?), FieldTypeTp::Bit => Datum::Bytes(self.get_bytes(idx).to_vec()), FieldTypeTp::Set => { @@ -402,7 +402,8 @@ impl Column { self.null_cnt = 0; self.null_bitmap.clear(); if !self.var_offsets.is_empty() { - // The first 
offset is always 0, it makes slicing the data easier, we need to keep it. + // The first offset is always 0, it makes slicing the data easier, we need to + // keep it. self.var_offsets.truncate(1); } self.data.clear(); @@ -1006,7 +1007,7 @@ pub trait ChunkColumnEncoder: NumberEncoder { } // offsets if !col.is_fixed() { - //let length = (col.length+1)*4; + // let length = (col.length+1)*4; for v in &col.var_offsets { self.write_i64_le(*v as i64)?; } @@ -1141,7 +1142,7 @@ mod tests { #[test] fn test_column_json() { - let fields: Vec = vec![FieldTypeTp::JSON.into()]; + let fields: Vec = vec![FieldTypeTp::Json.into()]; let json: Json = r#"{"k1":"v1"}"#.parse().unwrap(); let data = vec![Datum::Null, Datum::Json(json)]; diff --git a/components/tidb_query_datatype/src/codec/collation/charset.rs b/components/tidb_query_datatype/src/codec/collation/charset.rs index 482e19cb999..9ea76f16b92 100644 --- a/components/tidb_query_datatype/src/codec/collation/charset.rs +++ b/components/tidb_query_datatype/src/codec/collation/charset.rs @@ -22,6 +22,10 @@ impl Charset for CharsetBinary { Some((data[0], 1)) } } + + fn charset() -> crate::Charset { + crate::Charset::Binary + } } pub struct CharsetUtf8mb4; @@ -48,6 +52,10 @@ impl Charset for CharsetUtf8mb4 { }) } } + + fn charset() -> crate::Charset { + crate::Charset::Utf8Mb4 + } } // gbk character data actually stored with utf8mb4 character encoding. diff --git a/components/tidb_query_datatype/src/codec/collation/collator/gbk_collation.rs b/components/tidb_query_datatype/src/codec/collation/collator/gbk_collation.rs index 9c2dd2497f1..31685ca08d5 100644 --- a/components/tidb_query_datatype/src/codec/collation/collator/gbk_collation.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/gbk_collation.rs @@ -15,8 +15,8 @@ impl Collator for T { #[inline] fn char_weight(ch: char) -> Self::Weight { - // All GBK code point are in BMP, if the incoming character is not, convert it to '?'. - // This should not happened. 
+ // All GBK code point are in BMP, if the incoming character is not, convert it + // to '?'. This should not happened. let r = ch as usize; if r > 0xFFFF { return '?' as u16; @@ -71,7 +71,8 @@ impl GbkCollator for CollatorGbkBin { const WEIGHT_TABLE: &'static [u8; (0xffff + 1) * 2] = GBK_BIN_TABLE; } -/// Collator for `gbk_chinese_ci` collation with padding behavior (trims right spaces). +/// Collator for `gbk_chinese_ci` collation with padding behavior (trims right +/// spaces). #[derive(Debug)] pub struct CollatorGbkChineseCi; @@ -80,10 +81,12 @@ impl GbkCollator for CollatorGbkChineseCi { const WEIGHT_TABLE: &'static [u8; (0xffff + 1) * 2] = GBK_CHINESE_CI_TABLE; } -// GBK_BIN_TABLE are the encoding tables from Unicode to GBK code, it is totally the same with golang's GBK encoding. -// If there is no mapping code in GBK, use 0x3F(?) instead. It should not happened. +// GBK_BIN_TABLE are the encoding tables from Unicode to GBK code, it is totally +// the same with golang's GBK encoding. If there is no mapping code in GBK, use +// 0x3F(?) instead. It should not happened. const GBK_BIN_TABLE: &[u8; (0xffff + 1) * 2] = include_bytes!("gbk_bin.data"); // GBK_CHINESE_CI_TABLE are the sort key tables for GBK codepoint. -// If there is no mapping code in GBK, use 0x3F(?) instead. It should not happened. +// If there is no mapping code in GBK, use 0x3F(?) instead. It should not +// happened. 
const GBK_CHINESE_CI_TABLE: &[u8; (0xffff + 1) * 2] = include_bytes!("gbk_chinese_ci.data"); diff --git a/components/tidb_query_datatype/src/codec/collation/collator/latin1_bin.rs b/components/tidb_query_datatype/src/codec/collation/collator/latin1_bin.rs index c74ed3687a9..c70deb08cd1 100644 --- a/components/tidb_query_datatype/src/codec/collation/collator/latin1_bin.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/latin1_bin.rs @@ -4,7 +4,8 @@ use bstr::{ByteSlice, B}; use super::*; -/// Collator for latin1_bin collation with padding behavior (trims right spaces). +/// Collator for latin1_bin collation with padding behavior (trims right +/// spaces). #[derive(Debug)] pub struct CollatorLatin1Bin; diff --git a/components/tidb_query_datatype/src/codec/collation/collator/mod.rs b/components/tidb_query_datatype/src/codec/collation/collator/mod.rs index e12114d9cea..bac55eabea7 100644 --- a/components/tidb_query_datatype/src/codec/collation/collator/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/mod.rs @@ -45,7 +45,8 @@ mod tests { (Collation::GbkChineseCi, 6), ]; let cases = vec![ - // (sa, sb, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, Latin1, GBKBin, GbkChineseCi]) + // (sa, sb, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, + // Latin1, GBKBin, GbkChineseCi]) ( "a".as_bytes(), "a".as_bytes(), @@ -232,7 +233,8 @@ mod tests { (Collation::GbkChineseCi, 6), ]; let cases = vec![ - // (str, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, Latin1, GBKBin, GbkChineseCi]) + // (str, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, Latin1, + // GBKBin, GbkChineseCi]) ( "a", [ diff --git a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_binary.rs b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_binary.rs index bbd7e60a047..959664b1854 100644 --- 
a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_binary.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_binary.rs @@ -2,7 +2,8 @@ use super::*; -/// Collator for utf8mb4_bin collation with padding behavior (trims right spaces). +/// Collator for utf8mb4_bin collation with padding behavior (trims right +/// spaces). #[derive(Debug)] pub struct CollatorUtf8Mb4Bin; diff --git a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_general_ci.rs b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_general_ci.rs index 50770550f19..2cc9a738372 100644 --- a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_general_ci.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_general_ci.rs @@ -2,7 +2,8 @@ use super::*; -/// Collator for utf8mb4_general_ci collation with padding behavior (trims right spaces). +/// Collator for utf8mb4_general_ci collation with padding behavior (trims right +/// spaces). #[derive(Debug)] pub struct CollatorUtf8Mb4GeneralCi; diff --git a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_unicode_ci.rs b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_unicode_ci.rs index 9bb44382f53..5a529d48144 100644 --- a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_unicode_ci.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_unicode_ci.rs @@ -2,7 +2,8 @@ use super::*; -/// Collator for `utf8mb4_unicode_ci` collation with padding behavior (trims right spaces). +/// Collator for `utf8mb4_unicode_ci` collation with padding behavior (trims +/// right spaces). 
#[derive(Debug)] pub struct CollatorUtf8Mb4UnicodeCi; diff --git a/components/tidb_query_datatype/src/codec/collation/encoding/ascii.rs b/components/tidb_query_datatype/src/codec/collation/encoding/ascii.rs index fac8c8f3b58..be1b91ae1ea 100644 --- a/components/tidb_query_datatype/src/codec/collation/encoding/ascii.rs +++ b/components/tidb_query_datatype/src/codec/collation/encoding/ascii.rs @@ -20,7 +20,10 @@ impl Encoding for EncodingAscii { fn decode(data: BytesRef<'_>) -> Result { for x in data { if !x.is_ascii() { - return Err(Error::cannot_convert_string("ascii")); + return Err(Error::cannot_convert_string( + format_invalid_char(data).as_str(), + "ascii", + )); } } Ok(Bytes::from(data)) diff --git a/components/tidb_query_datatype/src/codec/collation/encoding/gbk.rs b/components/tidb_query_datatype/src/codec/collation/encoding/gbk.rs index 26f61da7536..6f27475ff2c 100644 --- a/components/tidb_query_datatype/src/codec/collation/encoding/gbk.rs +++ b/components/tidb_query_datatype/src/codec/collation/encoding/gbk.rs @@ -6,14 +6,17 @@ use super::*; use crate::codec::data_type::{BytesGuard, BytesWriter}; #[derive(Debug)] -pub struct EncodingGBK; +pub struct EncodingGbk; -impl Encoding for EncodingGBK { +impl Encoding for EncodingGbk { #[inline] fn decode(data: BytesRef<'_>) -> Result { match GBK.decode_without_bom_handling_and_without_replacement(data) { Some(v) => Ok(Bytes::from(v.as_bytes())), - None => Err(Error::cannot_convert_string("gbk")), + None => Err(Error::cannot_convert_string( + format_invalid_char(data).as_str(), + "gbk", + )), } } diff --git a/components/tidb_query_datatype/src/codec/collation/encoding/mod.rs b/components/tidb_query_datatype/src/codec/collation/encoding/mod.rs index 2647446ab7f..b2434105ce5 100644 --- a/components/tidb_query_datatype/src/codec/collation/encoding/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/encoding/mod.rs @@ -15,3 +15,24 @@ use crate::codec::{ data_type::{Bytes, BytesRef}, Error, Result, }; + +fn 
format_invalid_char(data: BytesRef<'_>) -> String { + // Max length of the invalid string is '\x00\x00\x00\x00\x00...'(25) we set 32 + // here. + let mut buf = String::with_capacity(32); + const MAX_BYTES_TO_SHOW: usize = 5; + buf.push('\''); + for i in 0..data.len() { + if i > MAX_BYTES_TO_SHOW { + buf.push_str("..."); + break; + } + if data[i].is_ascii() { + buf.push(char::from(data[i])); + } else { + buf.push_str(format!("\\x{:X}", data[i]).as_str()); + } + } + buf.push('\''); + buf +} diff --git a/components/tidb_query_datatype/src/codec/collation/encoding/utf8.rs b/components/tidb_query_datatype/src/codec/collation/encoding/utf8.rs index b1539e7c581..e83d6e3eb22 100644 --- a/components/tidb_query_datatype/src/codec/collation/encoding/utf8.rs +++ b/components/tidb_query_datatype/src/codec/collation/encoding/utf8.rs @@ -2,37 +2,40 @@ use super::*; -pub trait UTF8CompatibleEncoding { +pub trait Utf8CompatibleEncoding { const NAME: &'static str; } -impl Encoding for T { +impl Encoding for T { #[inline] fn decode(data: BytesRef<'_>) -> Result { match str::from_utf8(data) { Ok(v) => Ok(Bytes::from(v)), - Err(_) => Err(Error::cannot_convert_string(T::NAME)), + Err(_) => Err(Error::cannot_convert_string( + format_invalid_char(data).as_str(), + T::NAME, + )), } } } #[derive(Debug)] -pub struct EncodingUTF8Mb4; +pub struct EncodingUtf8Mb4; -impl UTF8CompatibleEncoding for EncodingUTF8Mb4 { +impl Utf8CompatibleEncoding for EncodingUtf8Mb4 { const NAME: &'static str = "utf8mb4"; } #[derive(Debug)] -pub struct EncodingUTF8; +pub struct EncodingUtf8; -impl UTF8CompatibleEncoding for EncodingUTF8 { +impl Utf8CompatibleEncoding for EncodingUtf8 { const NAME: &'static str = "utf8"; } #[derive(Debug)] pub struct EncodingLatin1; -impl UTF8CompatibleEncoding for EncodingLatin1 { +impl Utf8CompatibleEncoding for EncodingLatin1 { const NAME: &'static str = "latin1"; } diff --git a/components/tidb_query_datatype/src/codec/collation/mod.rs 
b/components/tidb_query_datatype/src/codec/collation/mod.rs index 7d73cce2192..9fbef4f1ee2 100644 --- a/components/tidb_query_datatype/src/codec/collation/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/mod.rs @@ -41,6 +41,32 @@ macro_rules! match_template_collator { }} } +#[macro_export] +macro_rules! match_template_multiple_collators { + ((), (), $($tail:tt)*) => { + $($tail)* + }; + (($first:tt), ($match_exprs:tt), $($tail:tt)*) => { + match_template_multiple_collators! { + ($first,), ($match_exprs,), $($tail)* + } + }; + (($first:tt, $($t:tt)*), ($first_match_expr:tt, $($match_exprs:tt)*), $($tail:tt)*) => {{ + #[allow(unused_imports)] + use $crate::codec::collation::collator::*; + + match_template_collator! { + $first, match $first_match_expr { + Collation::$first => { + match_template_multiple_collators! { + ($($t)*), ($($match_exprs)*), $($tail)* + } + } + } + } + }}; +} + #[macro_export] macro_rules! match_template_charset { ($t:tt, $($tail:tt)*) => {{ @@ -49,10 +75,10 @@ macro_rules! match_template_charset { match_template::match_template! { $t = [ - UTF8 => EncodingUTF8, - UTF8Mb4 => EncodingUTF8Mb4, + Utf8 => EncodingUtf8, + Utf8Mb4 => EncodingUtf8Mb4, Latin1 => EncodingLatin1, - GBK => EncodingGBK, + Gbk => EncodingGbk, Binary => EncodingBinary, Ascii => EncodingAscii, ], @@ -67,6 +93,8 @@ pub trait Charset { fn validate(bstr: &[u8]) -> Result<()>; fn decode_one(data: &[u8]) -> Option<(Self::Char, usize)>; + + fn charset() -> crate::Charset; } pub trait Collator: 'static + std::marker::Send + std::marker::Sync + std::fmt::Debug { @@ -149,8 +177,9 @@ where /// /// # Panic /// - /// The `Ord`, `Hash`, `PartialEq` and more implementations assume that the bytes are - /// valid for the certain collator. The violation will cause panic. + /// The `Ord`, `Hash`, `PartialEq` and more implementations assume that the + /// bytes are valid for the certain collator. The violation will cause + /// panic. 
#[inline] pub fn new_unchecked(inner: T) -> Self { Self { diff --git a/components/tidb_query_datatype/src/codec/convert.rs b/components/tidb_query_datatype/src/codec/convert.rs index 61ce14a0390..418841547ca 100644 --- a/components/tidb_query_datatype/src/codec/convert.rs +++ b/components/tidb_query_datatype/src/codec/convert.rs @@ -186,7 +186,7 @@ pub fn integer_signed_lower_bound(tp: FieldTypeTp) -> i64 { /// `truncate_binary` truncates a buffer to the specified length. #[inline] pub fn truncate_binary(s: &mut Vec, flen: isize) { - if flen != crate::UNSPECIFIED_LENGTH as isize && s.len() > flen as usize { + if flen != crate::UNSPECIFIED_LENGTH && s.len() > flen as usize { s.truncate(flen as usize); } } @@ -280,11 +280,13 @@ impl ToInt for u64 { impl ToInt for f64 { /// This function is ported from TiDB's types.ConvertFloatToInt, - /// which checks whether the number overflows the signed lower and upper boundaries of `tp` + /// which checks whether the number overflows the signed lower and upper + /// boundaries of `tp` /// /// # Notes /// - /// It handles overflows using `ctx` so that the caller would not handle it anymore. + /// It handles overflows using `ctx` so that the caller would not handle it + /// anymore. fn to_int(&self, ctx: &mut EvalContext, tp: FieldTypeTp) -> Result { #![allow(clippy::float_cmp)] let val = self.round(); @@ -307,11 +309,13 @@ impl ToInt for f64 { } /// This function is ported from TiDB's types.ConvertFloatToUint, - /// which checks whether the number overflows the unsigned upper boundaries of `tp` + /// which checks whether the number overflows the unsigned upper boundaries + /// of `tp` /// /// # Notes /// - /// It handles overflows using `ctx` so that the caller would not handle it anymore. + /// It handles overflows using `ctx` so that the caller would not handle it + /// anymore. 
#[allow(clippy::float_cmp)] fn to_uint(&self, ctx: &mut EvalContext, tp: FieldTypeTp) -> Result { let val = self.round(); @@ -427,7 +431,7 @@ impl ToInt for Decimal { fn to_int(&self, ctx: &mut EvalContext, tp: FieldTypeTp) -> Result { let dec = round_decimal_with_ctx(ctx, *self)?; let val = dec.as_i64(); - let err = Error::truncated_wrong_val("DECIMAL", &dec); + let err = Error::truncated_wrong_val("DECIMAL", dec); let r = val.into_result_with_overflow_err(ctx, err)?; r.to_int(ctx, tp) } @@ -436,7 +440,7 @@ impl ToInt for Decimal { fn to_uint(&self, ctx: &mut EvalContext, tp: FieldTypeTp) -> Result { let dec = round_decimal_with_ctx(ctx, *self)?; let val = dec.as_u64(); - let err = Error::truncated_wrong_val("DECIMAL", &dec); + let err = Error::truncated_wrong_val("DECIMAL", dec); let r = val.into_result_with_overflow_err(ctx, err)?; r.to_uint(ctx, tp) } @@ -444,8 +448,12 @@ impl ToInt for Decimal { impl ToInt for DateTime { // FiXME - // Time::parse_utc_datetime("2000-01-01T12:13:14.6666", 4).unwrap().round_frac(DEFAULT_FSP) - // will get 2000-01-01T12:13:14, this is a bug + // ``` + // Time::parse_utc_datetime("2000-01-01T12:13:14.6666", 4) + // .unwrap() + // .round_frac(DEFAULT_FSP) + // ``` + // will get 2000-01-01T12:13:14, this is a bug #[inline] fn to_int(&self, ctx: &mut EvalContext, tp: FieldTypeTp) -> Result { let t = self.round_frac(ctx, DEFAULT_FSP)?; @@ -502,14 +510,14 @@ impl<'a> ToInt for JsonRef<'a> { // TiDB: 5 // MySQL: 4 let val = match self.get_type() { - JsonType::Object | JsonType::Array => Ok(ctx - .handle_truncate_err(Error::truncated_wrong_val("Integer", self.to_string())) - .map(|_| 0)?), JsonType::Literal => Ok(self.get_literal().map_or(0, |x| x as i64)), JsonType::I64 => Ok(self.get_i64()), JsonType::U64 => Ok(self.get_u64() as i64), JsonType::Double => self.get_double().to_int(ctx, tp), JsonType::String => self.get_str_bytes()?.to_int(ctx, tp), + _ => Ok(ctx + .handle_truncate_err(Error::truncated_wrong_val("Integer", 
self.to_string())) + .map(|_| 0)?), }?; val.to_int(ctx, tp) } @@ -518,14 +526,14 @@ impl<'a> ToInt for JsonRef<'a> { #[inline] fn to_uint(&self, ctx: &mut EvalContext, tp: FieldTypeTp) -> Result { let val = match self.get_type() { - JsonType::Object | JsonType::Array => Ok(ctx - .handle_truncate_err(Error::truncated_wrong_val("Integer", self.to_string())) - .map(|_| 0)?), JsonType::Literal => Ok(self.get_literal().map_or(0, |x| x as u64)), JsonType::I64 => Ok(self.get_i64() as u64), JsonType::U64 => Ok(self.get_u64()), JsonType::Double => self.get_double().to_uint(ctx, tp), JsonType::String => self.get_str_bytes()?.to_uint(ctx, tp), + _ => Ok(ctx + .handle_truncate_err(Error::truncated_wrong_val("Integer", self.to_string())) + .map(|_| 0)?), }?; val.to_uint(ctx, tp) } @@ -631,7 +639,7 @@ pub fn produce_dec_with_specified_tp( // select (cast 111 as decimal(1)) causes a warning in MySQL. ctx.handle_overflow_err(Error::overflow( "Decimal", - &format!("({}, {})", flen, decimal), + format!("({}, {})", flen, decimal), ))?; dec = max_or_min_dec(dec.is_negative(), flen as u8, decimal as u8) } else if frac != decimal { @@ -640,7 +648,7 @@ pub fn produce_dec_with_specified_tp( .round(decimal as i8, RoundMode::HalfEven) .into_result_with_overflow_err( ctx, - Error::overflow("Decimal", &format!("({}, {})", flen, decimal)), + Error::overflow("Decimal", format!("({}, {})", flen, decimal)), )?; if !rounded.is_zero() && frac > decimal && rounded != old { if ctx.cfg.flag.contains(Flag::IN_INSERT_STMT) @@ -664,8 +672,8 @@ pub fn produce_dec_with_specified_tp( } } -/// `produce_float_with_specified_tp`(`ProduceFloatWithSpecifiedTp` in TiDB) produces -/// a new float64 according to `flen` and `decimal` in `self.tp`. +/// `produce_float_with_specified_tp`(`ProduceFloatWithSpecifiedTp` in TiDB) +/// produces a new float64 according to `flen` and `decimal` in `self.tp`. 
/// TODO port tests from TiDB(TiDB haven't implemented now) pub fn produce_float_with_specified_tp( ctx: &mut EvalContext, @@ -692,8 +700,8 @@ pub fn produce_float_with_specified_tp( Ok(res) } -/// `produce_str_with_specified_tp`(`ProduceStrWithSpecifiedTp` in TiDB) produces -/// a new string according to `flen` and `chs`. +/// `produce_str_with_specified_tp`(`ProduceStrWithSpecifiedTp` in TiDB) +/// produces a new string according to `flen` and `chs`. pub fn produce_str_with_specified_tp<'a>( ctx: &mut EvalContext, s: Cow<'a, [u8]>, @@ -705,8 +713,8 @@ pub fn produce_str_with_specified_tp<'a>( return Ok(s); } let flen = flen as usize; - // flen is the char length, not byte length, for UTF8 charset, we need to calculate the - // char count and truncate to flen chars if it is too long. + // flen is the char length, not byte length, for UTF8 charset, we need to + // calculate the char count and truncate to flen chars if it is too long. if chs == charset::CHARSET_UTF8 || chs == charset::CHARSET_UTF8MB4 { let (char_count, truncate_pos) = { let s = &String::from_utf8_lossy(&s); @@ -767,7 +775,8 @@ pub fn pad_zero_for_binary_type(s: &mut Vec, ft: &FieldType) { .unwrap_or(false) && s.len() < flen { - // it seems MaxAllowedPacket has not push down to tikv, so we needn't to handle it + // it seems MaxAllowedPacket has not push down to tikv, so we needn't to handle + // it s.resize(flen, 0); } } @@ -802,7 +811,7 @@ impl ConvertTo for &[u8] { .map_err(|err| -> Error { box_err!("Parse '{}' to float err: {:?}", vs, err) })?; // The `parse` will return Ok(inf) if the float string literal out of range if val.is_infinite() { - ctx.handle_truncate_err(Error::truncated_wrong_val("DOUBLE", &vs))?; + ctx.handle_truncate_err(Error::truncated_wrong_val("DOUBLE", vs))?; if val.is_sign_negative() { return Ok(f64::MIN); } else { @@ -828,7 +837,17 @@ impl ConvertTo for Bytes { } pub fn get_valid_int_prefix<'a>(ctx: &mut EvalContext, s: &'a str) -> Result> { - if 
!ctx.cfg.flag.contains(Flag::IN_SELECT_STMT) { + get_valid_int_prefix_helper(ctx, s, false) +} + +// As TiDB code(getValidIntPrefix()), cast expr needs to give error/warning when +// input string is like float. +pub fn get_valid_int_prefix_helper<'a>( + ctx: &mut EvalContext, + s: &'a str, + is_cast_func: bool, +) -> Result> { + if !is_cast_func { let vs = get_valid_float_prefix(ctx, s)?; Ok(float_str_to_int_string(ctx, vs)) } else { @@ -855,51 +874,65 @@ pub fn get_valid_int_prefix<'a>(ctx: &mut EvalContext, s: &'a str) -> Result(ctx: &mut EvalContext, s: &'a str) -> Result<&'a str> { - let mut saw_dot = false; - let mut saw_digit = false; - let mut valid_len = 0; - let mut e_idx = 0; - for (i, c) in s.chars().enumerate() { - if c == '+' || c == '-' { - if i != 0 && (e_idx == 0 || i != e_idx + 1) { - // "1e+1" is valid. - break; - } - } else if c == '.' { - if saw_dot || e_idx > 0 { - // "1.1." or "1e1.1" + get_valid_float_prefix_helper(ctx, s, false) +} + +// As TiDB code(getValidFloatPrefix()), cast expr should not give error/warning +// when input is empty. +pub fn get_valid_float_prefix_helper<'a>( + ctx: &mut EvalContext, + s: &'a str, + is_cast_func: bool, +) -> Result<&'a str> { + if is_cast_func && s.is_empty() { + Ok("0") + } else { + let mut saw_dot = false; + let mut saw_digit = false; + let mut valid_len = 0; + let mut e_idx = 0; + for (i, c) in s.chars().enumerate() { + if c == '+' || c == '-' { + if i != 0 && (e_idx == 0 || i != e_idx + 1) { + // "1e+1" is valid. + break; + } + } else if c == '.' { + if saw_dot || e_idx > 0 { + // "1.1." or "1e1.1" + break; + } + saw_dot = true; + if saw_digit { + // "123." is valid. + valid_len = i + 1; + } + } else if c == 'e' || c == 'E' { + if !saw_digit { + // "+.e" + break; + } + if e_idx != 0 { + // "1e5e" + break; + } + e_idx = i + } else if !('0'..='9').contains(&c) { break; - } - saw_dot = true; - if saw_digit { - // "123." is valid. 
+ } else { + saw_digit = true; valid_len = i + 1; } - } else if c == 'e' || c == 'E' { - if !saw_digit { - // "+.e" - break; - } - if e_idx != 0 { - // "1e5e" - break; - } - e_idx = i - } else if !('0'..='9').contains(&c) { - break; + } + if valid_len == 0 || valid_len < s.len() { + ctx.handle_truncate_err(Error::truncated_wrong_val("INTEGER", s))?; + } + if valid_len == 0 { + Ok("0") } else { - saw_digit = true; - valid_len = i + 1; + Ok(&s[..valid_len]) } } - if valid_len == 0 || valid_len < s.len() { - ctx.handle_truncate_err(Error::truncated_wrong_val("INTEGER", s))?; - } - if valid_len == 0 { - Ok("0") - } else { - Ok(&s[..valid_len]) - } } /// the `s` must be a valid int_str @@ -937,14 +970,14 @@ fn round_int_str(num_next_dot: char, s: &str) -> Cow<'_, str> { } /// It converts a valid float string into valid integer string which can be -/// parsed by `i64::from_str`, we can't parse float first then convert it to string -/// because precision will be lost. +/// parsed by `i64::from_str`, we can't parse float first then convert it to +/// string because precision will be lost. /// /// When the float string indicating a value that is overflowing the i64, /// the original float string is returned and an overflow warning is attached. /// -/// This func will find serious overflow such as the len of result > 20 (without prefix `+/-`) -/// however, it will not check whether the result overflow BIGINT. +/// This func will find serious overflow such as the len of result > 20 (without +/// prefix `+/-`) however, it will not check whether the result overflow BIGINT. fn float_str_to_int_string<'a>(ctx: &mut EvalContext, valid_float: &'a str) -> Cow<'a, str> { // this func is complex, to make it same as TiDB's version, // we impl it like TiDB's version(https://github.com/pingcap/tidb/blob/9b521342bf/types/convert.go#L400) @@ -1003,7 +1036,7 @@ fn exp_float_str_to_int_str<'a>( // And the intCnt may contain the len of `+/-`, // so here we use 21 here as the early detection. 
ctx.warnings - .append_warning(Error::overflow("BIGINT", &valid_float)); + .append_warning(Error::overflow("BIGINT", valid_float)); return Cow::Borrowed(valid_float); } if int_cnt <= 0 { @@ -1507,7 +1540,8 @@ mod tests { ("{}", ERR_TRUNCATE_WRONG_VALUE), ("[]", ERR_TRUNCATE_WRONG_VALUE), ]; - // avoid to use EvalConfig::default_for_test() that set Flag::IGNORE_TRUNCATE as true + // avoid to use EvalConfig::default_for_test() that set Flag::IGNORE_TRUNCATE as + // true let mut ctx = EvalContext::new(Arc::new(EvalConfig::new())); for (jstr, exp) in test_cases { let json: Json = jstr.parse().unwrap(); @@ -1555,7 +1589,7 @@ mod tests { // SHOULD_CLIP_TO_ZERO let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag(Flag::IN_INSERT_STMT))); let r = (-12345_i64).to_uint(&mut ctx, FieldTypeTp::LongLong); - assert!(r.is_err()); + r.unwrap_err(); // SHOULD_CLIP_TO_ZERO | OVERFLOW_AS_WARNING let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag( @@ -1841,7 +1875,8 @@ mod tests { ("{}", ERR_TRUNCATE_WRONG_VALUE), ("[]", ERR_TRUNCATE_WRONG_VALUE), ]; - // avoid to use EvalConfig::default_for_test() that set Flag::IGNORE_TRUNCATE as true + // avoid to use EvalConfig::default_for_test() that set Flag::IGNORE_TRUNCATE as + // true let mut ctx = EvalContext::new(Arc::new(EvalConfig::new())); for (jstr, exp) in test_cases { let json: Json = jstr.parse().unwrap(); @@ -1893,11 +1928,11 @@ mod tests { // test overflow let mut ctx = EvalContext::default(); let val: Result = f64::INFINITY.to_string().as_bytes().convert(&mut ctx); - assert!(val.is_err()); + val.unwrap_err(); let mut ctx = EvalContext::default(); let val: Result = f64::NEG_INFINITY.to_string().as_bytes().convert(&mut ctx); - assert!(val.is_err()); + val.unwrap_err(); // TRUNCATE_AS_WARNING let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag(Flag::TRUNCATE_AS_WARNING))); @@ -1930,20 +1965,17 @@ mod tests { let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag(Flag::TRUNCATE_AS_WARNING))); let 
val: Result = b"".to_vec().convert(&mut ctx); - assert!(val.is_ok()); assert_eq!(val.unwrap(), 0.0); assert_eq!(ctx.warnings.warnings.len(), 1); let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag(Flag::TRUNCATE_AS_WARNING))); let val: Result = b"1.1a".to_vec().convert(&mut ctx); - assert!(val.is_ok()); assert_eq!(val.unwrap(), 1.1); assert_eq!(ctx.warnings.warnings.len(), 1); // IGNORE_TRUNCATE let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag(Flag::IGNORE_TRUNCATE))); let val: Result = b"1.2a".to_vec().convert(&mut ctx); - assert!(val.is_ok()); assert_eq!(val.unwrap(), 1.2); assert_eq!(ctx.warnings.warnings.len(), 0); } @@ -1984,28 +2016,48 @@ mod tests { fn test_get_valid_float_prefix() { let cases = vec![ ("-100", "-100"), + ("1.", "1."), + (".1", ".1"), + ("123.23E-10", "123.23E-10"), + ]; + + let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag( + Flag::TRUNCATE_AS_WARNING | Flag::OVERFLOW_AS_WARNING, + ))); + for (i, o) in cases { + assert_eq!(super::get_valid_float_prefix(&mut ctx, i).unwrap(), o); + } + assert_eq!(ctx.take_warnings().warnings.len(), 0); + + let warning_cases = vec![ ("1abc", "1"), ("-1-1", "-1"), ("+1+1", "+1"), ("123..34", "123."), - ("123.23E-10", "123.23E-10"), ("1.1e1.3", "1.1e1"), ("11e1.3", "11e1"), ("1.1e-13a", "1.1e-13"), - ("1.", "1."), - (".1", ".1"), - ("", "0"), ("123e+", "123"), ("123.e", "123."), ("1-1-", "1"), ("11-1-", "11"), ("-1-1-", "-1"), + ("", "0"), ]; - - let mut ctx = EvalContext::new(Arc::new(EvalConfig::default_for_test())); - for (i, o) in cases { + let warning_cnt = warning_cases.len(); + for (i, o) in warning_cases.clone() { assert_eq!(super::get_valid_float_prefix(&mut ctx, i).unwrap(), o); } + assert_eq!(ctx.take_warnings().warnings.len(), warning_cnt); + + // Test is cast expr. 
+ for (i, o) in warning_cases.clone() { + assert_eq!( + super::get_valid_float_prefix_helper(&mut ctx, i, true).unwrap(), + o + ); + } + assert_eq!(ctx.take_warnings().warnings.len(), warning_cnt - 1); } #[test] @@ -2045,7 +2097,8 @@ mod tests { assert_eq!(o.unwrap(), i); } - // Secondly, make sure warnings are attached when the float string cannot be casted to a valid int string + // Secondly, make sure warnings are attached when the float string cannot be + // casted to a valid int string let warnings = ctx.take_warnings().warnings; assert_eq!(warnings.len(), 2); for warning in warnings { @@ -2093,11 +2146,8 @@ mod tests { } assert_eq!(ctx.take_warnings().warnings.len(), 0); - let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag( - Flag::IN_SELECT_STMT | Flag::IGNORE_TRUNCATE | Flag::OVERFLOW_AS_WARNING, - ))); + let mut ctx = EvalContext::new(Arc::new(EvalConfig::default_for_test())); let cases = vec![ - ("+0.0", "+0"), ("100", "100"), ("+100", "+100"), ("-100", "-100"), @@ -2108,10 +2158,18 @@ mod tests { ]; for (i, e) in cases { - let o = super::get_valid_int_prefix(&mut ctx, i); + let o = super::get_valid_int_prefix_helper(&mut ctx, i, true); assert_eq!(o.unwrap(), *e, "{}, {}", i, e); } assert_eq!(ctx.take_warnings().warnings.len(), 0); + + let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag(Flag::TRUNCATE_AS_WARNING))); + let cases = vec![("+0.0", "+0"), ("0.5", "0"), ("+0.5", "+0")]; + for (i, e) in cases { + let o = super::get_valid_int_prefix_helper(&mut ctx, i, true); + assert_eq!(o.unwrap(), *e, "{}, {}", i, e); + } + assert_eq!(ctx.take_warnings().warnings.len(), 3); } #[test] @@ -2295,9 +2353,7 @@ mod tests { for (dec, flen, decimal, want) in cases { ft.set_flen(flen); ft.set_decimal(decimal); - let nd = produce_dec_with_specified_tp(&mut ctx, dec, &ft); - assert!(nd.is_ok()); - let nd = nd.unwrap(); + let nd = produce_dec_with_specified_tp(&mut ctx, dec, &ft).unwrap(); assert_eq!(nd, want, "{}, {}, {}, {}, {}", dec, nd, want, 
flen, decimal); } } @@ -2310,8 +2366,8 @@ mod tests { // origin, // (origin_flen, origin_decimal), (res_flen, res_decimal), is_unsigned, // expect, warning_err_code, - // ((InInsertStmt || InUpdateStmt || InDeleteStmt), overflow_as_warning, truncate_as_warning) - // ) + // ((InInsertStmt || InUpdateStmt || InDeleteStmt), overflow_as_warning, + // truncate_as_warning) ) // // The origin_flen, origin_decimal field is to // let the programmer clearly know what the flen and decimal of the decimal is. @@ -2597,7 +2653,8 @@ mod tests { // zero // FIXME: // according to Decimal::prec_and_frac, - // the decimals' prec(the number of all digits) and frac(the number of digit after number point) are + // the decimals' prec(the number of all digits) and frac(the number of digit after + // number point) are: // Decimal::zero()'s is (1, 0) // Decimal::from_bytes(b"00.00")'s is (2, 2) // Decimal::from_bytes(b"000.00")'s is (2, 2) diff --git a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_bytes.rs b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_bytes.rs index 7086e97c23b..c4f5abbc122 100644 --- a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_bytes.rs +++ b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_bytes.rs @@ -13,11 +13,11 @@ pub struct ChunkedVecBytes { /// A vector storing `Option` with a compact layout. /// -/// Inside `ChunkedVecBytes`, `bitmap` indicates if an element at given index is null, -/// and `data` stores actual data. Bytes data are stored adjacent to each other in -/// `data`. If element at a given index is null, then it takes no space in `data`. -/// Otherwise, contents of the `Bytes` are stored, and `var_offset` indicates the starting -/// position of each element. +/// Inside `ChunkedVecBytes`, `bitmap` indicates if an element at given index is +/// null, and `data` stores actual data. Bytes data are stored adjacent to each +/// other in `data`. 
If element at a given index is null, then it takes no space +/// in `data`. Otherwise, contents of the `Bytes` are stored, and `var_offset` +/// indicates the starting position of each element. impl ChunkedVecBytes { #[inline] pub fn push_data_ref(&mut self, value: BytesRef<'_>) { @@ -177,7 +177,7 @@ impl BytesWriter { } } -impl<'a> PartialBytesWriter { +impl PartialBytesWriter { pub fn partial_write(&mut self, data: BytesRef<'_>) { self.chunked_vec.data.extend_from_slice(data); } diff --git a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_json.rs b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_json.rs index 52279c5a439..9ef17dc61eb 100644 --- a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_json.rs +++ b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_json.rs @@ -7,11 +7,12 @@ use crate::impl_chunked_vec_common; /// A vector storing `Option` with a compact layout. /// -/// Inside `ChunkedVecJson`, `bitmap` indicates if an element at given index is null, -/// and `data` stores actual data. Json data are stored adjacent to each other in -/// `data`. If element at a given index is null, then it takes no space in `data`. -/// Otherwise, a one byte `json_type` and variable size json data is stored in `data`, -/// and `var_offset` indicates the starting position of each element. +/// Inside `ChunkedVecJson`, `bitmap` indicates if an element at given index is +/// null, and `data` stores actual data. Json data are stored adjacent to each +/// other in `data`. If element at a given index is null, then it takes no space +/// in `data`. Otherwise, a one byte `json_type` and variable size json data is +/// stored in `data`, and `var_offset` indicates the starting position of each +/// element. 
#[derive(Debug, PartialEq, Clone)] pub struct ChunkedVecJson { data: Vec, diff --git a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_set.rs b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_set.rs index 41b523391c2..1a3f6838e96 100644 --- a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_set.rs +++ b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_set.rs @@ -20,7 +20,8 @@ use crate::impl_chunked_vec_common; /// stored representation issue /// /// TODO: add way to set set column data -/// TODO: code fot set/enum looks nearly the same, considering refactor them using macro +/// TODO: code fot set/enum looks nearly the same, considering refactor them +/// using macro #[derive(Debug, Clone)] pub struct ChunkedVecSet { data: Arc, diff --git a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_sized.rs b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_sized.rs index 45e2665ec31..4f614d00be0 100644 --- a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_sized.rs +++ b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_sized.rs @@ -9,10 +9,11 @@ use crate::impl_chunked_vec_common; /// in that structure itself. This includes `Int`, `Real`, `Decimal`, /// `DateTime` and `Duration` in copr framework. /// -/// Inside `ChunkedVecSized`, `bitmap` indicates if an element at given index is null, -/// and `data` stores actual data. If the element at given index is null (or `None`), -/// the corresponding `bitmap` bit is false, and `data` stores zero value for -/// that element. Otherwise, `data` stores actual data, and `bitmap` bit is true. +/// Inside `ChunkedVecSized`, `bitmap` indicates if an element at given index is +/// null, and `data` stores actual data. If the element at given index is null +/// (or `None`), the corresponding `bitmap` bit is false, and `data` stores zero +/// value for that element. 
Otherwise, `data` stores actual data, and `bitmap` +/// bit is true. #[derive(Debug, PartialEq, Clone)] pub struct ChunkedVecSized { data: Vec, diff --git a/components/tidb_query_datatype/src/codec/data_type/logical_rows.rs b/components/tidb_query_datatype/src/codec/data_type/logical_rows.rs index d27a030b817..46b5a64b010 100644 --- a/components/tidb_query_datatype/src/codec/data_type/logical_rows.rs +++ b/components/tidb_query_datatype/src/codec/data_type/logical_rows.rs @@ -1,6 +1,7 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -// TODO: This value is chosen based on MonetDB/X100's research without our own benchmarks. +// TODO: This value is chosen based on MonetDB/X100's research without our own +// benchmarks. pub const BATCH_MAX_SIZE: usize = 1024; /// Identical logical row is a special case in expression evaluation that diff --git a/components/tidb_query_datatype/src/codec/data_type/mod.rs b/components/tidb_query_datatype/src/codec/data_type/mod.rs index 8397a8d2ab5..8ca36790824 100644 --- a/components/tidb_query_datatype/src/codec/data_type/mod.rs +++ b/components/tidb_query_datatype/src/codec/data_type/mod.rs @@ -50,51 +50,51 @@ pub use crate::codec::mysql::{ }; use crate::{codec::convert::ConvertTo, expr::EvalContext, EvalType}; -/// A trait of evaluating current concrete eval type into a MySQL logic value, represented by -/// Rust's `bool` type. -pub trait AsMySQLBool { +/// A trait of evaluating current concrete eval type into a MySQL logic value, +/// represented by Rust's `bool` type. +pub trait AsMySqlBool { /// Evaluates into a MySQL logic value. 
fn as_mysql_bool(&self, context: &mut EvalContext) -> Result; } -impl AsMySQLBool for Int { +impl AsMySqlBool for Int { #[inline] fn as_mysql_bool(&self, _context: &mut EvalContext) -> Result { Ok(*self != 0) } } -impl AsMySQLBool for Real { +impl AsMySqlBool for Real { #[inline] fn as_mysql_bool(&self, _context: &mut EvalContext) -> Result { Ok(self.into_inner() != 0f64) } } -impl<'a, T: AsMySQLBool> AsMySQLBool for &'a T { +impl<'a, T: AsMySqlBool> AsMySqlBool for &'a T { #[inline] fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { (**self).as_mysql_bool(context) } } -impl AsMySQLBool for Bytes { +impl AsMySqlBool for Bytes { #[inline] fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { self.as_slice().as_mysql_bool(context) } } -impl<'a> AsMySQLBool for BytesRef<'a> { +impl<'a> AsMySqlBool for BytesRef<'a> { #[inline] fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { Ok(!self.is_empty() && ConvertTo::::convert(self, context)? != 0f64) } } -impl<'a, T> AsMySQLBool for Option<&'a T> +impl<'a, T> AsMySqlBool for Option<&'a T> where - T: AsMySQLBool, + T: AsMySqlBool, { fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { match self { @@ -104,25 +104,25 @@ where } } -impl<'a> AsMySQLBool for JsonRef<'a> { +impl<'a> AsMySqlBool for JsonRef<'a> { fn as_mysql_bool(&self, _context: &mut EvalContext) -> Result { Ok(!self.is_zero()) } } -impl<'a> AsMySQLBool for EnumRef<'a> { +impl<'a> AsMySqlBool for EnumRef<'a> { fn as_mysql_bool(&self, _context: &mut EvalContext) -> Result { Ok(!self.is_empty()) } } -impl<'a> AsMySQLBool for SetRef<'a> { +impl<'a> AsMySqlBool for SetRef<'a> { fn as_mysql_bool(&self, _context: &mut EvalContext) -> Result { Ok(!self.is_empty()) } } -impl<'a> AsMySQLBool for Option> { +impl<'a> AsMySqlBool for Option> { fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { match self { None => Ok(false), @@ -131,7 +131,7 @@ impl<'a> AsMySQLBool for Option> { } } -impl<'a> AsMySQLBool for Option> { 
+impl<'a> AsMySqlBool for Option> { fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { match self { None => Ok(false), @@ -140,7 +140,7 @@ impl<'a> AsMySQLBool for Option> { } } -impl<'a> AsMySQLBool for Option> { +impl<'a> AsMySqlBool for Option> { fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { match self { None => Ok(false), @@ -149,7 +149,7 @@ impl<'a> AsMySQLBool for Option> { } } -impl<'a> AsMySQLBool for Option> { +impl<'a> AsMySqlBool for Option> { fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { match self { None => Ok(false), @@ -187,27 +187,28 @@ pub trait Evaluable: Clone + std::fmt::Debug + Send + Sync + 'static { /// panics if the varient mismatches. fn borrow_scalar_value_ref(v: ScalarValueRef<'_>) -> Option<&Self>; - /// Borrows a slice of this concrete type from a `VectorValue` in the same type; - /// panics if the varient mismatches. + /// Borrows a slice of this concrete type from a `VectorValue` in the same + /// type; panics if the varient mismatches. fn borrow_vector_value(v: &VectorValue) -> &ChunkedVecSized; } pub trait EvaluableRet: Clone + std::fmt::Debug + Send + Sync + 'static { const EVAL_TYPE: EvalType; type ChunkedType: ChunkedVec; - /// Converts a vector of this concrete type into a `VectorValue` in the same type; - /// panics if the varient mismatches. + /// Converts a vector of this concrete type into a `VectorValue` in the same + /// type; panics if the varient mismatches. fn cast_chunk_into_vector_value(vec: Self::ChunkedType) -> VectorValue; } /// # Notes /// -/// Make sure operating `bitmap` and `value` together, so while `bitmap` is 0 and the -/// corresponding value is None. +/// Make sure operating `bitmap` and `value` together, so while `bitmap` is 0 +/// and the corresponding value is None. 
/// /// With this guaranty, we can avoid the following issue: /// -/// For Data [Some(1), Some(2), None], we could have different stored representation: +/// For Data [Some(1), Some(2), None], we could have different stored +/// representation: /// /// Bitmap: 110, Value: 1, 2, 0 /// Bitmap: 110, Value: 1, 2, 1 @@ -368,8 +369,8 @@ pub trait EvaluableRef<'a>: Clone + std::fmt::Debug + Send + Sync { /// panics if the varient mismatches. fn borrow_scalar_value_ref(v: ScalarValueRef<'a>) -> Option; - /// Borrows a slice of this concrete type from a `VectorValue` in the same type; - /// panics if the varient mismatches. + /// Borrows a slice of this concrete type from a `VectorValue` in the same + /// type; panics if the varient mismatches. fn borrow_vector_value(v: &'a VectorValue) -> Self::ChunkedType; /// Convert this reference to owned type @@ -409,7 +410,7 @@ impl<'a, T: Evaluable + EvaluableRet> EvaluableRef<'a> for &'a T { } } -impl<'a, A: UnsafeRefInto, B> UnsafeRefInto> for Option { +impl, B> UnsafeRefInto> for Option { unsafe fn unsafe_into(self) -> Option { self.map(|x| x.unsafe_into()) } @@ -697,7 +698,7 @@ mod tests { .as_bytes() .to_vec() .as_mysql_bool(&mut ctx); - assert!(val.is_err()); + val.unwrap_err(); let mut ctx = EvalContext::default(); let val: Result = f64::NEG_INFINITY @@ -705,7 +706,7 @@ mod tests { .as_bytes() .to_vec() .as_mysql_bool(&mut ctx); - assert!(val.is_err()); + val.unwrap_err(); } #[test] diff --git a/components/tidb_query_datatype/src/codec/data_type/scalar.rs b/components/tidb_query_datatype/src/codec/data_type/scalar.rs index 7bf36935f3b..c74423107e4 100644 --- a/components/tidb_query_datatype/src/codec/data_type/scalar.rs +++ b/components/tidb_query_datatype/src/codec/data_type/scalar.rs @@ -13,17 +13,19 @@ use crate::{ /// A scalar value container, a.k.a. datum, for all concrete eval types. /// -/// In many cases, for example, at the framework level, the concrete eval type is unknown at compile -/// time. 
So we use this enum container to represent types dynamically. It is similar to trait -/// object `Box` where `T` is a concrete eval type but faster. +/// In many cases, for example, at the framework level, the concrete eval type +/// is unknown at compile time. So we use this enum container to represent types +/// dynamically. It is similar to trait object `Box` where `T` is a concrete +/// eval type but faster. /// /// Like `VectorValue`, the inner concrete value is immutable. /// /// Compared to `VectorValue`, it only contains a single concrete value. -/// Compared to `Datum`, it is a newer encapsulation that naturally wraps `Option<..>`. +/// Compared to `Datum`, it is a newer encapsulation that naturally wraps +/// `Option<..>`. /// -/// TODO: Once we removed the `Option<..>` wrapper, it will be much like `Datum`. At that time, -/// we only need to preserve one of them. +/// TODO: Once we removed the `Option<..>` wrapper, it will be much like +/// `Datum`. At that time, we only need to preserve one of them. #[derive(Clone, Debug, PartialEq)] pub enum ScalarValue { Int(Option), @@ -81,7 +83,7 @@ impl ScalarValue { } } -impl AsMySQLBool for ScalarValue { +impl AsMySqlBool for ScalarValue { #[inline] fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { match_template_evaltype! { @@ -160,6 +162,14 @@ impl From for ScalarValue { } } +impl From<&str> for ScalarValue { + #[inline] + fn from(s: &str) -> ScalarValue { + let bytes = Bytes::from(s); + ScalarValue::Bytes(Some(bytes)) + } +} + impl From for Option { #[inline] fn from(s: ScalarValue) -> Option { @@ -170,7 +180,8 @@ impl From for Option { } } -/// A scalar value reference container. Can be created from `ScalarValue` or `VectorValue`. +/// A scalar value reference container. Can be created from `ScalarValue` or +/// `VectorValue`. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum ScalarValueRef<'a> { Int(Option<&'a super::Int>), @@ -398,6 +409,34 @@ impl_as_ref! 
{ Decimal, as_decimal } impl_as_ref! { DateTime, as_date_time } impl_as_ref! { Duration, as_duration } +impl ScalarValue { + #[inline] + pub fn as_enum(&self) -> Option> { + match self { + ScalarValue::Enum(x) => x.as_ref().map(|x| x.as_ref()), + other => panic!( + "Cannot cast {} scalar value into {}", + other.eval_type(), + stringify!(Int), + ), + } + } +} + +impl ScalarValue { + #[inline] + pub fn as_set(&self) -> Option> { + match self { + ScalarValue::Set(x) => x.as_ref().map(|x| x.as_ref()), + other => panic!( + "Cannot cast {} scalar value into {}", + other.eval_type(), + stringify!(Int), + ), + } + } +} + impl ScalarValue { #[inline] pub fn as_json(&self) -> Option> { diff --git a/components/tidb_query_datatype/src/codec/data_type/vector.rs b/components/tidb_query_datatype/src/codec/data_type/vector.rs index d26067d8219..49a4e3a1cff 100644 --- a/components/tidb_query_datatype/src/codec/data_type/vector.rs +++ b/components/tidb_query_datatype/src/codec/data_type/vector.rs @@ -8,8 +8,8 @@ use crate::{ /// A vector value container, a.k.a. column, for all concrete eval types. /// -/// The inner concrete value is immutable. However it is allowed to push and remove values from -/// this vector container. +/// The inner concrete value is immutable. However it is allowed to push and +/// remove values from this vector container. #[derive(Debug, PartialEq, Clone)] pub enum VectorValue { Int(ChunkedVecSized), @@ -25,8 +25,8 @@ pub enum VectorValue { } impl VectorValue { - /// Creates an empty `VectorValue` according to `eval_tp` and reserves capacity according - /// to `capacity`. + /// Creates an empty `VectorValue` according to `eval_tp` and reserves + /// capacity according to `capacity`. #[inline] pub fn with_capacity(capacity: usize, eval_tp: EvalType) -> Self { match_template_evaltype! { @@ -116,9 +116,11 @@ impl VectorValue { self.len() == 0 } - /// Shortens the column, keeping the first `len` datums and dropping the rest. 
+ /// Shortens the column, keeping the first `len` datums and dropping the + /// rest. /// - /// If `len` is greater than the column's current length, this has no effect. + /// If `len` is greater than the column's current length, this has no + /// effect. #[inline] pub fn truncate(&mut self, len: usize) { match_template_evaltype! { @@ -134,7 +136,8 @@ impl VectorValue { self.truncate(0); } - /// Returns the number of elements this column can hold without reallocating. + /// Returns the number of elements this column can hold without + /// reallocating. #[inline] pub fn capacity(&self) -> usize { match_template_evaltype! { @@ -165,7 +168,8 @@ impl VectorValue { /// Evaluates values into MySQL logic values. /// - /// The caller must provide an output buffer which is large enough for holding values. + /// The caller must provide an output buffer which is large enough for + /// holding values. pub fn eval_as_mysql_bools( &self, ctx: &mut EvalContext, @@ -362,7 +366,7 @@ impl VectorValue { output.write_evaluable_datum_null()?; } Some(val) => { - output.write_evaluable_datum_decimal(*val)?; + output.write_evaluable_datum_decimal(val)?; } } Ok(()) @@ -464,7 +468,8 @@ impl VectorValue { macro_rules! impl_as_slice { ($ty:tt, $name:ident) => { impl VectorValue { - /// Extracts a slice of values in specified concrete type from current column. + /// Extracts a slice of values in specified concrete type from current + /// column. /// /// # Panics /// @@ -494,8 +499,9 @@ impl_as_slice! { Json, to_json_vec } impl_as_slice! { Enum, to_enum_vec } impl_as_slice! { Set, to_set_vec } -/// Additional `VectorValue` methods available via generics. These methods support different -/// concrete types but have same names and should be specified via the generic parameter type. +/// Additional `VectorValue` methods available via generics. These methods +/// support different concrete types but have same names and should be specified +/// via the generic parameter type. 
pub trait VectorValueExt { /// The generic version for `VectorValue::push_xxx()`. fn push(&mut self, v: Option); diff --git a/components/tidb_query_datatype/src/codec/datum.rs b/components/tidb_query_datatype/src/codec/datum.rs index a1cc6460ae2..dde98003475 100644 --- a/components/tidb_query_datatype/src/codec/datum.rs +++ b/components/tidb_query_datatype/src/codec/datum.rs @@ -24,7 +24,7 @@ use super::{ use crate::{ codec::{ convert::{ConvertTo, ToInt}, - data_type::AsMySQLBool, + data_type::AsMySqlBool, }, expr::EvalContext, FieldTypeTp, @@ -162,7 +162,8 @@ pub fn cmp_f64(l: f64, r: f64) -> Result { .ok_or_else(|| invalid_type!("{} and {} can't be compared", l, r)) } -/// `checked_add_i64` checks and adds `r` to the `l`. Return None if the sum is negative. +/// `checked_add_i64` checks and adds `r` to the `l`. Return None if the sum is +/// negative. #[inline] fn checked_add_i64(l: u64, r: i64) -> Option { if r >= 0 { @@ -908,8 +909,8 @@ pub trait DatumDecoder: NIL_FLAG => Datum::Null, FLOAT_FLAG => self.read_f64().map(Datum::F64)?, DURATION_FLAG => { - // Decode the i64 into `Duration` with `MAX_FSP`, then unflatten it with concrete - // `FieldType` information + // Decode the i64 into `Duration` with `MAX_FSP`, then unflatten it with + // concrete `FieldType` information let nanos = self.read_i64()?; let dur = Duration::from_nanos(nanos, MAX_FSP)?; Datum::Dur(dur) @@ -1010,7 +1011,7 @@ pub trait DatumEncoder: self.write_u8(JSON_FLAG)?; self.write_json(j.as_ref())?; } - //TODO: implement datum write here. + // TODO: implement datum write here. Datum::Enum(_) => unimplemented!(), Datum::Set(_) => unimplemented!(), } @@ -1073,7 +1074,8 @@ pub fn encode(ctx: &mut EvalContext, values: &[Datum], comparable: bool) -> Resu Ok(buf) } -/// `encode_key` encodes a datum slice into a memory comparable buffer as the key. +/// `encode_key` encodes a datum slice into a memory comparable buffer as the +/// key. 
pub fn encode_key(ctx: &mut EvalContext, values: &[Datum]) -> Result> { encode(ctx, values, true) } @@ -1134,7 +1136,8 @@ pub fn split_datum(buf: &[u8], desc: bool) -> Result<(&[u8], &[u8])> { /// `skip_n_datum_slices` skip `n` datum slices within `buf` /// and advances the buffer pointer. -/// If the datum buffer contains less than `n` slices, an error will be returned. +/// If the datum buffer contains less than `n` slices, an error will be +/// returned. pub fn skip_n(buf: &mut &[u8], n: usize) -> Result<()> { let origin = *buf; for i in 0..n { @@ -1957,7 +1960,7 @@ mod tests { ), (Datum::Bytes(b"[1, 2, 3]".to_vec()), "[1, 2, 3]"), (Datum::Bytes(b"{}".to_vec()), "{}"), - (Datum::I64(1), "true"), + (Datum::I64(1), "1"), ]; for (d, json) in tests { @@ -1972,7 +1975,7 @@ mod tests { ]; for d in illegal_cases { - assert!(d.cast_as_json().is_err()); + d.cast_as_json().unwrap_err(); } } @@ -1993,7 +1996,7 @@ mod tests { let illegal_cases = vec![Datum::Max, Datum::Min]; for d in illegal_cases { - assert!(d.into_json().is_err()); + d.into_json().unwrap_err(); } } diff --git a/components/tidb_query_datatype/src/codec/datum_codec.rs b/components/tidb_query_datatype/src/codec/datum_codec.rs index 6710029ec99..9d3f5058d0b 100644 --- a/components/tidb_query_datatype/src/codec/datum_codec.rs +++ b/components/tidb_query_datatype/src/codec/datum_codec.rs @@ -1,7 +1,8 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -//! The unified entry for encoding and decoding an evaluable type to / from datum bytes. -//! Datum bytes consists of 1 byte datum flag and variable bytes datum payload. +//! The unified entry for encoding and decoding an evaluable type to / from +//! datum bytes. Datum bytes consists of 1 byte datum flag and variable bytes +//! datum payload. 
use codec::prelude::*; use tipb::FieldType; diff --git a/components/tidb_query_datatype/src/codec/error.rs b/components/tidb_query_datatype/src/codec/error.rs index 9cb0ee50d18..785424b31ca 100644 --- a/components/tidb_query_datatype/src/codec/error.rs +++ b/components/tidb_query_datatype/src/codec/error.rs @@ -95,8 +95,8 @@ impl Error { } } - pub fn cannot_convert_string(charset: &str) -> Error { - let msg = format!("cannot convert string from binary to {}", charset); + pub fn cannot_convert_string(s: &str, charset: &str) -> Error { + let msg = format!("Cannot convert string {} from binary to {}", s, charset); Error::Eval(msg, ERR_CANNOT_CONVERT_STRING) } @@ -145,6 +145,10 @@ impl Error { ); Error::Eval(msg, ERR_INCORRECT_PARAMETERS) } + + pub fn regexp_error(msg: String) -> Error { + Error::Eval(msg, ERR_REGEXP) + } } impl From for tipb::Error { diff --git a/components/tidb_query_datatype/src/codec/mysql/binary_literal.rs b/components/tidb_query_datatype/src/codec/mysql/binary_literal.rs index 9904ead1098..3ab44ad40df 100644 --- a/components/tidb_query_datatype/src/codec/mysql/binary_literal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/binary_literal.rs @@ -44,9 +44,10 @@ pub fn to_uint(ctx: &mut EvalContext, bytes: &[u8]) -> Result { } impl BinaryLiteral { - /// from_u64 creates a new BinaryLiteral instance by the given uint value in BigEndian. - /// byte size will be used as the length of the new BinaryLiteral, with leading bytes filled to zero. - /// If byte size is -1, the leading zeros in new BinaryLiteral will be trimmed. + /// from_u64 creates a new BinaryLiteral instance by the given uint value in + /// BigEndian. byte size will be used as the length of the new + /// BinaryLiteral, with leading bytes filled to zero. If byte size is -1, + /// the leading zeros in new BinaryLiteral will be trimmed. 
pub fn from_u64(val: u64, byte_size: isize) -> Result { if byte_size != -1 && !(1..=8).contains(&byte_size) { return Err(box_err!("invalid byte size: {}", byte_size)); @@ -276,7 +277,7 @@ mod tests { } let lit = BinaryLiteral::from_u64(100, -2); - assert!(lit.is_err()); + lit.unwrap_err(); } #[test] @@ -462,12 +463,10 @@ mod tests { let mut ctx = EvalContext::default(); for (s, expected, err) in cs { if err { - assert!( - BinaryLiteral::from_hex_str(s) - .unwrap() - .to_uint(&mut ctx) - .is_err() - ); + BinaryLiteral::from_hex_str(s) + .unwrap() + .to_uint(&mut ctx) + .unwrap_err(); } else { let lit = BinaryLiteral::from_hex_str(s).unwrap(); assert_eq!(lit.to_uint(&mut ctx).unwrap(), expected) diff --git a/components/tidb_query_datatype/src/codec/mysql/charset.rs b/components/tidb_query_datatype/src/codec/mysql/charset.rs index 27ad1b2a44f..0ac2655c619 100644 --- a/components/tidb_query_datatype/src/codec/mysql/charset.rs +++ b/components/tidb_query_datatype/src/codec/mysql/charset.rs @@ -4,7 +4,8 @@ pub const CHARSET_BIN: &str = "binary"; /// `CHARSET_UTF8` is the default charset for string types. pub const CHARSET_UTF8: &str = "utf8"; -/// `CHARSET_UTF8MB4` represents 4 bytes utf8, which works the same way as utf8 in Rust. +/// `CHARSET_UTF8MB4` represents 4 bytes utf8, which works the same way as utf8 +/// in Rust. pub const CHARSET_UTF8MB4: &str = "utf8mb4"; /// `CHARSET_ASCII` is a subset of UTF8. 
pub const CHARSET_ASCII: &str = "ascii"; diff --git a/components/tidb_query_datatype/src/codec/mysql/decimal.rs b/components/tidb_query_datatype/src/codec/mysql/decimal.rs index 2eec85b7e34..143ec6c7760 100644 --- a/components/tidb_query_datatype/src/codec/mysql/decimal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/decimal.rs @@ -58,10 +58,11 @@ impl Res { matches!(*self, Res::Truncated(_)) } - /// Convert `Res` into `Result` with an `EvalContext` that handling the errors - /// If `truncated_err` is None, `ctx` will try to handle the default truncated error: `Error::truncated()`, - /// otherwise handle the specified error inside `truncated_err`. - /// Same does `overflow_err` means. + /// Convert `Res` into `Result` with an `EvalContext` that handling the + /// errors If `truncated_err` is None, `ctx` will try to handle the + /// default truncated error: `Error::truncated()`, otherwise handle the + /// specified error inside `truncated_err`. Same does `overflow_err` + /// means. fn into_result_impl( self, ctx: &mut EvalContext, @@ -186,7 +187,8 @@ pub fn dec_encoded_len(encoded: &[u8]) -> Result { Ok(int_len + frac_len + 2) } -/// `count_leading_zeroes` returns the number of leading zeroes that can be removed from int. +/// `count_leading_zeroes` returns the number of leading zeroes that can be +/// removed from int. fn count_leading_zeroes(i: u8, word: u32) -> u8 { let (mut c, mut i) = (0, i as usize); while TEN_POW[i] > word { @@ -196,7 +198,8 @@ fn count_leading_zeroes(i: u8, word: u32) -> u8 { c } -/// `count_trailing_zeroes` returns the number of trailing zeroes that can be removed from fraction. +/// `count_trailing_zeroes` returns the number of trailing zeroes that can be +/// removed from fraction. 
fn count_trailing_zeroes(i: u8, word: u32) -> u8 { let (mut c, mut i) = (0, i as usize); while word % TEN_POW[i] == 0 { @@ -259,14 +262,15 @@ fn sub2(lhs: u32, rhs: u32, carry: &mut i32, res: &mut u32) { type SubTmp = (usize, usize, u8); -/// calculate the carry for lhs - rhs, returns the carry and needed temporary results for -/// beginning a subtraction. +/// calculate the carry for lhs - rhs, returns the carry and needed temporary +/// results for beginning a subtraction. /// /// The new carry can be: /// 1. None if lhs is equals to rhs. /// 2. Some(0) if abs(lhs) > abs(rhs), /// 3. Some(1) if abs(lhs) < abs(rhs). -/// l_frac_word_cnt and r_frac_word_cnt do not contain the suffix 0 when r_int_word_cnt == l_int_word_cnt. +/// l_frac_word_cnt and r_frac_word_cnt do not contain the suffix 0 when +/// r_int_word_cnt == l_int_word_cnt. #[inline] fn calc_sub_carry(lhs: &Decimal, rhs: &Decimal) -> (Option, u8, SubTmp, SubTmp) { let (l_int_word_cnt, mut l_frac_word_cnt) = (word_cnt!(lhs.int_cnt), word_cnt!(lhs.frac_cnt)); @@ -303,9 +307,11 @@ fn calc_sub_carry(lhs: &Decimal, rhs: &Decimal) -> (Option, u8, SubTmp, Sub while r_idx as isize <= r_end && rhs.word_buf[r_end as usize] == 0 { r_end -= 1; } - // here l_end is the last nonzero index in l.word_buf, attention:it may in the range of (0,l_int_word_cnt) + // here l_end is the last nonzero index in l.word_buf, attention:it may in the + // range of (0,l_int_word_cnt) l_frac_word_cnt = cmp::max(0, l_end + 1 - l_stop as isize) as u8; - // here r_end is the last nonzero index in r.word_buf, attention:it may in the range of (0,r_int_word_cnt) + // here r_end is the last nonzero index in r.word_buf, attention:it may in the + // range of (0,r_int_word_cnt) r_frac_word_cnt = cmp::max(0, r_end + 1 - r_stop as isize) as u8; while l_idx as isize <= l_end && r_idx as isize <= r_end @@ -367,11 +373,11 @@ fn do_sub<'a>(mut lhs: &'a Decimal, mut rhs: &'a Decimal) -> Res { } let mut carry = 0; let mut res = res.map(|_| 
Decimal::new(int_cnt, frac_cnt, negative)); - let mut l_idx = l_start + l_int_word_cnt as usize + l_frac_word_cnt as usize; - let mut r_idx = r_start + r_int_word_cnt as usize + r_frac_word_cnt as usize; + let mut l_idx = l_start + l_int_word_cnt + l_frac_word_cnt as usize; + let mut r_idx = r_start + r_int_word_cnt + r_frac_word_cnt as usize; // adjust `l_idx` and `r_idx` to the same position of digits after the point. if l_frac_word_cnt > r_frac_word_cnt { - let l_stop = l_start + l_int_word_cnt as usize + r_frac_word_cnt as usize; + let l_stop = l_start + l_int_word_cnt + r_frac_word_cnt as usize; if l_frac_word_cnt < frac_word_to { // It happens only when suffix 0 exist(3.10000000000-2.00). idx_to -= (frac_word_to - l_frac_word_cnt) as usize; @@ -382,7 +388,7 @@ fn do_sub<'a>(mut lhs: &'a Decimal, mut rhs: &'a Decimal) -> Res { res.word_buf[idx_to] = lhs.word_buf[l_idx]; } } else { - let r_stop = r_start + r_int_word_cnt as usize + l_frac_word_cnt as usize; + let r_stop = r_start + r_int_word_cnt + l_frac_word_cnt as usize; if frac_word_to > r_frac_word_cnt { // It happens only when suffix 0 exist(3.00-2.00000000000). idx_to -= (frac_word_to - r_frac_word_cnt) as usize; @@ -796,7 +802,7 @@ fn do_mul(lhs: &Decimal, rhs: &Decimal) -> Res { word_cnt!(lhs.int_cnt + rhs.int_cnt) as usize, l_frac_word_cnt + r_frac_word_cnt, ); - let (mut old_int_word_to, mut old_frac_word_to) = (int_word_to as i32, frac_word_to as i32); + let (mut old_int_word_to, mut old_frac_word_to) = (int_word_to as i32, frac_word_to); let res = fix_word_cnt_err(int_word_to as u8, frac_word_to as u8, WORD_BUF_LEN); let (int_word_to, frac_word_to) = (res.0 as usize, res.1 as usize); let negative = lhs.negative != rhs.negative; @@ -976,10 +982,10 @@ impl Decimal { } /// Given a precision count 'prec', get: - /// 1. the index of first non-zero word in self.word_buf to hold the leading 'prec' number of - /// digits - /// 2. 
the number of remained digits if we remove all leading zeros for the leading 'prec' - /// number of digits + /// 1. the index of first non-zero word in self.word_buf to hold the + /// leading 'prec' number of digits + /// 2. the number of remained digits if we remove all leading zeros for the + /// leading 'prec' number of digits fn remove_leading_zeroes(&self, prec: u8) -> (usize, u8) { let mut cnt = prec; let mut i = ((cnt + DIGITS_PER_WORD - 1) % DIGITS_PER_WORD) + 1; @@ -1016,7 +1022,8 @@ impl Decimal { (buf, word_start_idx, int_len, int_cnt, frac_cnt) } - /// Get the least precision and fraction count to encode this decimal completely. + /// Get the least precision and fraction count to encode this decimal + /// completely. pub fn prec_and_frac(&self) -> (u8, u8) { let (_, int_cnt) = self.remove_leading_zeroes(self.int_cnt); let prec = int_cnt + self.frac_cnt; @@ -1338,8 +1345,9 @@ impl Decimal { dec } - /// `shift` shifts decimal digits in given number (with rounding if it need), - /// shift > 0 means shift to left shift, shift < 0 means right shift. + /// `shift` shifts decimal digits in given number (with rounding if it + /// need), shift > 0 means shift to left shift, shift < 0 means right + /// shift. /// /// In fact it is multiplying on 10^shift. 
pub fn shift(self, shift: isize) -> Res { @@ -1564,7 +1572,8 @@ impl Decimal { Decimal::from_bytes_with_word_buf(s, WORD_BUF_LEN) } - /// Returns a `Decimal` from a given bytes slice buffer and specified buffer length + /// Returns a `Decimal` from a given bytes slice buffer and specified buffer + /// length /// /// # Notes /// @@ -1574,7 +1583,7 @@ impl Decimal { fn from_bytes_with_word_buf(s: &[u8], word_buf_len: u8) -> Result> { // trim whitespace let mut bs = match s.iter().position(|c| !c.is_ascii_whitespace()) { - //TODO: return badnumber + // TODO: return badnumber None => return Err(box_err!("\"{}\" is empty", escape(s))), Some(pos) => &s[pos..], }; @@ -1614,11 +1623,11 @@ impl Decimal { let mut inner_idx = 0; let mut word_idx = int_word_cnt as usize; let mut word = 0; - for c in bs[int_idx - int_cnt as usize..int_idx].iter().rev() { + for c in bs[int_idx - int_cnt..int_idx].iter().rev() { word += u32::from(c - b'0') * TEN_POW[inner_idx]; inner_idx += 1; if inner_idx == DIGITS_PER_WORD as usize { - //TODO overflow + // TODO overflow word_idx -= 1; d.word_buf[word_idx] = word; word = 0; @@ -1633,7 +1642,7 @@ impl Decimal { word_idx = int_word_cnt as usize; word = 0; inner_idx = 0; - for &c in bs.iter().skip(int_idx + 1).take(frac_cnt as usize) { + for &c in bs.iter().skip(int_idx + 1).take(frac_cnt) { word = u32::from(c - b'0') + word * 10; inner_idx += 1; if inner_idx == DIGITS_PER_WORD as usize { @@ -1934,7 +1943,7 @@ impl Display for Decimal { } } -impl crate::codec::data_type::AsMySQLBool for Decimal { +impl crate::codec::data_type::AsMySqlBool for Decimal { #[inline] fn as_mysql_bool(&self, _ctx: &mut EvalContext) -> crate::codec::Result { Ok(!self.is_zero()) @@ -2245,7 +2254,8 @@ pub trait DecimalDecoder: NumberDecoder { Ok(d) } - /// `read_decimal_from_chunk` decode Decimal encoded by `write_decimal_to_chunk`. + /// `read_decimal_from_chunk` decode Decimal encoded by + /// `write_decimal_to_chunk`. 
fn read_decimal_from_chunk(&mut self) -> Result { let buf = self.read_bytes(DECIMAL_STRUCT_SIZE)?; let d = unsafe { @@ -2379,7 +2389,7 @@ impl Hash for Decimal { while idx < stop && self.word_buf[idx] == 0 { idx += 1; } - let start = idx as usize; + let start = idx; let int_word_cnt = stop - idx; int_word_cnt.hash(state); @@ -2457,12 +2467,15 @@ mod tests { Ok(Decimal::from_str("-18446744073709552000").unwrap()), ), // FIXME: because of rust's bug, - // (1<<64)(18446744073709551616), (1<<65)(36893488147419103232) can not be represent by f64 - // so these cases can not pass + // (1<<64)(18446744073709551616), (1<<65)(36893488147419103232) can not be represent + // by f64 so these cases can not pass // (18446744073709551616.0, Ok(Decimal::from_str("18446744073709551616").unwrap())), // (-18446744073709551616.0, Ok(Decimal::from_str("-18446744073709551616").unwrap())), // (36893488147419103000.0, Ok(Decimal::from_str("36893488147419103000.0").unwrap())), - // (-36893488147419103000.0, Ok(Decimal::from_str("-36893488147419103000.0").unwrap())), + // ( + // -36893488147419103000.0, + // Ok(Decimal::from_str("-36893488147419103000.0").unwrap()) + // ), ( 36893488147419103000.0, Ok(Decimal::from_str("36893488147419103000.0").unwrap()), @@ -3032,7 +3045,7 @@ mod tests { // error cases let cases = vec![b"1e18446744073709551620"]; for case in cases { - assert!(Decimal::from_bytes(case).is_err()); + Decimal::from_bytes(case).unwrap_err(); } } @@ -3721,11 +3734,9 @@ mod tests { ))); let truncated_res = Res::Truncated(2333); - assert!( - truncated_res - .into_result_impl(&mut ctx, Some(Error::truncated()), None) - .is_ok() - ); + truncated_res + .into_result_impl(&mut ctx, Some(Error::truncated()), None) + .unwrap(); // Overflow cases let mut ctx = EvalContext::default(); @@ -3744,10 +3755,8 @@ mod tests { Flag::OVERFLOW_AS_WARNING, ))); let error = Error::overflow("", ""); - assert!( - overflow_res - .into_result_impl(&mut ctx, None, Some(error)) - .is_ok() - ); + overflow_res 
+ .into_result_impl(&mut ctx, None, Some(error)) + .unwrap(); } } diff --git a/components/tidb_query_datatype/src/codec/mysql/duration.rs b/components/tidb_query_datatype/src/codec/mysql/duration.rs index 997983c2e49..7279f788146 100644 --- a/components/tidb_query_datatype/src/codec/mysql/duration.rs +++ b/components/tidb_query_datatype/src/codec/mysql/duration.rs @@ -81,7 +81,7 @@ fn check_nanos_part(nanos: u32) -> Result { #[inline] fn check_nanos(nanos: i64) -> Result { - if nanos < -MAX_NANOS || nanos > MAX_NANOS { + if !(-MAX_NANOS..=MAX_NANOS).contains(&nanos) { Err(Error::truncated_wrong_val("NANOS", nanos)) } else { Ok(nanos) @@ -150,28 +150,35 @@ mod parser { Ok((rest, hhmmss)) } - fn hhmmss_datetime<'a>( - ctx: &mut EvalContext, - input: &'a str, - fsp: u8, - ) -> IResult<&'a str, Duration, ()> { + /// A string can match datetime format only if it starts with a series of + /// digits whose length matches the full format of DateTime literal (12, + /// 14) or the string starts with a date literal. + fn format_can_match_datetime(input: &str) -> IResult<(), (), ()> { let (rest, digits) = digit1(input)?; + if digits.len() == 12 || digits.len() == 14 { - let datetime = DateTime::parse_datetime(ctx, input, fsp as i8, true) - .map_err(|_| nom::Err::Error(()))?; - return Ok(("", datetime.convert(ctx).map_err(|_| nom::Err::Error(()))?)); + return Ok(((), ())); } + let (rest, _) = anysep(rest)?; let (rest, _) = digit1(rest)?; let (rest, _) = anysep(rest)?; let (rest, _) = digit1(rest)?; - let has_datetime_sep = matches!(rest.chars().next(), Some(c) if c == 'T' || c == ' '); - - if !has_datetime_sep { - return Err(nom::Err::Error(())); + if matches!(rest.chars().next(), Some(c) if c == 'T' || c == ' ') { + Ok(((), ())) + } else { + Err(nom::Err::Error(())) } + } + /// Caller should make sure the input string can match datetime format + /// according to `format_can_match_datetime`. 
+ fn hhmmss_datetime<'a>( + ctx: &mut EvalContext, + input: &'a str, + fsp: u8, + ) -> IResult<&'a str, Duration, ()> { let datetime = DateTime::parse_datetime(ctx, input, fsp as i8, true) .map_err(|_| nom::Err::Error(()))?; Ok(("", datetime.convert(ctx).map_err(|_| nom::Err::Error(()))?)) @@ -208,16 +215,21 @@ mod parser { ctx: &mut EvalContext, input: &str, fsp: u8, - fallback_to_daytime: bool, + fallback_to_datetime: bool, overflow_as_null: bool, ) -> Option { let input = input.trim(); if input.is_empty() { - return Some(Duration::zero()); + return None; } let (rest, neg) = negative(input).ok()?; let (rest, _) = space0::<_, ()>(rest).ok()?; + + let chars_len = rest.len(); + let mut truncated_parse = false; + let fallback_to_datetime = fallback_to_datetime && format_can_match_datetime(rest).is_ok(); + let duration = day_hhmmss(rest) .ok() .and_then(|(rest, (day, [hh, mm, ss]))| { @@ -230,7 +242,10 @@ mod parser { let (rest, frac) = fraction(rest, fsp).ok()?; if !rest.is_empty() { - return None; + if chars_len >= 12 { + return None; + } + truncated_parse = true; } Some(Duration::new_from_parts( @@ -238,8 +253,18 @@ mod parser { )) }); + // In order to keep compatible with TiDB, when input string can only be + // partially parsed by `hhmmss_compact` and it can match the datetime + // format, we fallback to parse it using datetime format. 
+ if truncated_parse && fallback_to_datetime { + return hhmmss_datetime(ctx, rest, fsp).map_or(None, |(_, duration)| Some(duration)); + } + match duration { - Some(Ok(duration)) => Some(duration), + Some(Ok(duration)) => { + let _ = ctx.handle_truncate(truncated_parse); + Some(duration) + } Some(Err(err)) if err.is_overflow() => { if overflow_as_null { return None; @@ -249,7 +274,7 @@ mod parser { Some(Duration { nanos, fsp }) }) } - None if fallback_to_daytime => { + None if fallback_to_datetime => { hhmmss_datetime(ctx, rest, fsp).map_or(None, |(_, duration)| Some(duration)) } _ => None, @@ -339,7 +364,8 @@ impl Duration { } /// Returns the number of seconds contained by this Duration as f64. - /// The returned value does include the fractional (nanosecond) part of the duration. + /// The returned value does include the fractional (nanosecond) part of the + /// duration. #[inline] pub fn to_secs_f64(self) -> f64 { self.nanos as f64 / NANOS_PER_SEC as f64 @@ -483,7 +509,8 @@ impl Duration { Ok(Duration { nanos, fsp }) } - /// Checked duration addition. Computes self + rhs, returning None if overflow occurred. + /// Checked duration addition. Computes self + rhs, returning None if + /// overflow occurred. pub fn checked_add(self, rhs: Duration) -> Option { let nanos = self.nanos.checked_add(rhs.nanos)?; check_nanos(nanos).ok()?; @@ -493,7 +520,8 @@ impl Duration { }) } - /// Checked duration subtraction. Computes self - rhs, returning None if overflow occurred. + /// Checked duration subtraction. Computes self - rhs, returning None if + /// overflow occurred. 
pub fn checked_sub(self, rhs: Duration) -> Option { let nanos = self.nanos.checked_sub(rhs.nanos)?; check_nanos(nanos).ok()?; @@ -675,7 +703,7 @@ pub trait DurationDecoder: NumberDecoder { impl DurationDecoder for T {} -impl crate::codec::data_type::AsMySQLBool for Duration { +impl crate::codec::data_type::AsMySqlBool for Duration { #[inline] fn as_mysql_bool(&self, _context: &mut crate::expr::EvalContext) -> crate::codec::Result { Ok(!self.is_zero()) @@ -809,7 +837,8 @@ mod tests { ("2011-11-11 00:00:01", 0, Some("00:00:01")), ("20111111000001", 0, Some("00:00:01")), ("201112110102", 0, Some("11:01:02")), - ("2011-11-11", 0, None), + ("2011-11-11", 0, Some("00:20:11")), + ("2012-08-x", 0, Some("00:20:12")), ("--23", 0, None), ("232 10", 0, None), ("-232 10", 0, None), @@ -818,7 +847,24 @@ mod tests { ("00:00:00.777777", 2, Some("00:00:00.78")), ("00:00:00.777777", 6, Some("00:00:00.777777")), ("00:00:00.001", 3, Some("00:00:00.001")), + ("0x", 6, Some("00:00:00.000000")), + ("1x", 6, Some("00:00:01.000000")), + ("0000-00-00", 6, Some("00:00:00.000000")), // NOTE: The following case is easy to fail. 
+ ("0000-00-00", 0, Some("00:00:00")), + ("1234abc", 0, Some("00:12:34")), + ("1234x", 0, Some("00:12:34")), + ("1234xxxxxxx", 0, Some("00:12:34")), + ("1234xxxxxxxx", 0, None), + ("-1234xxxxxxx", 0, Some("-00:12:34")), + ("-1234xxxxxxxx", 0, None), + ("1-----", 0, Some("00:00:01")), + ("20100000-02-12", 0, None), + ("20100-02-12", 0, Some("02:01:00")), + ("99999-99-99", 0, None), + ("99990000", 0, None), + ("0000-00-00", 0, Some("00:00:00")), + ("00-00-00", 0, Some("00:00:00")), ("- 1 ", 0, Some("-00:00:01")), ("1:2:3", 0, Some("01:02:03")), ("1 1:2:3", 0, Some("25:02:03")), @@ -835,8 +881,9 @@ mod tests { (" - 1 : 2 : 3 .123 ", 3, Some("-01:02:03.123")), (" - 1 .123 ", 3, Some("-00:00:01.123")), ("-", 0, None), + ("a", 0, None), ("- .1", 0, None), - ("", 0, Some("00:00:00")), + ("", 0, None), ("", 7, None), ("1.1", 1, Some("00:00:01.1")), ("-1.1", 1, Some("-00:00:01.1")), @@ -846,13 +893,13 @@ mod tests { ("4294967295 0:59:59", 0, None), ("4294967295 232:59:59", 0, None), ("-4294967295 232:59:59", 0, None), - ("1::2:3", 0, None), - ("1.23 3", 0, None), + ("1::2:3", 0, Some("00:00:01")), + ("1.23 3", 0, Some("00:00:01")), ("1:62:3", 0, None), ("1:02:63", 0, None), ("-231342080", 0, None), + ("2010-02-12", 0, Some("00:20:10")), // test fallback to datetime - ("2010-02-12", 0, None), ("2010-02-12t12:23:34", 0, None), ("2010-02-12T12:23:34", 0, Some("12:23:34")), ("2010-02-12 12:23:34", 0, Some("12:23:34")), @@ -871,6 +918,7 @@ mod tests { let cases: Vec<(&str, i8, Option<&'static str>, bool)> = vec![ ("-790822912", 0, None, true), ("-790822912", 0, Some("-838:59:59"), false), + ("99990000", 0, Some("838:59:59"), false), ]; for (input, fsp, expect, return_null) in cases { @@ -1022,7 +1070,7 @@ mod tests { #[test] fn test_checked_add_and_sub_duration() { /// `MAX_TIME_IN_SECS` is the maximum for mysql time type. 
- const MAX_TIME_IN_SECS: i64 = MAX_HOUR_PART as i64 * SECS_PER_HOUR as i64 + const MAX_TIME_IN_SECS: i64 = MAX_HOUR_PART as i64 * SECS_PER_HOUR + MAX_MINUTE_PART as i64 * SECS_PER_MINUTE + MAX_SECOND_PART as i64; @@ -1062,7 +1110,7 @@ mod tests { // UNSPECIFIED_FSP ( 8385959, - UNSPECIFIED_FSP as i8, + UNSPECIFIED_FSP, Ok(Duration::parse(&mut EvalContext::default(), "838:59:59", 0).unwrap()), false, ), diff --git a/components/tidb_query_datatype/src/codec/mysql/enums.rs b/components/tidb_query_datatype/src/codec/mysql/enums.rs index 9a591cf750a..6c39d7f8a95 100644 --- a/components/tidb_query_datatype/src/codec/mysql/enums.rs +++ b/components/tidb_query_datatype/src/codec/mysql/enums.rs @@ -84,7 +84,7 @@ impl PartialOrd for Enum { } } -impl crate::codec::data_type::AsMySQLBool for Enum { +impl crate::codec::data_type::AsMySqlBool for Enum { #[inline] fn as_mysql_bool(&self, _context: &mut crate::expr::EvalContext) -> crate::codec::Result { Ok(self.value != 0) @@ -467,7 +467,7 @@ mod tests { 1, 0, 0, 0, 0, 0, 0, 0, 99, // 3rd ]; for data in &src { - dest.write_enum_to_chunk_by_datum_payload_compact_bytes(*data, &field_type) + dest.write_enum_to_chunk_by_datum_payload_compact_bytes(data, &field_type) .expect("write_enum_to_chunk_by_payload_compact_bytes"); } assert_eq!(&dest, res); @@ -490,7 +490,7 @@ mod tests { 1, 0, 0, 0, 0, 0, 0, 0, 99, // 3rd ]; for data in &src { - dest.write_enum_to_chunk_by_datum_payload_uint(*data, &field_type) + dest.write_enum_to_chunk_by_datum_payload_uint(data, &field_type) .expect("write_enum_to_chunk_by_payload_uint"); } assert_eq!(&dest, res); @@ -513,7 +513,7 @@ mod tests { 1, 0, 0, 0, 0, 0, 0, 0, 99, // 3rd ]; for data in &src { - dest.write_enum_to_chunk_by_datum_payload_var_uint(*data, &field_type) + dest.write_enum_to_chunk_by_datum_payload_var_uint(data, &field_type) .expect("write_enum_to_chunk_by_payload_var_uint"); } assert_eq!(&dest, res); diff --git a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs 
b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs index af66980460e..c965247b8da 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs @@ -5,9 +5,29 @@ use std::convert::TryInto; use codec::number::NumberCodec; use super::{constants::*, JsonRef, JsonType, ERR_CONVERT_FAILED}; -use crate::codec::Result; +use crate::codec::{mysql::json::path_expr::ArrayIndex, Result}; impl<'a> JsonRef<'a> { + /// Gets the index from the ArrayIndex + /// + /// If the idx is greater than the count and is from right, it will return + /// `None` + /// + /// See `jsonPathArrayIndex.getIndexFromStart()` in TiDB + /// `types/json_path_expr.go` + pub fn array_get_index(&self, idx: ArrayIndex) -> Option { + match idx { + ArrayIndex::Left(idx) => Some(idx as usize), + ArrayIndex::Right(idx) => { + if self.get_elem_count() < 1 + (idx as usize) { + None + } else { + Some(self.get_elem_count() - 1 - (idx as usize)) + } + } + } + } + /// Gets the ith element in JsonRef /// /// See `arrayGetElem()` in TiDB `json/binary.go` @@ -17,7 +37,7 @@ impl<'a> JsonRef<'a> { /// Return the `i`th key in current Object json /// - /// See `arrayGetElem()` in TiDB `json/binary.go` + /// See `objectGetKey()` in TiDB `types/json_binary.go` pub fn object_get_key(&self, i: usize) -> &'a [u8] { let key_off_start = HEADER_LEN + i * KEY_ENTRY_LEN; let key_off = NumberCodec::decode_u32_le(&self.value()[key_off_start..]) as usize; @@ -28,7 +48,7 @@ impl<'a> JsonRef<'a> { /// Returns the JsonRef of `i`th value in current Object json /// - /// See `arrayGetElem()` in TiDB `json/binary.go` + /// See `objectGetVal()` in TiDB `types/json_binary.go` pub fn object_get_val(&self, i: usize) -> Result> { let ele_count = self.get_elem_count(); let val_entry_off = HEADER_LEN + ele_count * KEY_ENTRY_LEN + i * VALUE_ENTRY_LEN; @@ -62,7 +82,7 @@ impl<'a> JsonRef<'a> { pub fn val_entry_get(&self, val_entry_off: usize) -> Result> { let 
val_type: JsonType = self.value()[val_entry_off].try_into()?; let val_offset = - NumberCodec::decode_u32_le(&self.value()[val_entry_off + TYPE_LEN as usize..]) as usize; + NumberCodec::decode_u32_le(&self.value()[val_entry_off + TYPE_LEN..]) as usize; Ok(match val_type { JsonType::Literal => { let offset = val_entry_off + TYPE_LEN; @@ -80,6 +100,21 @@ impl<'a> JsonRef<'a> { &self.value()[val_offset..val_offset + str_len as usize + len_len], ) } + JsonType::Opaque => { + let (opaque_bytes_len, len_len) = + NumberCodec::try_decode_var_u64(&self.value()[val_offset + 1..])?; + JsonRef::new( + val_type, + &self.value()[val_offset..val_offset + opaque_bytes_len as usize + len_len + 1], + ) + } + JsonType::Date | JsonType::Datetime | JsonType::Timestamp => { + JsonRef::new(val_type, &self.value()[val_offset..val_offset + TIME_LEN]) + } + JsonType::Time => JsonRef::new( + val_type, + &self.value()[val_offset..val_offset + DURATION_LEN], + ), _ => { let data_size = NumberCodec::decode_u32_le(&self.value()[val_offset + ELEMENT_COUNT_LEN..]) @@ -114,7 +149,16 @@ impl<'a> JsonRef<'a> { #[cfg(test)] mod tests { - use super::{super::Json, *}; + use std::collections::BTreeMap; + + use super::*; + use crate::{ + codec::{ + data_type::Duration, + mysql::{Json, Time, TimeType}, + }, + expr::EvalContext, + }; #[test] fn test_type() { @@ -135,4 +179,235 @@ mod tests { assert_eq!(json.as_ref().get_type(), tp, "{:?}", json_str); } } + + #[test] + fn test_array_get_elem() { + let mut ctx = EvalContext::default(); + + let time = Time::parse( + &mut ctx, + "1998-06-13 12:13:14", + TimeType::DateTime, + 0, + false, + ) + .unwrap(); + let duration = Duration::parse(&mut ctx, "12:13:14", 0).unwrap(); + let array = vec![ + Json::from_u64(1).unwrap(), + Json::from_str_val("abcdefg").unwrap(), + ]; + let object = BTreeMap::from([ + ("key1".to_string(), Json::from_u64(1).unwrap()), + ("key2".to_string(), Json::from_str_val("abcdefg").unwrap()), + ]); + + let json_array = Json::from_array(vec![ + 
Json::from_u64(1).unwrap(), + Json::from_time(time).unwrap(), + Json::from_duration(duration).unwrap(), + Json::from_array(array).unwrap(), + Json::from_str_val("abcdefg").unwrap(), + Json::from_bool(false).unwrap(), + Json::from_object(object).unwrap(), + ]) + .unwrap(); + let json_array_ref = json_array.as_ref(); + + assert_eq!(json_array_ref.array_get_elem(0).unwrap().get_u64(), 1); + assert_eq!( + json_array_ref + .array_get_elem(1) + .unwrap() + .get_time() + .unwrap(), + time + ); + assert_eq!( + json_array_ref + .array_get_elem(2) + .unwrap() + .get_duration() + .unwrap(), + duration + ); + assert_eq!( + json_array_ref + .array_get_elem(3) + .unwrap() + .array_get_elem(0) + .unwrap() + .get_u64(), + 1 + ); + assert_eq!( + json_array_ref + .array_get_elem(3) + .unwrap() + .array_get_elem(1) + .unwrap() + .get_str() + .unwrap(), + "abcdefg" + ); + assert_eq!( + json_array_ref.array_get_elem(4).unwrap().get_str().unwrap(), + "abcdefg" + ); + assert_eq!( + json_array_ref + .array_get_elem(5) + .unwrap() + .get_literal() + .unwrap(), + false + ); + assert_eq!( + json_array_ref.array_get_elem(6).unwrap().object_get_key(0), + b"key1" + ); + assert_eq!( + json_array_ref.array_get_elem(6).unwrap().object_get_key(1), + b"key2" + ); + assert_eq!( + json_array_ref + .array_get_elem(6) + .unwrap() + .object_get_val(0) + .unwrap() + .get_u64(), + 1 + ); + assert_eq!( + json_array_ref + .array_get_elem(6) + .unwrap() + .object_get_val(1) + .unwrap() + .get_str() + .unwrap(), + "abcdefg" + ); + } + + #[test] + fn test_object_get_val() { + let mut ctx = EvalContext::default(); + + let time = Time::parse( + &mut ctx, + "1998-06-13 12:13:14", + TimeType::DateTime, + 0, + false, + ) + .unwrap(); + let duration = Duration::parse(&mut ctx, "12:13:14", 0).unwrap(); + let array = vec![ + Json::from_u64(1).unwrap(), + Json::from_str_val("abcdefg").unwrap(), + ]; + let object = BTreeMap::from([ + ("key1".to_string(), Json::from_u64(1).unwrap()), + ("key2".to_string(), 
Json::from_str_val("abcdefg").unwrap()), + ]); + + let json_object = Json::from_object(BTreeMap::from([ + ("0".to_string(), Json::from_u64(1).unwrap()), + ("1".to_string(), Json::from_time(time).unwrap()), + ("2".to_string(), Json::from_duration(duration).unwrap()), + ("3".to_string(), Json::from_array(array).unwrap()), + ("4".to_string(), Json::from_str_val("abcdefg").unwrap()), + ("5".to_string(), Json::from_bool(false).unwrap()), + ("6".to_string(), Json::from_object(object).unwrap()), + ])) + .unwrap(); + let json_object_ref = json_object.as_ref(); + + assert_eq!(json_object_ref.object_get_key(0), b"0"); + assert_eq!(json_object_ref.object_get_key(1), b"1"); + assert_eq!(json_object_ref.object_get_key(2), b"2"); + assert_eq!(json_object_ref.object_get_key(3), b"3"); + + assert_eq!(json_object_ref.object_get_val(0).unwrap().get_u64(), 1); + assert_eq!( + json_object_ref + .object_get_val(1) + .unwrap() + .get_time() + .unwrap(), + time + ); + assert_eq!( + json_object_ref + .object_get_val(2) + .unwrap() + .get_duration() + .unwrap(), + duration + ); + assert_eq!( + json_object_ref + .object_get_val(3) + .unwrap() + .array_get_elem(0) + .unwrap() + .get_u64(), + 1 + ); + assert_eq!( + json_object_ref + .object_get_val(3) + .unwrap() + .array_get_elem(1) + .unwrap() + .get_str() + .unwrap(), + "abcdefg" + ); + assert_eq!( + json_object_ref + .object_get_val(4) + .unwrap() + .get_str() + .unwrap(), + "abcdefg" + ); + assert_eq!( + json_object_ref + .object_get_val(5) + .unwrap() + .get_literal() + .unwrap(), + false + ); + assert_eq!( + json_object_ref.object_get_val(6).unwrap().object_get_key(0), + b"key1" + ); + assert_eq!( + json_object_ref.object_get_val(6).unwrap().object_get_key(1), + b"key2" + ); + assert_eq!( + json_object_ref + .object_get_val(6) + .unwrap() + .object_get_val(0) + .unwrap() + .get_u64(), + 1 + ); + assert_eq!( + json_object_ref + .object_get_val(6) + .unwrap() + .object_get_val(1) + .unwrap() + .get_str() + .unwrap(), + "abcdefg" + ); + } 
} diff --git a/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs b/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs index 1cad179b475..d9104385bc6 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs @@ -37,6 +37,11 @@ impl<'a> JsonRef<'a> { .map_or(PRECEDENCE_NULL, |_| PRECEDENCE_BOOLEAN), JsonType::I64 | JsonType::U64 | JsonType::Double => PRECEDENCE_NUMBER, JsonType::String => PRECEDENCE_STRING, + JsonType::Opaque => PRECEDENCE_OPAQUE, + JsonType::Date => PRECEDENCE_DATE, + JsonType::Datetime => PRECEDENCE_DATETIME, + JsonType::Timestamp => PRECEDENCE_DATETIME, + JsonType::Time => PRECEDENCE_TIME, } } @@ -140,17 +145,35 @@ impl<'a> PartialOrd for JsonRef<'a> { } Some(left_count.cmp(&right_count)) } + JsonType::Opaque => { + if let (Ok(left), Ok(right)) = + (self.get_opaque_bytes(), right.get_opaque_bytes()) + { + left.partial_cmp(right) + } else { + return None; + } + } + JsonType::Date | JsonType::Datetime | JsonType::Timestamp => { + // The jsonTypePrecedences guarantees that the DATE is only comparable with the + // DATE, and the DATETIME and TIMESTAMP will compare with + // each other + if let (Ok(left), Ok(right)) = (self.get_time(), right.get_time()) { + left.partial_cmp(&right) + } else { + return None; + } + } + JsonType::Time => { + if let (Ok(left), Ok(right)) = (self.get_duration(), right.get_duration()) { + left.partial_cmp(&right) + } else { + return None; + } + } }; } - let left_data = self.as_f64(); - let right_data = right.as_f64(); - // tidb treats boolean as integer, but boolean is different from integer in JSON. - // so we need convert them to same type and then compare. 
- if let (Ok(left), Ok(right)) = (left_data, right_data) { - return left.partial_cmp(&right); - } - if precedence_diff > 0 { Some(Ordering::Greater) } else { @@ -181,6 +204,13 @@ impl PartialOrd for Json { #[cfg(test)] mod tests { use super::*; + use crate::{ + codec::{ + data_type::Duration, + mysql::{Time, TimeType}, + }, + expr::EvalContext, + }; #[test] fn test_cmp_json_numberic_type() { @@ -268,8 +298,8 @@ mod tests { let test_cases = vec![ ("1.5", "2"), ("1.5", "false"), - ("true", "1.5"), - ("true", "2"), + ("1.5", "true"), + ("2", "true"), ("null", r#"{"a": "b"}"#), ("2", r#""hello, world""#), (r#""hello, world""#, r#"{"a": "b"}"#), @@ -282,7 +312,121 @@ mod tests { let right: Json = right_str.parse().unwrap(); assert!(left < right); } + } + + #[test] + fn test_cmp_json_between_json_type() { + let mut ctx = EvalContext::default(); + + let cmp = [ + ( + Json::from_time( + Time::parse( + &mut ctx, + "1998-06-13 12:13:14", + TimeType::DateTime, + 0, + false, + ) + .unwrap(), + ) + .unwrap(), + Json::from_time( + Time::parse( + &mut ctx, + "1998-06-14 13:14:15", + TimeType::DateTime, + 0, + false, + ) + .unwrap(), + ) + .unwrap(), + Ordering::Less, + ), + ( + Json::from_time( + Time::parse( + &mut ctx, + "1998-06-13 12:13:14", + TimeType::DateTime, + 0, + false, + ) + .unwrap(), + ) + .unwrap(), + Json::from_time( + Time::parse( + &mut ctx, + "1998-06-12 13:14:15", + TimeType::DateTime, + 0, + false, + ) + .unwrap(), + ) + .unwrap(), + Ordering::Greater, + ), + ( + // DateTime is always greater than Date + Json::from_time( + Time::parse( + &mut ctx, + "1998-06-13 12:13:14", + TimeType::DateTime, + 0, + false, + ) + .unwrap(), + ) + .unwrap(), + Json::from_time( + Time::parse(&mut ctx, "1998-06-14", TimeType::Date, 0, false).unwrap(), + ) + .unwrap(), + Ordering::Greater, + ), + ( + Json::from_duration(Duration::parse(&mut ctx, "12:13:14", 0).unwrap()).unwrap(), + Json::from_duration(Duration::parse(&mut ctx, "12:13:16", 0).unwrap()).unwrap(), + Ordering::Less, 
+ ), + ( + Json::from_duration(Duration::parse(&mut ctx, "12:13:16", 0).unwrap()).unwrap(), + Json::from_duration(Duration::parse(&mut ctx, "12:13:14", 0).unwrap()).unwrap(), + Ordering::Greater, + ), + ( + // Time is always greater than Date + Json::from_duration(Duration::parse(&mut ctx, "12:13:16", 0).unwrap()).unwrap(), + Json::from_time( + Time::parse(&mut ctx, "1998-06-12", TimeType::Date, 0, false).unwrap(), + ) + .unwrap(), + Ordering::Greater, + ), + ( + // Time is always less than DateTime + Json::from_duration(Duration::parse(&mut ctx, "12:13:16", 0).unwrap()).unwrap(), + Json::from_time( + Time::parse( + &mut ctx, + "1998-06-12 11:11:11", + TimeType::DateTime, + 0, + false, + ) + .unwrap(), + ) + .unwrap(), + Ordering::Less, + ), + ]; - assert_eq!(Json::from_i64(2).unwrap(), Json::from_bool(false).unwrap()); + for (l, r, result) in cmp { + assert_eq!(l.cmp(&r), result) + } } } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/constants.rs b/components/tidb_query_datatype/src/codec/mysql/json/constants.rs index 57927b4b99c..7dec22a6c0b 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/constants.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/constants.rs @@ -11,6 +11,8 @@ pub const LITERAL_LEN: usize = 1; pub const U16_LEN: usize = 2; pub const U32_LEN: usize = 4; pub const NUMBER_LEN: usize = 8; +pub const TIME_LEN: usize = NUMBER_LEN; +pub const DURATION_LEN: usize = NUMBER_LEN + U32_LEN; pub const HEADER_LEN: usize = ELEMENT_COUNT_LEN + SIZE_LEN; // element size + data size pub const KEY_OFFSET_LEN: usize = U32_LEN; pub const KEY_LEN_LEN: usize = U16_LEN; diff --git a/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs b/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs index 4e4094f0ae3..867d8ec2c20 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs @@ -5,7 +5,10 @@ use 
std::{collections::BTreeMap, convert::TryInto, f64, str}; use codec::{number::NumberCodec, prelude::*}; use super::{constants::*, Json, JsonRef, JsonType}; -use crate::codec::{Error, Result}; +use crate::{ + codec::{Error, Result}, + FieldTypeTp, +}; impl<'a> JsonRef<'a> { fn encoded_len(&self) -> usize { @@ -211,6 +214,14 @@ pub trait JsonEncoder: NumberEncoder { self.write_bytes(bytes)?; Ok(()) } + + fn write_json_opaque(&mut self, typ: FieldTypeTp, bytes: &[u8]) -> Result<()> { + self.write_u8(typ.to_u8().unwrap())?; + let bytes_len = bytes.len() as u64; + self.write_var_u64(bytes_len)?; + self.write_bytes(bytes)?; + Ok(()) + } } pub trait JsonDatumPayloadChunkEncoder: BufferWriter { @@ -243,6 +254,16 @@ pub trait JsonDecoder: NumberDecoder { } JsonType::I64 | JsonType::U64 | JsonType::Double => self.read_bytes(NUMBER_LEN)?, JsonType::Literal => self.read_bytes(LITERAL_LEN)?, + JsonType::Opaque => { + let value = self.bytes(); + // the first byte of opaque stores the MySQL type code + let (opaque_bytes_len, len_len) = NumberCodec::try_decode_var_u64(&value[1..])?; + self.read_bytes(opaque_bytes_len as usize + len_len + 1)? + } + JsonType::Date | JsonType::Datetime | JsonType::Timestamp => { + self.read_bytes(TIME_LEN)? + } + JsonType::Time => self.read_bytes(DURATION_LEN)?, }; Ok(Json::new(tp, Vec::from(value))) } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_contains.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_contains.rs new file mode 100644 index 00000000000..46de1af9e0b --- /dev/null +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_contains.rs @@ -0,0 +1,106 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::cmp::Ordering; + +use super::{super::Result, JsonRef, JsonType}; + +impl<'a> JsonRef<'a> { + /// `json_contains` is the implementation for JSON_CONTAINS in mysql + /// + /// See `ContainsBinaryJSON()` in TiDB `types/json_binary_functions.go` + pub fn json_contains(&self, target: JsonRef<'_>) -> Result { + match self.type_code { + JsonType::Object => { + if target.type_code == JsonType::Object { + let elem_count = target.get_elem_count(); + for i in 0..elem_count { + let key = target.object_get_key(i); + let val = target.object_get_val(i)?; + let idx = self.object_search_key(key); + match idx { + None => { + return Ok(false); + } + Some(idx) => { + let exp = self.object_get_val(idx)?; + if !(exp.json_contains(val)?) { + return Ok(false); + } + } + } + } + return Ok(true); + } + } + JsonType::Array => { + if target.type_code == JsonType::Array { + let elem_count = target.get_elem_count(); + for i in 0..elem_count { + if !(self.json_contains(target.array_get_elem(i)?)?) { + return Ok(false); + } + } + return Ok(true); + } + let elem_count = self.get_elem_count(); + for i in 0..elem_count { + if self.array_get_elem(i)?.json_contains(target)? 
{ + return Ok(true); + } + } + } + _ => { + return match self.partial_cmp(&target).unwrap() { + Ordering::Equal => Ok(true), + _ => Ok(false), + }; + } + }; + Ok(false) + } +} + +#[cfg(test)] +mod tests { + use super::super::Json; + #[test] + fn test_json_contains() { + let mut test_cases = vec![ + (r#"{"a":{"a":1},"b":2}"#, r#"{"b":2}"#, true), + (r#"{}"#, r#"{}"#, true), + (r#"{"a":1}"#, r#"{}"#, true), + (r#"{"a":1}"#, r#"1"#, false), + (r#"{"a":[1]}"#, r#"[1]"#, false), + (r#"{"b":2, "c":3}"#, r#"{"c":3}"#, true), + (r#"1"#, r#"1"#, true), + (r#"[1]"#, r#"1"#, true), + (r#"[1,2]"#, r#"[1]"#, true), + (r#"[1,2]"#, r#"[1,3]"#, false), + (r#"[1,2]"#, r#"["1"]"#, false), + (r#"[1,2,[1,3]]"#, r#"[1,3]"#, true), + (r#"[1,2,[1,[5,[3]]]]"#, r#"[1,3]"#, true), + (r#"[1,2,[1,[5,{"a":[2,3]}]]]"#, r#"[1,{"a":[3]}]"#, true), + (r#"[{"a":1}]"#, r#"{"a":1}"#, true), + (r#"[{"a":1,"b":2}]"#, r#"{"a":1}"#, true), + (r#"[{"a":{"a":1},"b":2}]"#, r#"{"a":1}"#, false), + (r#"{"a":{"a":1},"b":2}"#, r#"{"b":3}"#, false), + (r#"[1,2,[1,[5,{"a":[2,3]}]]]"#, r#"[1,{"a":[3]}]"#, true), + (r#"[1,2,[1,[5,{"a":[2,3]}]]]"#, r#"[10,{"a":[3]}]"#, false), + ]; + for (i, (js, value, expected)) in test_cases.drain(..).enumerate() { + let j = js.parse(); + assert!(j.is_ok(), "#{} expect parse ok but got {:?}", i, j); + let j: Json = j.unwrap(); + let value = value.parse(); + assert!(value.is_ok(), "#{} expect parse ok but got {:?}", i, j); + let value: Json = value.unwrap(); + + let got = j.as_ref().json_contains(value.as_ref()).unwrap(); + assert_eq!( + got, expected, + "#{} expect {:?}, but got {:?}", + i, expected, got + ); + } + } +} diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs index bc867904fd6..7e619e74c32 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs @@ -1,33 +1,78 @@ // 
Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. +use collections::HashSet; + use super::{ super::Result, - path_expr::{PathExpression, PathLeg, PATH_EXPR_ARRAY_INDEX_ASTERISK, PATH_EXPR_ASTERISK}, + path_expr::{PathExpression, PathLeg}, Json, JsonRef, JsonType, }; +use crate::codec::mysql::json::path_expr::{ArrayIndex, ArraySelection, KeySelection}; impl<'a> JsonRef<'a> { - /// `extract` receives several path expressions as arguments, matches them in j, and returns - /// the target JSON matched any path expressions, which may be autowrapped as an array. - /// If there is no any expression matched, it returns None. + /// `extract` receives several path expressions as arguments, matches them + /// in j, and returns the target JSON matched any path expressions, which + /// may be autowrapped as an array. If there is no any expression matched, + /// it returns None. /// /// See `Extract()` in TiDB `json.binary_function.go` pub fn extract(&self, path_expr_list: &[PathExpression]) -> Result> { + let mut could_return_multiple_matches = path_expr_list.len() > 1; + let mut elem_list = Vec::with_capacity(path_expr_list.len()); for path_expr in path_expr_list { + could_return_multiple_matches |= path_expr.contains_any_asterisk(); + could_return_multiple_matches |= path_expr.contains_any_range(); + elem_list.append(&mut extract_json(*self, &path_expr.legs)?) } + if elem_list.is_empty() { - return Ok(None); + Ok(None) + } else if could_return_multiple_matches { + Ok(Some(Json::from_array( + elem_list.drain(..).map(|j| j.to_owned()).collect(), + )?)) + } else { + Ok(Some(elem_list.remove(0).to_owned())) } - if path_expr_list.len() == 1 && elem_list.len() == 1 { - // If path_expr contains asterisks, elem_list.len() won't be 1 - // even if path_expr_list.len() equals to 1. 
- return Ok(Some(elem_list.remove(0).to_owned())); + } +} + +#[derive(Eq)] +struct RefEqualJsonWrapper<'a>(JsonRef<'a>); + +impl<'a> PartialEq for RefEqualJsonWrapper<'a> { + fn eq(&self, other: &Self) -> bool { + self.0.ref_eq(&other.0) + } +} + +impl<'a> std::hash::Hash for RefEqualJsonWrapper<'a> { + fn hash(&self, state: &mut H) { + self.0.value.as_ptr().hash(state) + } +} + +// append the elem_list vector, if the referenced json object doesn't exist +// unlike the append in std, this function **doesn't** set the `other` length to +// 0 +// +// To use this function, you have to ensure both `elem_list` and `other` are +// unique. +fn append_if_ref_unique<'a>(elem_list: &mut Vec>, other: &Vec>) { + elem_list.reserve(other.len()); + + let mut unique_verifier = HashSet::>::with_hasher(Default::default()); + for elem in elem_list.iter() { + unique_verifier.insert(RefEqualJsonWrapper(*elem)); + } + + for elem in other { + let elem = RefEqualJsonWrapper(*elem); + if !unique_verifier.contains(&elem) { + elem_list.push(elem.0); } - Ok(Some(Json::from_array( - elem_list.drain(..).map(|j| j.to_owned()).collect(), - )?)) } } @@ -38,53 +83,108 @@ pub fn extract_json<'a>(j: JsonRef<'a>, path_legs: &[PathLeg]) -> Result match j.get_type() { + match current_leg { + PathLeg::ArraySelection(selection) => match j.get_type() { JsonType::Array => { let elem_count = j.get_elem_count(); - if i == PATH_EXPR_ARRAY_INDEX_ASTERISK { - for k in 0..elem_count { - ret.append(&mut extract_json(j.array_get_elem(k)?, sub_path_legs)?) 
+ match selection { + ArraySelection::Asterisk => { + for k in 0..elem_count { + append_if_ref_unique( + &mut ret, + &extract_json(j.array_get_elem(k)?, sub_path_legs)?, + ) + } + } + ArraySelection::Index(index) => { + if let Some(index) = j.array_get_index(*index) { + if index < elem_count { + append_if_ref_unique( + &mut ret, + &extract_json(j.array_get_elem(index)?, sub_path_legs)?, + ) + } + } + } + ArraySelection::Range(start, end) => { + if let (Some(start), Some(mut end)) = + (j.array_get_index(*start), j.array_get_index(*end)) + { + if end >= elem_count { + end = elem_count - 1 + } + if start <= end { + for i in start..=end { + append_if_ref_unique( + &mut ret, + &extract_json(j.array_get_elem(i)?, sub_path_legs)?, + ) + } + } + } } - } else if (i as usize) < elem_count { - ret.append(&mut extract_json( - j.array_get_elem(i as usize)?, - sub_path_legs, - )?) } } _ => { - if i as usize == 0 { - ret.append(&mut extract_json(j, sub_path_legs)?) + // If the current object is not an array, still append them if the selection + // includes 0. But for asterisk, it still returns NULL. + // + // as the element is not array, don't use `array_get_index` + match selection { + ArraySelection::Index(ArrayIndex::Left(0)) => { + append_if_ref_unique(&mut ret, &extract_json(j, sub_path_legs)?) + } + ArraySelection::Range( + ArrayIndex::Left(0), + ArrayIndex::Right(0) | ArrayIndex::Left(_), + ) => { + // for [0 to Non-negative Number] and [0 to last], it extracts itself + append_if_ref_unique(&mut ret, &extract_json(j, sub_path_legs)?) + } + _ => {} } } }, - PathLeg::Key(ref key) => { + PathLeg::Key(key) => { if j.get_type() == JsonType::Object { - if key == PATH_EXPR_ASTERISK { - let elem_count = j.get_elem_count(); - for i in 0..elem_count { - ret.append(&mut extract_json(j.object_get_val(i)?, sub_path_legs)?) 
+ match key { + KeySelection::Asterisk => { + let elem_count = j.get_elem_count(); + for i in 0..elem_count { + append_if_ref_unique( + &mut ret, + &extract_json(j.object_get_val(i)?, sub_path_legs)?, + ) + } + } + KeySelection::Key(key) => { + if let Some(idx) = j.object_search_key(key.as_bytes()) { + let val = j.object_get_val(idx)?; + append_if_ref_unique(&mut ret, &extract_json(val, sub_path_legs)?) + } } - } else if let Some(idx) = j.object_search_key(key.as_bytes()) { - let val = j.object_get_val(idx)?; - ret.append(&mut extract_json(val, sub_path_legs)?) } } } PathLeg::DoubleAsterisk => { - ret.append(&mut extract_json(j, sub_path_legs)?); + append_if_ref_unique(&mut ret, &extract_json(j, sub_path_legs)?); match j.get_type() { JsonType::Array => { let elem_count = j.get_elem_count(); for k in 0..elem_count { - ret.append(&mut extract_json(j.array_get_elem(k)?, sub_path_legs)?) + append_if_ref_unique( + &mut ret, + &extract_json(j.array_get_elem(k)?, path_legs)?, + ) } } JsonType::Object => { let elem_count = j.get_elem_count(); for i in 0..elem_count { - ret.append(&mut extract_json(j.object_get_val(i)?, sub_path_legs)?) 
+ append_if_ref_unique( + &mut ret, + &extract_json(j.object_get_val(i)?, path_legs)?, + ) } } _ => {} @@ -101,10 +201,15 @@ mod tests { use super::{ super::path_expr::{ PathExpressionFlag, PATH_EXPRESSION_CONTAINS_ASTERISK, - PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, PATH_EXPR_ARRAY_INDEX_ASTERISK, + PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }, *, }; + use crate::codec::mysql::json::path_expr::{ArrayIndex, PATH_EXPRESSION_CONTAINS_RANGE}; + + fn select_from_left(index: usize) -> PathLeg { + PathLeg::ArraySelection(ArraySelection::Index(ArrayIndex::Left(index as u32))) + } #[test] fn test_json_extract() { @@ -115,7 +220,7 @@ mod tests { ( "[true, 2017]", vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some("true"), @@ -123,7 +228,7 @@ mod tests { ( "[true, 2017]", vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: PATH_EXPRESSION_CONTAINS_ASTERISK, }], Some("[true, 2017]"), @@ -131,7 +236,7 @@ mod tests { ( "[true, 2107]", vec![PathExpression { - legs: vec![PathLeg::Index(2)], + legs: vec![select_from_left(2)], flags: PathExpressionFlag::default(), }], None, @@ -139,7 +244,7 @@ mod tests { ( "6.18", vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some("6.18"), @@ -147,7 +252,7 @@ mod tests { ( "6.18", vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: PathExpressionFlag::default(), }], None, @@ -155,7 +260,7 @@ mod tests { ( "true", vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some("true"), @@ -163,7 +268,7 @@ mod tests { ( "true", vec![PathExpression { - legs: 
vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: PathExpressionFlag::default(), }], None, @@ -171,7 +276,7 @@ mod tests { ( "6", vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some("6"), @@ -179,7 +284,7 @@ mod tests { ( "6", vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: PathExpressionFlag::default(), }], None, @@ -187,7 +292,7 @@ mod tests { ( "-6", vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some("-6"), @@ -195,7 +300,7 @@ mod tests { ( "-6", vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: PathExpressionFlag::default(), }], None, @@ -203,7 +308,7 @@ mod tests { ( r#"{"a": [1, 2, {"aa": "xx"}]}"#, vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: PathExpressionFlag::default(), }], None, @@ -211,7 +316,7 @@ mod tests { ( r#"{"a": [1, 2, {"aa": "xx"}]}"#, vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some(r#"{"a": [1, 2, {"aa": "xx"}]}"#), @@ -220,7 +325,7 @@ mod tests { ( r#"{"a": "a1", "b": 20.08, "c": false}"#, vec![PathExpression { - legs: vec![PathLeg::Key(String::from("c"))], + legs: vec![PathLeg::Key(KeySelection::Key(String::from("c")))], flags: PathExpressionFlag::default(), }], Some("false"), @@ -228,7 +333,7 @@ mod tests { ( r#"{"a": "a1", "b": 20.08, "c": false}"#, vec![PathExpression { - legs: vec![PathLeg::Key(String::from(PATH_EXPR_ASTERISK))], + legs: vec![PathLeg::Key(KeySelection::Asterisk)], flags: 
PATH_EXPRESSION_CONTAINS_ASTERISK, }], Some(r#"["a1", 20.08, false]"#), @@ -236,7 +341,7 @@ mod tests { ( r#"{"a": "a1", "b": 20.08, "c": false}"#, vec![PathExpression { - legs: vec![PathLeg::Key(String::from("d"))], + legs: vec![PathLeg::Key(KeySelection::Key(String::from("d")))], flags: PathExpressionFlag::default(), }], None, @@ -245,7 +350,10 @@ mod tests { ( "21", vec![PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("c"))], + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(KeySelection::Key(String::from("c"))), + ], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], None, @@ -253,18 +361,252 @@ mod tests { ( r#"{"g": {"a": "a1", "b": 20.08, "c": false}}"#, vec![PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("c"))], + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(KeySelection::Key(String::from("c"))), + ], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], - Some("false"), + Some("[false]"), ), ( r#"[{"a": "a1", "b": 20.08, "c": false}, true]"#, vec![PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("c"))], + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(KeySelection::Key(String::from("c"))), + ], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], - Some("false"), + Some("[false]"), + ), + ( + r#"[[0, 1], [2, 3], [4, [5, 6]]]"#, + vec![PathExpression { + legs: vec![PathLeg::DoubleAsterisk, select_from_left(0)], + flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + }], + Some("[[0, 1], 0, 1, 2, 3, 4, 5, 6]"), + ), + ( + r#"[[0, 1], [2, 3], [4, [5, 6]]]"#, + vec![ + PathExpression { + legs: vec![PathLeg::DoubleAsterisk, select_from_left(0)], + flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + }, + PathExpression { + legs: vec![PathLeg::DoubleAsterisk, select_from_left(0)], + flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + }, + ], + Some("[[0, 1], 0, 1, 2, 3, 4, 5, 6, [0, 1], 0, 1, 2, 3, 4, 5, 6]"), + ), + ( + "[1]", + 
vec![PathExpression { + legs: vec![PathLeg::DoubleAsterisk, select_from_left(0)], + flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + }], + Some("[1]"), + ), + ( + r#"{"a": 1}"#, + vec![PathExpression { + legs: vec![ + PathLeg::Key(KeySelection::Key(String::from("a"))), + select_from_left(0), + ], + flags: PathExpressionFlag::default(), + }], + Some("1"), + ), + ( + r#"{"a": 1}"#, + vec![PathExpression { + legs: vec![PathLeg::DoubleAsterisk, select_from_left(0)], + flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + }], + Some(r#"[{"a": 1}, 1]"#), + ), + ( + r#"{"a": 1}"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + select_from_left(0), + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + ], + flags: PathExpressionFlag::default(), + }], + Some(r#"1"#), + ), + ( + r#"[1, [[{"x": [{"a":{"b":{"c":42}}}]}]]]"#, + vec![PathExpression { + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::Key(KeySelection::Asterisk), + ], + flags: PATH_EXPRESSION_CONTAINS_ASTERISK + | PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + }], + Some(r#"[{"c": 42}]"#), + ), + ( + r#"[{"a": [3,4]}, {"b": 2 }]"#, + vec![ + PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + ], + flags: PathExpressionFlag::default(), + }, + PathExpression { + legs: vec![ + select_from_left(1), + PathLeg::Key(KeySelection::Key(String::from("a"))), + ], + flags: PathExpressionFlag::default(), + }, + ], + Some("[[3, 4]]"), + ), + ( + r#"[{"a": [1,1,1,1]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + ], + flags: PathExpressionFlag::default(), + }], + Some("[1, 1, 1, 1]"), + ), + ( + r#"[1,2,3,4]"#, + vec![PathExpression { + legs: vec![PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Left(2), + ))], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + Some("[2,3]"), 
+ ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Index(ArrayIndex::Right(0))), + ], + flags: PathExpressionFlag::default(), + }], + Some("4"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Index(ArrayIndex::Right(1))), + ], + flags: PathExpressionFlag::default(), + }], + Some("3"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Index(ArrayIndex::Right(100))), + ], + flags: PathExpressionFlag::default(), + }], + None, + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Right(0), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + Some("[2,3,4]"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Right(100), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + None, + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Left(100), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + Some("[2,3,4]"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + 
PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(0), + ArrayIndex::Right(0), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + Some("[1,2,3,4]"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(0), + ArrayIndex::Left(2), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + Some("[1,2,3]"), ), ]; for (i, (js, exprs, expected)) in test_cases.drain(..).enumerate() { @@ -275,11 +617,15 @@ mod tests { Some(es) => { let e = Json::from_str(es); assert!(e.is_ok(), "#{} expect parse json ok but got {:?}", i, e); - Some(e.unwrap()) + Some(e.unwrap().to_string()) } None => None, }; - let got = j.as_ref().extract(&exprs[..]).unwrap(); + let got = j + .as_ref() + .extract(&exprs[..]) + .unwrap() + .map(|got| got.to_string()); assert_eq!( got, expected, "#{} expect {:?}, but got {:?}", diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_keys.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_keys.rs index 96bc9aaf56e..68c361321ad 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_keys.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_keys.rs @@ -5,7 +5,8 @@ use std::str; use super::{super::Result, path_expr::PathExpression, Json, JsonRef, JsonType}; impl<'a> JsonRef<'a> { - /// Evaluates a (possibly empty) list of values and returns a JSON array containing those values specified by `path_expr_list` + /// Evaluates a (possibly empty) list of values and returns a JSON array + /// containing those values specified by `path_expr_list` pub fn keys(&self, path_expr_list: &[PathExpression]) -> Result> { if !path_expr_list.is_empty() { if path_expr_list.len() > 1 { diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_merge.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_merge.rs index 
3bccdce7017..627daf77722 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_merge.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_merge.rs @@ -13,7 +13,8 @@ impl Json { /// 1. adjacent arrays are merged to a single array; /// 2. adjacent object are merged to a single object; /// 3. a scalar value is autowrapped as an array before merge; - /// 4. an adjacent array and object are merged by autowrapping the object as an array. + /// 4. an adjacent array and object are merged by autowrapping the object as + /// an array. /// /// See `MergeBinary()` in TiDB `json/binary_function.go` #[allow(clippy::comparison_chain)] diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs index e8c709e9571..b359158d06b 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs @@ -33,7 +33,7 @@ impl<'a> JsonRef<'a> { )); } for expr in path_expr_list { - if expr.contains_any_asterisk() { + if expr.contains_any_asterisk() || expr.contains_any_range() { return Err(box_err!( "Invalid path expression: expected no asterisk, found {:?}", expr diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_remove.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_remove.rs index a350df91b06..bcb6fd01716 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_remove.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_remove.rs @@ -7,10 +7,9 @@ impl<'a> JsonRef<'a> { /// All path expressions cannot contain * or ** wildcard. /// If any error occurs, the input won't be changed. 
pub fn remove(&self, path_expr_list: &[PathExpression]) -> Result { - if path_expr_list - .iter() - .any(|expr| expr.legs.is_empty() || expr.contains_any_asterisk()) - { + if path_expr_list.iter().any(|expr| { + expr.legs.is_empty() || expr.contains_any_asterisk() || expr.contains_any_range() + }) { return Err(box_err!("Invalid path expression")); } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs index c6fd25ec688..70321080ef7 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs @@ -1,6 +1,7 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. use super::{JsonRef, JsonType}; +use crate::FieldTypeTp; const JSON_TYPE_BOOLEAN: &[u8] = b"BOOLEAN"; const JSON_TYPE_NONE: &[u8] = b"NULL"; @@ -10,6 +11,12 @@ const JSON_TYPE_DOUBLE: &[u8] = b"DOUBLE"; const JSON_TYPE_STRING: &[u8] = b"STRING"; const JSON_TYPE_OBJECT: &[u8] = b"OBJECT"; const JSON_TYPE_ARRAY: &[u8] = b"ARRAY"; +const JSON_TYPE_BIT: &[u8] = b"BIT"; +const JSON_TYPE_BLOB: &[u8] = b"BLOB"; +const JSON_TYPE_OPAQUE: &[u8] = b"OPAQUE"; +const JSON_TYPE_DATE: &[u8] = b"DATE"; +const JSON_TYPE_DATETIME: &[u8] = b"DATETIME"; +const JSON_TYPE_TIME: &[u8] = b"TIME"; impl<'a> JsonRef<'a> { /// `json_type` is the implementation for @@ -26,6 +33,23 @@ impl<'a> JsonRef<'a> { Some(_) => JSON_TYPE_BOOLEAN, None => JSON_TYPE_NONE, }, + JsonType::Opaque => match self.get_opaque_type() { + Ok( + FieldTypeTp::TinyBlob + | FieldTypeTp::MediumBlob + | FieldTypeTp::LongBlob + | FieldTypeTp::Blob + | FieldTypeTp::String + | FieldTypeTp::VarString + | FieldTypeTp::VarChar, + ) => JSON_TYPE_BLOB, + Ok(FieldTypeTp::Bit) => JSON_TYPE_BIT, + _ => JSON_TYPE_OPAQUE, + }, + JsonType::Date => JSON_TYPE_DATE, + JsonType::Datetime => JSON_TYPE_DATETIME, + JsonType::Timestamp => JSON_TYPE_DATETIME, + JsonType::Time => JSON_TYPE_TIME, } } 
} diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_unquote.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_unquote.rs index 5cfc8bc908d..f95c08cf958 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_unquote.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_unquote.rs @@ -24,6 +24,16 @@ impl<'a> JsonRef<'a> { let s = self.get_str()?; unquote_string(s) } + JsonType::Date + | JsonType::Datetime + | JsonType::Timestamp + | JsonType::Time + | JsonType::Opaque => { + let s = self.to_string(); + // Remove the quotes of output + assert!(s.len() > 2); + Ok(s[1..s.len() - 1].to_string()) + } _ => Ok(self.to_string()), } } @@ -83,6 +93,13 @@ mod tests { use std::collections::BTreeMap; use super::{super::Json, *}; + use crate::{ + codec::{ + data_type::Duration, + mysql::{Time, TimeType}, + }, + expr::EvalContext, + }; #[test] fn test_decode_escaped_unicode() { @@ -161,4 +178,29 @@ mod tests { ); } } + + #[test] + fn test_json_unquote_time_duration() { + let mut ctx = EvalContext::default(); + + let time = Json::from_time( + Time::parse( + &mut ctx, + "1998-06-13 12:13:14", + TimeType::DateTime, + 0, + false, + ) + .unwrap(), + ) + .unwrap(); + assert_eq!( + time.as_ref().unquote().unwrap(), + "1998-06-13 12:13:14.000000" + ); + + let duration = + Json::from_duration(Duration::parse(&mut ctx, "12:13:14", 0).unwrap()).unwrap(); + assert_eq!(duration.as_ref().unquote().unwrap(), "12:13:14.000000"); + } } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/mod.rs b/components/tidb_query_datatype/src/codec/mysql/json/mod.rs index 2b36a4b89d0..0cd382f6d65 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/mod.rs @@ -54,7 +54,6 @@ //! // lengths up to 127, 2 bytes to represent //! // lengths up to 16383, and so on... //! ``` -//! 
mod binary; mod comparison; @@ -66,6 +65,7 @@ mod modifier; mod path_expr; mod serde; // json functions +mod json_contains; mod json_depth; mod json_extract; mod json_keys; @@ -76,7 +76,11 @@ mod json_remove; mod json_type; pub mod json_unquote; -use std::{collections::BTreeMap, convert::TryFrom, str}; +use std::{ + collections::BTreeMap, + convert::{TryFrom, TryInto}, + str, +}; use codec::number::{NumberCodec, F64_SIZE, I64_SIZE}; use constants::{JSON_LITERAL_FALSE, JSON_LITERAL_NIL, JSON_LITERAL_TRUE}; @@ -91,17 +95,17 @@ use super::super::{datum::Datum, Error, Result}; use crate::{ codec::{ convert::ConvertTo, - data_type::{Decimal, Real}, - mysql, + data_type::{BytesRef, Decimal, Real}, mysql::{Duration, Time, TimeType}, }, expr::EvalContext, + FieldTypeTp, }; const ERR_CONVERT_FAILED: &str = "Can not covert from "; /// The types of `Json` which follows -#[derive(Eq, PartialEq, FromPrimitive, Clone, Debug, Copy)] +#[derive(PartialEq, FromPrimitive, Clone, Debug, Copy)] pub enum JsonType { Object = 0x01, Array = 0x03, @@ -110,6 +114,14 @@ pub enum JsonType { U64 = 0x0a, Double = 0x0b, String = 0x0c, + + // It's a special value for the compatibility with MySQL. + // It will store the raw buffer containing unexpected type (e.g. Binary). + Opaque = 0x0d, + Date = 0x0e, + Datetime = 0x0f, + Timestamp = 0x10, + Time = 0x11, } impl TryFrom for JsonType { @@ -207,18 +219,70 @@ impl<'a> JsonRef<'a> { Ok(str::from_utf8(self.get_str_bytes()?)?) 
} + // Returns the opaque value in bytes + pub(crate) fn get_opaque_bytes(&self) -> Result<&'a [u8]> { + assert_eq!(self.type_code, JsonType::Opaque); + let val = self.value(); + let (str_len, len_len) = NumberCodec::try_decode_var_u64(&val[1..])?; + Ok(&val[(len_len + 1)..len_len + 1 + str_len as usize]) + } + + pub(crate) fn get_opaque_type(&self) -> Result { + assert_eq!(self.type_code, JsonType::Opaque); + let val = self.value(); + FieldTypeTp::from_u8(val[0]).ok_or(box_err!("invalid opaque type code")) + } + + pub fn get_time(&self) -> Result